# Creating a dataset to train the model to recognize the generated voices

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the two tab-separated metadata tables used by the rest of the notebook.
# NOTE(review): presumably `validated` lists the real recordings and `generated`
# their synthesized counterparts — confirm against the data files.
validated = pd.read_csv('../data/validated.tsv',  sep='\t')
generated = pd.read_csv('../data/generated.tsv',  sep='\t')

In [3]:
# Count how many rows of `validated` belong to each client_id
# (a Series indexed by client_id).
sample_count = validated['client_id'].value_counts()

In [4]:
# Clients used for training: at least 5 and fewer than 500 validated samples.
# Both bounds are applied as one combined mask over the full Series.
valid_client_ids = list(sample_count[(sample_count >= 5) & (sample_count < 500)].index)

In [49]:
# Keep only the rows of the selected training clients, and only the id/path columns.
clean_validated_data = validated.loc[
    validated['client_id'].isin(valid_client_ids), ['client_id', 'path']
]
clean_generated_data = generated.loc[
    generated['client_id'].isin(valid_client_ids), ['client_id', 'generated_path']
]

In [50]:
# Give the generated table the same column names as the validated one
# ('client_id', 'path') so both can be handled uniformly.
clean_generated_data = clean_generated_data.rename(columns={"generated_path": "path"})

In [11]:
def concatenation(res_data: pd.DataFrame, anchor: pd.DataFrame, posneg: pd.DataFrame) -> pd.DataFrame:
    """Append anchor/posneg sample pairs to ``res_data`` and return the result.

    ``anchor`` and ``posneg`` must each have exactly three columns, which are
    taken positionally as (client id, path, source). Rows are paired by index
    label and only labels present in *both* frames are kept, so a size
    mismatch (or an empty ``posneg``) never produces half-empty pairs whose
    ``posneg_*`` values are NaN.

    Args:
        res_data: Pairs accumulated so far.
        anchor: Anchor samples.
        posneg: Samples to pair with the anchors.

    Returns:
        A new DataFrame holding the rows of ``res_data`` followed by the new
        pairs, re-indexed from 0. The inputs are not modified.

    Raises:
        ValueError: If ``anchor`` or ``posneg`` does not have exactly three
            columns.
    """
    anchor_columns = ['anchor_client_id', 'anchor_path', 'anchor_source']
    posneg_columns = ['posneg_client_id', 'posneg_path', 'posneg_source']
    pair_columns = ['anchor_client_id', 'anchor_path', 'posneg_client_id', 'posneg_path', 'anchor_source', 'posneg_source']

    # Relabel copies so the caller's frames keep their own column names.
    anchor_part = anchor.copy()
    anchor_part.columns = anchor_columns
    posneg_part = posneg.copy()
    posneg_part.columns = posneg_columns

    # An inner join on the index drops anchors without a partner (and vice
    # versa) instead of padding the missing side with NaN.
    pairs = pd.concat([anchor_part, posneg_part], axis=1, join='inner')[pair_columns]
    return pd.concat([res_data, pairs], ignore_index=True)

def data_generation(original_data, generated_data, data_frac=1):
    """Build a labelled dataset of (anchor, posneg) sample pairs per client.

    For every client id found in ``original_data`` two groups of pairs are
    appended via ``concatenation``:

    * original anchor vs. original sample of the same client;
    * original anchor vs. generated sample of the same client.

    Each side of a pair group is an independent random sample (without a fixed
    seed) of the client's rows, so the output differs between runs.

    Args:
        original_data: Frame with a ``client_id`` column describing original
            samples. ``concatenation`` expects three columns once ``source``
            is added, so the frame is expected to carry ``client_id`` and
            ``path`` (in that order).
        generated_data: Frame of the same layout describing generated samples.
        data_frac: Fraction of each client's rows drawn for every side of a
            pair group (passed to ``DataFrame.sample(frac=...)``).

    Returns:
        DataFrame with the anchor/posneg id, path and source columns plus a
        boolean ``label`` column that is True when both sides of the pair come
        from the same source.

    Note:
        The input frames are left untouched; the ``source`` tag is added to
        internal copies only.
    """
    # Tag copies rather than writing a new column into the caller's frames
    # (which are often slices of a larger table).
    original = original_data.assign(source='original')
    generated = generated_data.assign(source='generated')

    res_data = pd.DataFrame(columns=['anchor_client_id', 'anchor_path', 'posneg_client_id', 'posneg_path', 'anchor_source', 'posneg_source'])
    for client_id in original['client_id'].unique():
        original_id_sample = original[original['client_id'] == client_id]
        generated_id_sample = generated[generated['client_id'] == client_id]

        # Original vs. original pairs: two independent shuffles of the same rows.
        anchor_original = original_id_sample.sample(frac=data_frac).reset_index(drop=True)
        positive_original = original_id_sample.sample(frac=data_frac).reset_index(drop=True)
        res_data = concatenation(res_data, anchor_original, positive_original)

        # Original vs. generated pairs for the same client.
        anchor_for_generated = original_id_sample.sample(frac=data_frac).reset_index(drop=True)
        posneg_generated = generated_id_sample.sample(frac=data_frac).reset_index(drop=True)
        res_data = concatenation(res_data, anchor_for_generated, posneg_generated)

    # True for original/original pairs, False for original/generated pairs.
    res_data['label'] = res_data['anchor_source'] == res_data['posneg_source']
    return res_data

In [52]:
# Build the training pairs from the filtered validated/generated tables,
# using the default data_frac of 1.
clean_train_data = data_generation(clean_validated_data, clean_generated_data)

In [5]:
# Output location of the training split: folder, file name and the combined path.
save_folder = '../dataset/'
dataset_name = 'train_generated_dataset.csv'
save_path = f'{save_folder}{dataset_name}'

In [54]:
# Write the training pairs to `save_path` as CSV, without the index column.
clean_train_data.to_csv(save_path, index=False)

In [8]:
# Clients used for testing: at least 2 and fewer than 4 validated samples.
# Both bounds are applied as one combined mask over the full Series.
test_client_ids = list(sample_count[(sample_count >= 2) & (sample_count < 4)].index)

In [9]:
# Rows of the selected test clients, reduced to the id/path columns and
# re-indexed from 0.
test_clean_validated_data = (
    validated.loc[validated['client_id'].isin(test_client_ids), ['client_id', 'path']]
    .reset_index(drop=True)
)
# Same selection for the generated table, with its path column renamed to match.
test_clean_generated_data = (
    generated.loc[generated['client_id'].isin(test_client_ids), ['client_id', 'generated_path']]
    .rename(columns={"generated_path": "path"})
    .reset_index(drop=True)
)

In [12]:
# Build the test pairs from the filtered test tables, using every row of each
# client (data_frac=1).
clean_test_data = data_generation(test_clean_validated_data, test_clean_generated_data, data_frac=1)

In [13]:
# File name and full path of the test split; the folder comes from `save_folder`.
dataset_name = 'test_generated_dataset_.csv'
save_path = f'{save_folder}{dataset_name}'

In [14]:
# Write the test pairs to `save_path` as CSV, without the index column.
clean_test_data.to_csv(save_path, index=False)