# Creating a dataset to train a model that jointly recognizes whether a voice belongs to a given person and whether the voice is generated

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from typing import List

In [3]:
# Load the two tab-separated tables: `original` from validated.tsv and
# `generated` from generated.tsv.
original = pd.read_csv(
    '../data/validated.tsv',
    sep='\t',
)
generated = pd.read_csv(
    '../data/generated.tsv',
    sep='\t',
)

In [4]:
# Number of rows in `original` per client_id.
sample_count = original['client_id'].value_counts()
# Training clients: strictly more than 5 and strictly fewer than 500 rows.
# One combined mask is used so the Series is never indexed with a mask that
# was built on a differently-sized Series.
# NOTE(review): the test split further down selects `sample_count < 5`, so
# clients with exactly 5 rows end up in neither split - confirm this is intended.
train_client_ids = sample_count[(sample_count > 5) & (sample_count < 500)].index.tolist()

In [5]:
def filter_sample_by_clients_ids(data: pd.DataFrame, clients_ids: List[str]):
    """Return the rows of ``data`` whose ``client_id`` is in ``clients_ids``.

    Args:
        data: Table with a ``client_id`` column.
        clients_ids: Client ids to keep.

    Returns:
        The matching rows of ``data``; index labels are left as they were.
    """
    belongs_to_selected = data['client_id'].isin(clients_ids)
    return data.loc[belongs_to_selected]

In [6]:
# Rows of `original` belonging to the training clients, reduced to the
# client_id and path columns.
train_original_sample = filter_sample_by_clients_ids(
    original, train_client_ids
)[['client_id', 'path']]

In [7]:
# Rows of `generated` belonging to the training clients, reduced to the
# client_id and generated_path columns.
train_generated_sample = filter_sample_by_clients_ids(
    generated, train_client_ids
)[['client_id', 'generated_path']]

In [8]:
# Rename generated_path to path so this table has the same column names as
# the original sample.
train_generated_sample = train_generated_sample.rename(
    {"generated_path": "path"}, axis="columns"
)

In [9]:
_PAIR_COLUMNS = [
    'anchor_client_id', 'anchor_path',
    'posneg_client_id', 'posneg_path',
    'anchor_source', 'posneg_source',
]


def _make_pairs(anchor: pd.DataFrame, posneg: pd.DataFrame) -> pd.DataFrame:
    """Place ``anchor`` and ``posneg`` side by side in the pair-table layout."""
    pairs = pd.DataFrame(columns=_PAIR_COLUMNS)
    # The three columns of each input are copied positionally, i.e. they are
    # read as (client_id, path, source) regardless of their names.
    # NOTE(review): pandas appears to align the second assignment on the index
    # established by the first one, so a `posneg` longer than `anchor` is cut
    # to `anchor`'s length and a shorter one leaves NaN in the posneg_*
    # columns - confirm downstream code copes with such rows.
    pairs[['anchor_client_id', 'anchor_path', 'anchor_source']] = anchor
    pairs[['posneg_client_id', 'posneg_path', 'posneg_source']] = posneg
    return pairs


def concatenation(train_data: pd.DataFrame, anchor: pd.DataFrame, posneg: pd.DataFrame) -> pd.DataFrame:
    """Append the pairs built from ``anchor`` and ``posneg`` to ``train_data``.

    Args:
        train_data: Pair table accumulated so far.
        anchor: Three-column frame read positionally as (client_id, path, source).
        posneg: Three-column frame read positionally as (client_id, path, source).

    Returns:
        A new frame holding the rows of ``train_data`` followed by the new
        pairs, with a fresh 0..n-1 index. ``train_data`` itself is not modified.
    """
    return pd.concat([train_data, _make_pairs(anchor, posneg)], ignore_index=True)


def generate_combined_dataset(original_data: pd.DataFrame, generated_data: pd.DataFrame, data_frac: float = 0.5) -> pd.DataFrame:
    """Build a table of (anchor, positive-or-negative) sample pairs.

    For every distinct ``client_id`` in ``original_data`` four groups of pairs
    are produced, each with an original sample of that client as anchor:

    1. anchor vs. another draw of the same client's original samples,
    2. anchor vs. original samples of all other clients,
    3. same as 1 (drawn again),
    4. anchor vs. the same client's generated samples.

    Args:
        original_data: Frame whose first two columns are client_id and path.
        generated_data: Frame with the same layout for generated samples.
        data_frac: Fraction passed to ``DataFrame.sample`` for every draw.

    Returns:
        A frame with the columns anchor_client_id, anchor_path,
        posneg_client_id, posneg_path, anchor_source and posneg_source, where
        the source columns hold 'original' or 'generated'. No label column is
        added. The draws are random; no seed is set here.
    """
    # Label copies instead of writing a 'source' column into the caller's
    # frames (which are typically slices of larger tables).
    originals = original_data.assign(source='original')
    generated = generated_data.assign(source='generated')

    def draw(pool: pd.DataFrame) -> pd.DataFrame:
        return pool.sample(frac=data_frac).reset_index(drop=True)

    pieces = []
    for client_id in original_data['client_id'].unique():
        is_client = originals['client_id'] == client_id
        own_original = originals[is_client]
        other_original = originals[~is_client]
        own_generated = generated[generated['client_id'] == client_id]

        # Anchor is always drawn before its counterpart, and the four groups
        # are drawn in this fixed order, so a seeded global RNG is consumed in
        # a stable sequence.
        pieces.append(_make_pairs(draw(own_original), draw(own_original)))
        pieces.append(_make_pairs(draw(own_original), draw(other_original)))
        pieces.append(_make_pairs(draw(own_original), draw(own_original)))
        pieces.append(_make_pairs(draw(own_original), draw(own_generated)))

    if not pieces:
        return pd.DataFrame(columns=_PAIR_COLUMNS)
    # A single concat at the end avoids re-copying the growing table on every
    # loop iteration.
    return pd.concat(pieces, ignore_index=True)

In [14]:
# Output location for the training pair table: folder, file name, and the
# two joined into the full path.
save_folder = '../dataset/'
dataset_name = 'train_combined_dataset.csv'
save_path = f'{save_folder}{dataset_name}'

In [15]:
# Build the training pairs from the full samples (data_frac=1) and write them
# to save_path as CSV without the index column.
train_combined_dataset = generate_combined_dataset(
    train_original_sample,
    train_generated_sample,
    data_frac=1,
)
train_combined_dataset.to_csv(path_or_buf=save_path, index=False)

In [16]:
# Test clients: those with strictly fewer than 5 rows in `original`.
test_client_ids = sample_count.loc[sample_count.lt(5)].index.tolist()

In [17]:
# Rows of `original` belonging to the test clients, reduced to the client_id
# and path columns.
test_original_sample = filter_sample_by_clients_ids(
    original, test_client_ids
)[['client_id', 'path']]

In [18]:
# Rows of `generated` belonging to the test clients, reduced to client_id and
# generated_path, with generated_path renamed to path.
test_generated_sample = (
    filter_sample_by_clients_ids(generated, test_client_ids)[['client_id', 'generated_path']]
    .rename(columns={"generated_path": "path"})
)

In [21]:
# Switch the output file name to the test pair table; save_folder is reused.
dataset_name = 'test_combined_dataset.csv'
save_path = f'{save_folder}{dataset_name}'

In [22]:
# Build the test pairs from the full samples (data_frac=1) and write them to
# save_path as CSV without the index column.
test_combined_dataset = generate_combined_dataset(
    test_original_sample,
    test_generated_sample,
    data_frac=1,
)
test_combined_dataset.to_csv(path_or_buf=save_path, index=False)