# 01. Data Split

In [82]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [83]:
# Seed passed as `random_state` to the shuffling steps so the split is reproducible.
rnd_nbr = 42
# Target share of training rows in the train:test volume ratio (0.7 -> 70:30).
prop_volumen = 0.7

## Read Data

In [84]:
# Load the combined raw workbook into a single DataFrame.
raw_workbook = "../data/raw/archivo_combinado.xlsx"
db = pd.read_excel(raw_workbook)

### Preprocesamiento

In [85]:
# Keep only the rows whose Kinship value is "Paternidad".
is_paternity = db["Kinship"] == "Paternidad"
db = db[is_paternity]

In [86]:
# Columns that together identify a record; rows repeating the same combination
# are dropped (drop_duplicates keeps the first occurrence by default).
uniqueness_features = ["FAM_Sample_info", "VIC_Sample_info", "Software", "Datos"]
db = db.drop_duplicates(subset=uniqueness_features)

In [87]:
# Split the compound sample-info columns at the first space into an "_id" part
# and a "_rel" part, then discard the original compound columns.
fam_parts = db['FAM_Sample_info'].str.split(' ', n=1, expand=True)
vic_parts = db['VIC_Sample_info'].str.split(' ', n=1, expand=True)
db = db.assign(
    FAM_Sample_info_id=fam_parts[0],
    FAM_Sample_info_rel=fam_parts[1],
    VIC_Sample_info_id=vic_parts[0],
    VIC_Sample_info_rel=vic_parts[1],
).drop(columns=['FAM_Sample_info', 'VIC_Sample_info'])

In [88]:
# Drop the rows that fail the arithmetic check: k0 + k1 + k2 must equal Marcadores.
k_total = db["k0"] + db["k1"] + db["k2"]
db = db[k_total == db["Marcadores"]]

### Aplicar algunas transformaciones

In [89]:
# Recode the label as an integer: 0 where the value equals False, 1 otherwise.
is_negative = db["True_positive"] == False
db["True_positive"] = np.where(is_negative, 0, 1)

In [90]:
# Add the base-10 logarithm of Value as a new column.
# NOTE(review): assumes Value is strictly positive; zeros or negatives would
# yield -inf / NaN here — confirm against the raw data.
value_log10 = np.log10(db["Value"])
db["ValueLog10"] = value_log10

In [91]:
# Preview the first five rows (head() defaults to n=5) after preprocessing.
db.head()

Unnamed: 0,Shared_Markers,k0,k1,k2,Share_allele,Value,True_positive,Marcadores,Software,Datos,Kinship,File,Hoja,Sheet,Value_range,FAM_Sample_info_id,FAM_Sample_info_rel,VIC_Sample_info_id,VIC_Sample_info_rel,ValueLog10
0,15,0,4,11,26,236354.0,0,15,BlindSeach,Simulados,Paternidad,15_BlindSeach_Simulados_Paternidad_allele,ipi-1,1:749,100000-999999,12102,F,65402,F,5.373563
1,15,0,7,8,23,184374.0,0,15,BlindSeach,Simulados,Paternidad,15_BlindSeach_Simulados_Paternidad_allele,ipi-1,1:749,100000-999999,23802,S,99602,F,5.2657
2,15,0,6,9,24,83900.6,0,15,BlindSeach,Simulados,Paternidad,15_BlindSeach_Simulados_Paternidad_allele,ipi-1,1:749,10000-99999,47302,S,62402,F,4.923765
3,15,0,10,5,20,41143.0,0,15,BlindSeach,Simulados,Paternidad,15_BlindSeach_Simulados_Paternidad_allele,ipi-1,1:749,10000-99999,116102,F,116702,F,4.614296
4,15,0,8,7,22,30327.8,0,15,BlindSeach,Simulados,Paternidad,15_BlindSeach_Simulados_Paternidad_allele,ipi-1,1:749,10000-99999,122502,F,83702,S,4.481841


## Partición

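Primero se calcula la fracción de los datos simulados que debe asignarse a entrenamiento para que, **después** de balancear las clases con undersampling, la relación de volumen entre entrenamiento y prueba sea de 70:30. Si $p$ es la proporción de positivos en los datos simulados, $v$ es `prop_volumen` y $f$ es la fracción buscada, el conjunto de entrenamiento balanceado tendrá $2fpN$ filas y el de prueba $(1-f)N$; igualando $\frac{2fp}{1-f} = \frac{v}{1-v}$ se obtiene $f = \dfrac{1}{2\,\frac{1-v}{v}\,p + 1}$.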

In [92]:
# Guardar los datos simulados y reales en dataframes distintos...
db_simulados = db[db.Datos == "Simulados"]
db_real = db[db.Datos == "Real"]

In [93]:
# Fraction of the simulated rows to place in train so that, once the negatives
# are undersampled to match the positives, the train:test volume ratio is
# prop_volumen : (1 - prop_volumen).
counts = db_simulados['True_positive'].value_counts()
positive_share = counts[1] / (counts[0] + counts[1])
test_to_train = (1 - prop_volumen) / prop_volumen
train_frac = 1 / (2 * test_to_train * positive_share + 1)
print(f"La proporción ideal para partir es {train_frac}.")

La proporción ideal para partir es 0.8857316980269673.


Una vez hecho el cálculo se hace la partición...

In [94]:
# Number of simulated rows, and how many of them go to the training split
# (int() truncates toward zero).
sim_samples = db_simulados.shape[0]
sim_samples_train = int(train_frac * sim_samples)

In [95]:
# Shuffle the simulated rows with a fixed seed and a fresh 0..n-1 index.
db_simulados = db_simulados.sample(frac=1, random_state=rnd_nbr, ignore_index=True)

# The first sim_samples_train shuffled rows form the training set; the rest is the test set.
db_train = db_simulados.iloc[:sim_samples_train]
db_test = db_simulados.iloc[sim_samples_train:]

## Balanceo de Clases con Undersampling

In [96]:
# Balance the training set by undersampling: keep every positive row and draw
# an equally sized random subset of the negative rows.
db_train_true = db_train[db_train.True_positive == True]
# The draw is seeded with rnd_nbr (like the shuffles in the other cells) so the
# same negatives are selected on every run and train.csv is reproducible.
# NOTE(review): sample() raises ValueError if there are fewer negatives than
# positives — presumably positives are the minority class; confirm.
db_train_false = db_train[db_train.True_positive == False].sample(
    n = len(db_train_true), random_state = rnd_nbr
)

In [97]:
# Stack positives and sampled negatives into one frame, then shuffle with the
# fixed seed so the two classes are interleaved.
balanced_parts = [db_train_true, db_train_false]
db_train = pd.concat(balanced_parts, axis=0, ignore_index=True)
db_train = db_train.sample(frac=1, random_state=rnd_nbr, ignore_index=True)

### Save dataframes as csv

In [98]:
# Write the real, train and test sets as CSV files (index column omitted).
base_path = '../data/processed/definitve-two-classes/'

# to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs(base_path, exist_ok = True)

db_real.to_csv(base_path + 'real.csv', index = False)
db_train.to_csv(base_path + 'train.csv', index = False)
db_test.to_csv(base_path + 'test.csv', index = False)