# 01. Data Exploration

In [38]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

## Read data

In [43]:
# Load the raw combined dataset from the Excel workbook
db_orig = pd.read_excel("../data/raw/archivo_combinado.xlsx")

In [48]:
# Work on a copy so db_orig keeps the data exactly as loaded
db = db_orig.copy()

In [49]:
# Split each compound column on its first space into an "_id" part and a "_rel" part
for compound_col in ('FAM_Sample_info', 'VIC_Sample_info'):
    pieces = db[compound_col].str.split(' ', n = 1, expand = True)
    db[[compound_col + '_id', compound_col + '_rel']] = pieces
# The compound columns themselves are dropped once they have been split
db = db.drop(columns = ['FAM_Sample_info', 'VIC_Sample_info'])

In [51]:
# Columns used together as the uniqueness key of a row
uniqueness_features = [
    "FAM_Sample_info_id",
    "FAM_Sample_info_rel",
    "VIC_Sample_info_id",
    "VIC_Sample_info_rel",
    "Software",
    "Datos",
]
# Rows repeating the whole key are removed; the first occurrence is kept
db = db.drop_duplicates(subset = uniqueness_features)

In [52]:
# Keep a reproducible 1% random sample and renumber its rows from 0
db = db.sample(frac = 0.01, random_state = 31, ignore_index = True)

In [53]:
# Add the base-10 logarithm of Value as the ValueLog10 column
db = db.assign(ValueLog10 = np.log10(db['Value']))

In [54]:
# Binary target: 0 where True_positive equals False, 1 everywhere else
is_negative = db["True_positive"].eq(False)
db["label"] = np.where(is_negative, 0, 1)

In [55]:
# Min-max scale the selected features into new "<name>_scaled" columns
scaler = MinMaxScaler()

varsToScale = ['k1', 'k2', 'ValueLog10']
newVarNames = [f'{col}_scaled' for col in varsToScale]

# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down — confirm this is intended.
scaled_values = scaler.fit_transform(db[varsToScale])
db[newVarNames] = scaled_values

In [56]:
# Display the prepared dataframe
db

Unnamed: 0,Shared_Markers,k0,k1,k2,Share_allele,Value,True_positive,Marcadores,Software,Datos,...,Value_range,FAM_Sample_info_id,FAM_Sample_info_rel,VIC_Sample_info_id,VIC_Sample_info_rel,ValueLog10,label,k1_scaled,k2_scaled,ValueLog10_scaled
0,13,2,5,8,21,5.71998,False,15,BlindSearch,Simulados,...,0-99,044502,U,066701,U,0.757395,0,0.384615,0.666667,0.067020
1,13,2,5,8,21,4.08628,False,15,BlindSearch,Real,...,0-99,20100767,M,20100723,M,0.611328,0,0.384615,0.666667,0.054095
2,11,4,7,4,15,1.79274,False,15,BlindSearch,Real,...,0-99,20110675,F,20110558,U,0.253517,0,0.538462,0.333333,0.022433
3,11,4,4,7,18,2.00000,False,15,ScreenMatch,Simulados,...,0-99,057202,U,143402,S,0.301030,0,0.307692,0.583333,0.026637
4,13,2,7,6,19,20.95200,False,15,BlindSearch,Simulados,...,0-99,013702,U,078401,U,1.321225,0,0.538462,0.500000,0.116912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4734,10,5,4,6,16,40.00000,False,15,ScreenMatch,Real,...,0-99,20160344,F,20140134,F,1.602060,0,0.307692,0.500000,0.141762
4735,12,3,5,7,19,1.00798,False,15,BlindSearch,Real,...,0-99,20100427,M,20100370,F,0.003452,0,0.384615,0.583333,0.000305
4736,12,3,5,7,19,2.00000,False,15,ScreenMatch,Real,...,0-99,20160191,M,20130089,M,0.301030,0,0.384615,0.583333,0.026637
4737,10,5,6,4,14,1.00000,False,15,ScreenMatch,Real,...,0-99,20120646,S,20200233,M,0.000000,0,0.461538,0.333333,0.000000


In [57]:
# 3-D scatter of the scaled features in db, coloured by the label column
scatter_args = dict(x = 'k1_scaled',
                    y = 'k2_scaled',
                    z = 'ValueLog10_scaled',
                    color = 'label')
fig = px.scatter_3d(db, **scatter_args)
fig.update_traces(marker = dict(size = 5))
fig.show()

## Compute the Third Class

In [58]:
# Distance threshold used by the pairwise distance check in the next cell
epsilon = 0.5
# Columns whose (unscaled) values are the coordinates for that distance
space_cols = ['k1', 'k2', 'ValueLog10']
# Copy that receives the updated labels; db itself keeps the original 0/1 labels
new_db = db.copy()

In [59]:
# Flag as ambiguous (label 0.5) every point that has at least one point of the
# opposite class within `epsilon`, using the Euclidean distance over `space_cols`.
#
# The distances are computed block-wise with NumPy instead of a nested
# iterrows() loop that copied db on every outer iteration. Rows are addressed
# by position, so the result does not depend on db having a 0..n-1 index.
coords = db[space_cols].to_numpy(dtype = float)
labels = db['label'].to_numpy()
class0_pos = np.flatnonzero(labels == 0)
class1_pos = np.flatnonzero(labels == 1)

is_ambiguous = np.zeros(len(db), dtype = bool)
block_size = 512  # bounds the size of each temporary distance matrix
for start0 in range(0, len(class0_pos), block_size):
    block0 = class0_pos[start0:start0 + block_size]
    for start1 in range(0, len(class1_pos), block_size):
        block1 = class1_pos[start1:start1 + block_size]
        # deltas[i, j, :] is the coordinate difference between block0[i] and block1[j]
        deltas = coords[block0][:, None, :] - coords[block1][None, :, :]
        within_eps = np.linalg.norm(deltas, axis = 2) <= epsilon
        # Both members of a close opposite-class pair become ambiguous
        is_ambiguous[block0[within_eps.any(axis = 1)]] = True
        is_ambiguous[block1[within_eps.any(axis = 0)]] = True

# 0.5 is not an integer, so the label column is cast to float explicitly
# instead of relying on pandas silently upcasting it on assignment.
new_db['label'] = new_db['label'].astype(float).mask(is_ambiguous, 0.5)

In [60]:
# Display the dataframe that holds the updated labels
new_db

Unnamed: 0,Shared_Markers,k0,k1,k2,Share_allele,Value,True_positive,Marcadores,Software,Datos,...,Value_range,FAM_Sample_info_id,FAM_Sample_info_rel,VIC_Sample_info_id,VIC_Sample_info_rel,ValueLog10,label,k1_scaled,k2_scaled,ValueLog10_scaled
0,13,2,5,8,21,5.71998,False,15,BlindSearch,Simulados,...,0-99,044502,U,066701,U,0.757395,0.0,0.384615,0.666667,0.067020
1,13,2,5,8,21,4.08628,False,15,BlindSearch,Real,...,0-99,20100767,M,20100723,M,0.611328,0.0,0.384615,0.666667,0.054095
2,11,4,7,4,15,1.79274,False,15,BlindSearch,Real,...,0-99,20110675,F,20110558,U,0.253517,0.0,0.538462,0.333333,0.022433
3,11,4,4,7,18,2.00000,False,15,ScreenMatch,Simulados,...,0-99,057202,U,143402,S,0.301030,0.0,0.307692,0.583333,0.026637
4,13,2,7,6,19,20.95200,False,15,BlindSearch,Simulados,...,0-99,013702,U,078401,U,1.321225,0.0,0.538462,0.500000,0.116912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4734,10,5,4,6,16,40.00000,False,15,ScreenMatch,Real,...,0-99,20160344,F,20140134,F,1.602060,0.0,0.307692,0.500000,0.141762
4735,12,3,5,7,19,1.00798,False,15,BlindSearch,Real,...,0-99,20100427,M,20100370,F,0.003452,0.0,0.384615,0.583333,0.000305
4736,12,3,5,7,19,2.00000,False,15,ScreenMatch,Real,...,0-99,20160191,M,20130089,M,0.301030,0.0,0.384615,0.583333,0.026637
4737,10,5,6,4,14,1.00000,False,15,ScreenMatch,Real,...,0-99,20120646,S,20200233,M,0.000000,0.0,0.461538,0.333333,0.000000


In [61]:
# Same 3-D scatter as before, now drawn from new_db and coloured by its label column
three_class_args = dict(x = 'k1_scaled',
                        y = 'k2_scaled',
                        z = 'ValueLog10_scaled',
                        color = 'label')
fig = px.scatter_3d(new_db, **three_class_args)
fig.update_traces(marker = dict(size = 5))
fig.show()

## Split Data

In [62]:
# Fraction of the simulated rows that goes into the training split
train_frac = 0.7

In [63]:
# Keep the simulated rows and the real rows in separate dataframes
is_simulated = new_db["Datos"].eq("Simulados")
is_real = new_db["Datos"].eq("Real")
db_simulados = new_db.loc[is_simulated]
db_real = new_db.loc[is_real]

In [64]:
# Number of simulated rows, and how many of them go to training
sim_samples = db_simulados.shape[0]
sim_samples_train = int(train_frac * sim_samples)

# Shuffle the rows reproducibly; ignore_index renumbers them from 0
db_simulados = db_simulados.sample(frac = 1, random_state = 31, ignore_index = True)

# The first sim_samples_train shuffled rows form the train set, the rest the test set
db_train = db_simulados.iloc[:sim_samples_train]
db_test = db_simulados.iloc[sim_samples_train:]

In [65]:
# Display the test split
db_test

Unnamed: 0,Shared_Markers,k0,k1,k2,Share_allele,Value,True_positive,Marcadores,Software,Datos,...,Value_range,FAM_Sample_info_id,FAM_Sample_info_rel,VIC_Sample_info_id,VIC_Sample_info_rel,ValueLog10,label,k1_scaled,k2_scaled,ValueLog10_scaled
706,11,4,4,7,18,1.000000e+00,False,15,ScreenMatch,Simulados,...,0-99,025602,S,026002,U,0.000000,0.0,0.307692,0.583333,0.000000
707,14,1,3,11,25,2.000000e+11,True,15,ScreenMatch,Simulados,...,1000000+,118201,S,118201,U,11.301030,1.0,0.230769,0.916667,1.000000
708,13,2,6,7,20,5.436900e+00,False,15,BlindSearch,Simulados,...,0-99,148301,S,150002,S,0.735351,0.0,0.461538,0.583333,0.065069
709,13,2,6,7,20,1.000000e+00,False,15,ScreenMatch,Simulados,...,0-99,067301,S,079802,S,0.000000,0.0,0.461538,0.583333,0.000000
710,11,4,4,7,18,2.000000e+00,False,15,ScreenMatch,Simulados,...,0-99,031602,S,083602,S,0.301030,0.0,0.307692,0.583333,0.026637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1004,12,3,2,10,22,4.000000e+00,False,15,ScreenMatch,Simulados,...,0-99,001501,U,110001,S,0.602060,0.0,0.153846,0.833333,0.053275
1005,11,4,6,5,16,3.816410e+00,False,15,BlindSearch,Simulados,...,0-99,133801,S,057902,S,0.581655,0.0,0.461538,0.416667,0.051469
1006,13,2,8,5,18,8.000000e+00,False,15,ScreenMatch,Simulados,...,0-99,127002,U,117802,U,0.903090,0.0,0.615385,0.416667,0.079912
1007,13,2,6,7,20,2.043200e+01,False,15,BlindSearch,Simulados,...,0-99,113002,S,123202,S,1.310311,0.0,0.461538,0.583333,0.115946


In [66]:
# Folder that receives the three output files
base_path = '../data/processed/three-classes/'

# to_csv does not create missing folders, so make sure the target exists first
os.makedirs(base_path, exist_ok = True)

# One CSV per dataframe, written without the dataframe index
for file_name, frame in (('real.csv', db_real),
                         ('train.csv', db_train),
                         ('test.csv', db_test)):
    frame.to_csv(base_path + file_name, index = False)