# Data Preparation


In [12]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

## Read File


In [13]:
# Load the merged patient table (outliers already removed, per the file name).
# The CSV is ';'-separated, its first column becomes the row index, and the
# literal string 'n.a.' is parsed as a missing value.
file = 'DataPatientsWithGTNoOutliers.csv'
column_dtypes = {'Stage': 'object', 'Age': 'float64'}
df = pd.read_csv(
    file,
    sep=';',
    index_col=0,
    na_values='n.a.',
    dtype=column_dtypes,
)


## Split between training and testing (external data split)


In [14]:
# External hold-out split: 80 % of the rows go to training, 20 % to testing.
# The fixed seed keeps the shuffled partition reproducible.
# NOTE(review): no `stratify=` is passed, so class proportions can differ
# slightly between the two parts (see the balance check further down).
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)


In [15]:
# Dimensions of the training split as (rows, columns)
df_train.shape

(1879, 5261)

In [16]:
# Dimensions of the held-out test split as (rows, columns)
df_test.shape

(470, 5261)

### Check that the class proportions are similar across the splits

In [17]:
# Percentage of rows labelled 'nonMalignant' in the full table and in each
# split, to check that the split kept the class proportions comparable.
# Every label names the DataFrame that is actually measured: the held-out
# part is `df_test` (there is no `df_val` in this notebook).
for label, frame in (("df", df), ("df_train", df_train), ("df_test", df_test)):
    # Count matching rows as a plain int so the printed ratio is a Python float.
    n_non_malignant = int((frame['Class_group'] == 'nonMalignant').sum())
    print("% of nonMalignant in " + label, 100 * n_non_malignant / frame.shape[0])

% of nonMalignant in df 30.779054916985952
% of nonMalignant in df_train 30.122405534858967
% of nonMalignant in df_val 33.40425531914894


## Remove genes that carry little information

### Genes with low variance

In [18]:
# Per-gene variance over the training samples only.
# DataFrame.var() uses the unbiased estimator by default (divides by N-1).
# The last 10 columns are sample metadata (Sample ID ... Patient_group),
# so they are left out of the gene slice.
columns = df_train.columns
df_genes = df_train.loc[:, columns[:-10]]
var_genes = df_genes.var()
var_genes.describe()

count    5.251000e+03
mean     9.924783e+05
std      3.306271e+07
min      3.998097e+00
25%      1.500777e+02
50%      5.714121e+02
75%      3.057345e+03
max      1.806556e+09
dtype: float64

In [19]:
# Flag every gene whose training-set variance is below 150, a cut-off close
# to the first quartile (~150.08) reported by describe() above.
# Going by the recorded frame shapes (5261 -> 3948 columns), 1313 genes are
# flagged here.
low_variance_mask = var_genes < 150
genes_to_remove = var_genes.loc[low_variance_mask].index

In [20]:
# Drop the flagged low-variance genes from both splits.
# The genes are chosen on the training data only and the same columns are then
# removed from the test data, so both frames keep an identical layout.
# Reassigning (instead of inplace=True) avoids mutating the objects returned by
# the split, and errors="ignore" makes re-running this cell a harmless no-op
# rather than a KeyError on already-removed columns.
df_train = df_train.drop(columns=genes_to_remove, errors="ignore")
df_test = df_test.drop(columns=genes_to_remove, errors="ignore")

In [22]:
# Shapes after the low-variance filter.
# Both splits must still expose exactly the same columns, in the same order.
assert df_train.columns.equals(df_test.columns), "train/test columns diverged"
print("Train: ", df_train.shape)
# The held-out frame is the test split, so it is labelled as such.
print("Test: ", df_test.shape)

Train:  (1879, 3948)
Val:  (470, 3948)


### Genes with low maximum value

In [27]:
# Largest value each gene reaches across the training samples; genes that
# never exceed the threshold are filtered out in the next cells.
# The last 10 columns are sample metadata and are left out of the gene slice.
# NOTE(review): the recorded describe() output (count 2881, min ~200) looks
# like it was captured after that filter had already been applied.
columns = df_train.columns
df_genes = df_train.loc[:, columns[:-10]]
max_genes = df_genes.max()
max_genes.describe()

count      2881.000000
mean       1931.885037
std       13943.823300
min         200.089580
25%         295.125520
50%         480.801024
75%         991.302078
max      591697.185022
dtype: float64

In [28]:
# Flag every gene whose maximum over the training samples stays below 200.
# Going by the recorded frame shapes (3948 -> 2891 columns), 1057 genes are
# flagged here.
low_max_mask = max_genes < 200
genes_to_remove = max_genes.loc[low_max_mask].index

In [29]:
# Drop the flagged low-maximum genes from both splits.
# The genes are chosen on the training data only and the same columns are then
# removed from the test data, so both frames keep an identical layout.
# Reassigning (instead of inplace=True) avoids mutating the existing frames in
# place, and errors="ignore" makes re-running this cell a harmless no-op rather
# than a KeyError on already-removed columns.
df_train = df_train.drop(columns=genes_to_remove, errors="ignore")
df_test = df_test.drop(columns=genes_to_remove, errors="ignore")

In [31]:
# Shapes after the low-maximum filter.
# Both splits must still expose exactly the same columns, in the same order.
assert df_train.columns.equals(df_test.columns), "train/test columns diverged"
print("Train: ", df_train.shape)
# The held-out frame is the test split, so it is labelled as such.
print("Test: ", df_test.shape)

Train:  (1879, 2891)
Val:  (470, 2891)


## Save data

In [32]:
# Persist the filtered train and test splits.
# Same conventions as the input file: ';' separator, row index written as the
# first column, and missing values spelled 'n.a.'.
for file, frame in (
    ('DataPatients_train.csv', df_train),
    ('DataPatients_test.csv', df_test),
):
    frame.to_csv(file, sep=';', na_rep='n.a.')

In [33]:
# Preview the first two rows of the filtered test split
df_test.head(2)

Unnamed: 0,ENSG00000000419,ENSG00000000938,ENSG00000002330,ENSG00000002549,ENSG00000002586,ENSG00000003056,ENSG00000003436,ENSG00000003756,ENSG00000004059,ENSG00000004142,...,Sample ID,Stage,Sex,Age,Sample-supplying institution,Training series,Evaluation series,Validation series,Class_group,Patient_group
2288-MGH-NSCLC-L51-TR521,10.029376,9.117101,41.958618,52.46693,3943.586674,20.162047,111.060717,0.0,419.563521,2.884146,...,MGH-NSCLC-L51-TR521,IV,F,64.0,Institute 4,0,0,1,Malignant,Non-Small-Cell Lung Cancer
783-MGH-BrCa-P16-TR613,7.703162,24.275263,43.883211,42.312633,3774.882464,28.428898,87.091961,3.796378,377.11326,17.229311,...,MGH-BrCa-P16-TR613,IV,F,58.0,Institute 4,0,0,1,Malignant,Breast Cancer
