In [1]:
import pandas as pd 
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import chi2
from imblearn.over_sampling import SVMSMOTE
from sklearn.model_selection import train_test_split
from skfda.preprocessing.dim_reduction import variable_selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [2]:
results_dir = Path('./results')
preprocessed_clinical_file = results_dir/'preprocessed_clinical_data_v2.csv'
labels_file = results_dir/'labels.csv'

In [3]:
clinical_df = pd.read_csv(preprocessed_clinical_file)
labels_df = pd.read_csv(labels_file)
labels_df = labels_df.set_index('#Patient Identifier')
labels_df

Unnamed: 0_level_0,event
#Patient Identifier,Unnamed: 1_level_1
MB-0000,0
MB-0002,0
MB-0005,0
MB-0006,0
MB-0008,1
...,...
MB-7295,0
MB-7296,1
MB-7297,0
MB-7298,0


In [43]:
gene_exp_df = pd.read_csv(results_dir/'gene_expression.csv')
gene_exp_df.shape

(20603, 1983)

In [44]:
cna_df = pd.read_csv(results_dir/'cna.csv')
cna_df.shape

(22544, 1983)

## Transpose and remove some columns for future work

In [45]:
gene_exp_df = gene_exp_df.reset_index(drop=True).drop(gene_exp_df.columns[0],axis=1)
gene_exp_df.rename(columns={gene_exp_df.columns[0]: '#Patient Identifier' }, inplace = True)
gene_exp_df = gene_exp_df.set_index('#Patient Identifier')
gene_exp_df=gene_exp_df.drop(columns=['Entrez_Gene_Id']).transpose()
gene_exp_df

#Patient Identifier,RERE,RNF165,PHF7,CIDEA,TENT2,SLC17A3,SDS,ATP6V1C2,F3,FAM71C,...,SBF2-AS1,VN1R4,TRPV5,UGGT1,CR590356,VPS72,CSMD3,CC2D1A,IGSF9,FAM71A
MB-0362,8.676978,6.075331,5.838270,6.397503,7.906217,5.702379,6.930741,5.332863,5.275676,5.443896,...,5.335488,5.309543,5.449278,7.688492,5.783457,8.084979,5.161796,6.353215,7.304643,5.049591
MB-0346,9.653589,6.687887,5.600876,5.246319,8.267256,5.521794,6.141689,7.563477,5.376381,5.319857,...,5.471839,5.579198,5.657015,7.804165,6.487002,8.349115,5.197392,6.132355,7.933324,5.316790
MB-0386,9.033589,5.910885,6.030718,10.111816,7.959291,5.689533,6.529312,5.482155,5.463788,5.254294,...,5.322604,5.484665,5.556244,7.934309,6.011908,8.406332,8.087722,6.366335,7.580336,5.461617
MB-0574,8.814855,5.628740,5.849428,6.116868,9.206376,5.439130,6.430102,5.398675,5.409761,5.512298,...,5.520733,5.419803,5.450339,7.744562,6.032705,8.310019,5.780062,6.424048,6.903654,5.227130
MB-0185,8.736406,6.392422,5.542133,5.184098,8.162845,5.464326,6.105427,5.026018,5.338580,5.430874,...,5.542151,5.340978,5.616935,7.613439,5.678840,8.161977,5.327687,6.252966,6.848395,5.057761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MB-5453,9.606915,7.427494,5.927409,5.927031,8.520545,5.550549,5.841476,5.832439,7.180979,5.437498,...,5.199192,5.153886,5.535933,7.048923,6.231014,7.560101,5.397010,7.088676,7.248336,5.436415
MB-5471,9.049296,6.850000,6.117095,6.374305,8.499637,5.497546,6.351428,5.544773,5.919886,5.374992,...,5.377531,5.434900,5.454926,7.733413,6.154021,7.941895,5.415928,6.110477,7.596215,5.094339
MB-5127,8.858622,6.550450,5.936371,5.963092,9.320207,5.690297,7.280037,5.171260,5.937993,5.338428,...,5.267150,5.488637,5.630339,7.311774,5.983834,7.866579,5.242482,6.316304,6.620605,5.350708
MB-4313,8.415867,6.831722,7.408960,9.181768,6.804085,5.730308,7.642485,5.900949,6.019427,5.097951,...,6.264994,5.506242,5.570980,9.652446,7.511728,7.752503,5.320171,7.594277,7.437116,5.329209


In [46]:
cna_df = cna_df.reset_index(drop=True).drop(cna_df.columns[0],axis=1)
cna_df.rename(columns={cna_df.columns[0]: '#Patient Identifier' }, inplace = True)
cna_df = cna_df.set_index('#Patient Identifier')
cna_df=cna_df.drop(columns=['Entrez_Gene_Id']).transpose()
cna_df

#Patient Identifier,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
MB-0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MB-0039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0
MB-0045,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,-2.0,0.0
MB-0046,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,0.0,...,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0
MB-0048,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MB-6020,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,...,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,-1.0,2.0
MB-6213,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MB-6230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0
MB-7148,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Needed to remove negative values in cna data for future work by setting max neg value as 0
min_values = cna_df.min(axis=1)
cna_df_non_negative = cna_df.sub(min_values, axis=0).clip(lower=0)
cna_df = cna_df_non_negative.add(abs(min_values), axis=0)
cna_df.head()

#Patient Identifier,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
MB-0000,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
MB-0039,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0
MB-0045,3.0,3.0,4.0,3.0,3.0,3.0,3.0,4.0,4.0,6.0,...,4.0,4.0,4.0,4.0,4.0,5.0,3.0,4.0,2.0,4.0
MB-0046,4.0,4.0,4.0,3.0,3.0,3.0,3.0,4.0,3.0,4.0,...,4.0,4.0,3.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0
MB-0048,4.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,...,4.0,5.0,4.0,4.0,4.0,4.0,4.0,5.0,3.0,4.0


In [48]:
gene_exp_df['event'] = labels_df.loc[gene_exp_df.index]['event']
gene_exp_df = gene_exp_df.dropna(subset='event')
gene_exp_df

#Patient Identifier,RERE,RNF165,PHF7,CIDEA,TENT2,SLC17A3,SDS,ATP6V1C2,F3,FAM71C,...,VN1R4,TRPV5,UGGT1,CR590356,VPS72,CSMD3,CC2D1A,IGSF9,FAM71A,event
MB-0362,8.676978,6.075331,5.838270,6.397503,7.906217,5.702379,6.930741,5.332863,5.275676,5.443896,...,5.309543,5.449278,7.688492,5.783457,8.084979,5.161796,6.353215,7.304643,5.049591,1
MB-0346,9.653589,6.687887,5.600876,5.246319,8.267256,5.521794,6.141689,7.563477,5.376381,5.319857,...,5.579198,5.657015,7.804165,6.487002,8.349115,5.197392,6.132355,7.933324,5.316790,1
MB-0386,9.033589,5.910885,6.030718,10.111816,7.959291,5.689533,6.529312,5.482155,5.463788,5.254294,...,5.484665,5.556244,7.934309,6.011908,8.406332,8.087722,6.366335,7.580336,5.461617,0
MB-0574,8.814855,5.628740,5.849428,6.116868,9.206376,5.439130,6.430102,5.398675,5.409761,5.512298,...,5.419803,5.450339,7.744562,6.032705,8.310019,5.780062,6.424048,6.903654,5.227130,0
MB-0185,8.736406,6.392422,5.542133,5.184098,8.162845,5.464326,6.105427,5.026018,5.338580,5.430874,...,5.340978,5.616935,7.613439,5.678840,8.161977,5.327687,6.252966,6.848395,5.057761,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MB-5453,9.606915,7.427494,5.927409,5.927031,8.520545,5.550549,5.841476,5.832439,7.180979,5.437498,...,5.153886,5.535933,7.048923,6.231014,7.560101,5.397010,7.088676,7.248336,5.436415,1
MB-5471,9.049296,6.850000,6.117095,6.374305,8.499637,5.497546,6.351428,5.544773,5.919886,5.374992,...,5.434900,5.454926,7.733413,6.154021,7.941895,5.415928,6.110477,7.596215,5.094339,0
MB-5127,8.858622,6.550450,5.936371,5.963092,9.320207,5.690297,7.280037,5.171260,5.937993,5.338428,...,5.488637,5.630339,7.311774,5.983834,7.866579,5.242482,6.316304,6.620605,5.350708,0
MB-4313,8.415867,6.831722,7.408960,9.181768,6.804085,5.730308,7.642485,5.900949,6.019427,5.097951,...,5.506242,5.570980,9.652446,7.511728,7.752503,5.320171,7.594277,7.437116,5.329209,0


In [49]:
cna_df['event'] = labels_df.loc[cna_df.index]['event']
cna_df = cna_df.dropna(subset='event')
cna_df

#Patient Identifier,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,event
MB-0000,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0
MB-0039,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,0
MB-0045,3.0,3.0,4.0,3.0,3.0,3.0,3.0,4.0,4.0,6.0,...,4.0,4.0,4.0,4.0,5.0,3.0,4.0,2.0,4.0,0
MB-0046,4.0,4.0,4.0,3.0,3.0,3.0,3.0,4.0,3.0,4.0,...,4.0,3.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,1
MB-0048,4.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,...,5.0,4.0,4.0,4.0,4.0,4.0,5.0,3.0,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MB-6020,5.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,...,4.0,3.0,3.0,4.0,4.0,4.0,4.0,3.0,6.0,1
MB-6213,4.0,4.0,4.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,0
MB-6230,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,3.0,3.0,4.0,4.0,4.0,0
MB-7148,4.0,4.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,...,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1


## Imputing missing rows using median values

In [50]:
X = gene_exp_df.drop(columns=['event']) 
y = gene_exp_df['event'] 
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
gene_exp_df = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)
gene_exp_df['event'] = labels_df['event']
gene_exp_df.head()

#Patient Identifier,RERE,RNF165,PHF7,CIDEA,TENT2,SLC17A3,SDS,ATP6V1C2,F3,FAM71C,...,VN1R4,TRPV5,UGGT1,CR590356,VPS72,CSMD3,CC2D1A,IGSF9,FAM71A,event
MB-0362,8.676978,6.075331,5.83827,6.397503,7.906217,5.702379,6.930741,5.332863,5.275676,5.443896,...,5.309543,5.449278,7.688492,5.783457,8.084979,5.161796,6.353215,7.304643,5.049591,1
MB-0346,9.653589,6.687887,5.600876,5.246319,8.267256,5.521794,6.141689,7.563477,5.376381,5.319857,...,5.579198,5.657015,7.804165,6.487002,8.349115,5.197392,6.132355,7.933324,5.31679,1
MB-0386,9.033589,5.910885,6.030718,10.111816,7.959291,5.689533,6.529312,5.482155,5.463788,5.254294,...,5.484665,5.556244,7.934309,6.011908,8.406332,8.087722,6.366335,7.580336,5.461617,0
MB-0574,8.814855,5.62874,5.849428,6.116868,9.206376,5.43913,6.430102,5.398675,5.409761,5.512298,...,5.419803,5.450339,7.744562,6.032705,8.310019,5.780062,6.424048,6.903654,5.22713,0
MB-0185,8.736406,6.392422,5.542133,5.184098,8.162845,5.464326,6.105427,5.026018,5.33858,5.430874,...,5.340978,5.616935,7.613439,5.67884,8.161977,5.327687,6.252966,6.848395,5.057761,1


In [51]:
X = cna_df.drop(columns=['event']) 
y = cna_df['event'] 
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
cna_df = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)
cna_df['event'] = labels_df['event']
cna_df.head()

#Patient Identifier,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,event
MB-0000,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0
MB-0039,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,0
MB-0045,3.0,3.0,4.0,3.0,3.0,3.0,3.0,4.0,4.0,6.0,...,4.0,4.0,4.0,4.0,5.0,3.0,4.0,2.0,4.0,0
MB-0046,4.0,4.0,4.0,3.0,3.0,3.0,3.0,4.0,3.0,4.0,...,4.0,3.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,1
MB-0048,4.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,...,5.0,4.0,4.0,4.0,4.0,4.0,5.0,3.0,4.0,0


In [52]:
gene_exp_comb = gene_exp_df.copy()
cna_comb = cna_df.copy()

## Minority oversampling to fix class imbalance using SVM SMOTE

In [53]:
sm = SVMSMOTE(random_state=42)

X = gene_exp_df.drop(columns=['event'])
y = gene_exp_df['event'].astype('category')
X_res, y_res = sm.fit_resample(X,y)
X_res_df = pd.DataFrame(X_res, columns=gene_exp_df.drop(columns=['event']).columns)
y_res_df = pd.DataFrame(y_res, columns=['event'])

gene_exp_df = pd.concat([X_res_df, y_res_df], axis=1)
gene_exp_df

Unnamed: 0,RERE,RNF165,PHF7,CIDEA,TENT2,SLC17A3,SDS,ATP6V1C2,F3,FAM71C,...,VN1R4,TRPV5,UGGT1,CR590356,VPS72,CSMD3,CC2D1A,IGSF9,FAM71A,event
0,8.676978,6.075331,5.838270,6.397503,7.906217,5.702379,6.930741,5.332863,5.275676,5.443896,...,5.309543,5.449278,7.688492,5.783457,8.084979,5.161796,6.353215,7.304643,5.049591,1
1,9.653589,6.687887,5.600876,5.246319,8.267256,5.521794,6.141689,7.563477,5.376381,5.319857,...,5.579198,5.657015,7.804165,6.487002,8.349115,5.197392,6.132355,7.933324,5.316790,1
2,9.033589,5.910885,6.030718,10.111816,7.959291,5.689533,6.529312,5.482155,5.463788,5.254294,...,5.484665,5.556244,7.934309,6.011908,8.406332,8.087722,6.366335,7.580336,5.461617,0
3,8.814855,5.628740,5.849428,6.116868,9.206376,5.439130,6.430102,5.398675,5.409761,5.512298,...,5.419803,5.450339,7.744562,6.032705,8.310019,5.780062,6.424048,6.903654,5.227130,0
4,8.736406,6.392422,5.542133,5.184098,8.162845,5.464326,6.105427,5.026018,5.338580,5.430874,...,5.340978,5.616935,7.613439,5.678840,8.161977,5.327687,6.252966,6.848395,5.057761,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2973,10.089448,5.305789,5.534365,8.369373,7.777669,5.516465,6.013735,5.311986,5.450231,5.438862,...,5.333695,5.321843,7.612703,5.952500,8.061218,9.443330,6.569641,6.665575,4.979254,1
2974,9.375449,5.236951,6.102195,6.191456,8.076714,5.845849,7.833502,5.493224,5.537083,5.364006,...,5.735634,5.655334,8.456308,8.209493,8.080022,5.208751,6.709596,7.847558,5.455959,1
2975,8.591322,6.035535,5.780881,5.596786,7.312700,5.419849,6.139072,6.094514,6.099713,5.537310,...,5.341426,5.299533,7.379776,6.414569,8.298939,5.198275,6.324936,7.534954,5.350322,1
2976,8.410811,5.406690,5.404390,5.637879,7.679751,5.626149,6.629248,5.420894,5.602825,5.335321,...,5.388964,5.497645,7.295110,6.178994,7.998289,4.996604,6.089505,6.677450,5.225374,1


In [54]:
sm = SVMSMOTE(random_state=42)

X = cna_df.drop(columns=['event']).astype(int)
y = cna_df['event'].astype('category')
X_res, y_res = sm.fit_resample(X,y)
X_res_df = pd.DataFrame(X_res, columns=cna_df.drop(columns=['event']).columns)
y_res_df = pd.DataFrame(y_res, columns=['event'])

cna_df = pd.concat([X_res_df, y_res_df], axis=1)
cna_df

Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,event
0,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,0
1,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,3,4,4,0
2,3,3,4,3,3,3,3,4,4,6,...,4,4,4,4,5,3,4,2,4,0
3,4,4,4,3,3,3,3,4,3,4,...,4,3,3,4,4,4,4,3,4,1
4,4,4,5,4,4,4,4,4,3,4,...,5,4,4,4,4,4,5,3,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2973,4,4,2,6,6,6,6,4,4,4,...,2,3,3,4,4,4,2,3,3,1
2974,4,4,2,5,5,5,5,2,2,4,...,2,3,3,2,5,5,4,3,4,1
2975,4,4,3,5,5,5,5,3,3,4,...,2,2,2,2,5,5,4,3,4,1
2976,4,4,3,4,4,4,4,2,2,4,...,3,3,3,2,5,5,4,3,4,1


## Selecting top 400 features using Chi-Square method

In [55]:
X = gene_exp_df.drop(columns=['event']) 
y = gene_exp_df['event'] 

k=400
chi2_selector = SelectKBest(score_func=chi2, k=k)
X_selected = chi2_selector.fit_transform(X, y)
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_feature_indices = list(selected_feature_indices) + [X.shape[1]]
gene_exp_df = gene_exp_df[gene_exp_df.columns[selected_feature_indices]]
gene_exp_df

Unnamed: 0,TAT,GSTM1,GAL,GLRB,STAC2,BBOF1,AKR7A3,FGFR4,SERHL2,ASS1,...,S100A7A,MCM10,CST5,TFF3,UPP1,PCDHB2,CDH3,SLC29A4,SLC39A6,event
0,5.793455,5.734685,5.264142,7.210677,5.756352,8.120881,9.389657,5.643984,8.122241,6.672313,...,5.402056,6.467056,5.778011,14.282490,7.992337,7.740255,6.660659,7.259379,10.567694,1
1,6.565077,5.813390,7.869450,5.660398,6.233390,6.602225,12.137391,10.917248,12.477518,7.778699,...,6.273570,7.158490,5.273706,13.223147,8.630969,7.905112,8.200274,8.030981,7.420700,1
2,5.401030,5.023541,5.940572,7.197512,5.607358,7.105292,9.530319,5.535386,5.948559,8.858626,...,6.031488,6.547450,9.080347,12.776918,7.887089,7.403539,6.124498,8.726381,10.211003,0
3,5.289979,5.256760,5.632154,8.251696,5.468715,7.302257,7.543094,6.034698,7.544609,7.883977,...,5.294679,6.104780,8.376762,12.260590,7.292711,6.796958,6.182236,6.591816,10.929505,0
4,5.323448,5.320953,6.145348,6.360203,5.362153,6.836189,8.198283,5.768222,6.223962,8.959244,...,5.122453,7.481652,5.678842,10.403710,8.727926,7.936050,8.077851,6.795019,8.356800,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2973,5.654484,3.364080,5.734562,5.477148,6.212355,8.130922,11.942046,6.497590,11.303759,9.807192,...,5.276939,6.198917,5.339553,7.927578,7.789259,6.040451,7.545583,6.365973,7.879914,1
2974,6.409078,4.720472,4.872024,6.307777,3.905886,7.123040,7.839258,6.474858,9.700648,8.553804,...,5.760818,7.249941,5.131709,12.112472,8.757138,11.001287,6.662601,6.489852,6.483831,1
2975,8.310181,9.807547,5.892393,6.313346,8.063927,7.666274,9.508512,9.161449,13.868633,10.548589,...,14.630055,6.924788,5.458524,12.578552,7.443743,9.090072,7.860251,8.699876,8.338464,1
2976,6.554067,5.878456,5.294483,5.836715,8.385726,6.304772,7.582571,8.777301,11.770605,10.962425,...,10.414719,6.327805,5.443732,7.775358,8.902924,6.665993,7.087818,9.071071,7.355686,1


In [56]:
gene_exp_comb = gene_exp_comb[gene_exp_comb.columns[selected_feature_indices]]
gene_exp_comb

#Patient Identifier,TAT,GSTM1,GAL,GLRB,STAC2,BBOF1,AKR7A3,FGFR4,SERHL2,ASS1,...,S100A7A,MCM10,CST5,TFF3,UPP1,PCDHB2,CDH3,SLC29A4,SLC39A6,event
MB-0362,5.793455,5.734685,5.264142,7.210677,5.756352,8.120881,9.389657,5.643984,8.122241,6.672313,...,5.402056,6.467056,5.778011,14.282490,7.992337,7.740255,6.660659,7.259379,10.567694,1
MB-0346,6.565077,5.813390,7.869450,5.660398,6.233390,6.602225,12.137391,10.917248,12.477518,7.778699,...,6.273570,7.158490,5.273706,13.223147,8.630969,7.905112,8.200274,8.030981,7.420700,1
MB-0386,5.401030,5.023541,5.940572,7.197512,5.607358,7.105292,9.530319,5.535386,5.948559,8.858626,...,6.031488,6.547450,9.080347,12.776918,7.887089,7.403539,6.124498,8.726381,10.211003,0
MB-0574,5.289979,5.256760,5.632154,8.251696,5.468715,7.302257,7.543094,6.034698,7.544609,7.883977,...,5.294679,6.104780,8.376762,12.260590,7.292711,6.796958,6.182236,6.591816,10.929505,0
MB-0185,5.323448,5.320953,6.145348,6.360203,5.362153,6.836189,8.198283,5.768222,6.223962,8.959244,...,5.122453,7.481652,5.678842,10.403710,8.727926,7.936050,8.077851,6.795019,8.356800,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MB-5453,8.952480,7.844750,5.898970,6.540355,7.804999,7.590927,8.715114,6.274398,6.851518,8.735822,...,5.223783,5.918056,5.341405,11.820328,7.195297,6.884461,9.491800,6.384867,9.002941,1
MB-5471,5.522757,6.525984,6.927027,5.999093,5.534790,7.782905,9.909053,5.801690,6.936829,10.185926,...,5.680012,6.458758,5.708246,11.235742,7.206908,7.246393,7.320614,6.566586,8.424036,0
MB-5127,5.641610,5.427059,5.419183,8.377215,5.558466,8.284851,7.518282,5.839152,8.538036,6.111794,...,5.381887,6.499600,5.969561,13.588736,7.669710,5.761209,6.639532,6.015033,12.141104,0
MB-4313,9.460065,5.598304,5.413246,5.791677,5.526026,6.354431,8.688709,6.038270,8.425811,8.073972,...,5.126337,7.095661,5.334263,13.110442,9.250626,5.779895,5.955864,8.177306,6.622884,0


In [57]:
X = cna_df.drop(columns=['event']) 
y = cna_df['event'] 

k=400
chi2_selector = SelectKBest(score_func=chi2, k=k)
X_selected = chi2_selector.fit_transform(X, y)
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_feature_indices = list(selected_feature_indices) + [X.shape[1]]
cna_df = cna_df[cna_df.columns[selected_feature_indices]]
cna_df

Unnamed: 0,ABCG1,ABI1,ACBD5,ACBD7,ADAMTS1,ADAMTS5,ADARB1,AGPAT3,AIRE,ANKRD26,...,WAC,WAC-AS1,WDR4,WRB,XKR4,YBEY,YME1L1,ZBTB21,ZNF295-AS1,event
0,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,0
1,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,0
2,4,4,4,4,4,4,3,3,3,4,...,4,4,4,4,4,5,4,4,4,0
3,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,6,4,4,4,4,1
4,4,5,5,5,4,4,4,4,4,5,...,5,5,4,4,4,4,5,4,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2973,5,5,5,6,5,5,5,5,5,5,...,5,5,5,5,6,5,5,5,5,1
2974,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,7,6,6,6,6,1
2975,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,1
2976,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,1


In [58]:
cna_comb = cna_comb[cna_comb.columns[selected_feature_indices]]
cna_comb

#Patient Identifier,ABCG1,ABI1,ACBD5,ACBD7,ADAMTS1,ADAMTS5,ADARB1,AGPAT3,AIRE,ANKRD26,...,WAC,WAC-AS1,WDR4,WRB,XKR4,YBEY,YME1L1,ZBTB21,ZNF295-AS1,event
MB-0000,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0
MB-0039,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,0
MB-0045,4.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,4.0,...,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,0
MB-0046,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,6.0,4.0,4.0,4.0,4.0,1
MB-0048,4.0,5.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,5.0,...,5.0,5.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MB-6020,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1
MB-6213,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,0
MB-6230,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,0
MB-7148,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1


## Split the data into train, val, test groups

In [59]:
train_val_data, test_data = train_test_split(gene_exp_df, test_size=0.15, stratify=gene_exp_df['event'], random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.125, stratify=train_val_data['event'], random_state=42)

In [60]:
def get_split_group(id_code):
    if id_code in list(train_data.index):
        return 'train'
    elif id_code in val_data.index:
        return 'val'
    elif id_code in test_data.index:
        return 'test'
    else:
        print('Found NaN!')

In [61]:
gene_exp_df['group'] = 'Missing'
gene_exp_df['group'] = [get_split_group(x) for x in list(gene_exp_df.index)]
gene_exp_df['group'].value_counts()

  gene_exp_df['group'] = 'Missing'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_exp_df['group'] = 'Missing'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_exp_df['group'] = [get_split_group(x) for x in list(gene_exp_df.index)]


group
train    2214
test      447
val       317
Name: count, dtype: int64

In [62]:
train_val_data, test_data = train_test_split(cna_df, test_size=0.15, stratify=cna_df['event'], random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.125, stratify=train_val_data['event'], random_state=42)

In [63]:
cna_df['group'] = 'Missing'
cna_df['group'] = [get_split_group(x) for x in list(cna_df.index)]
cna_df['group'].value_counts()

  cna_df['group'] = 'Missing'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cna_df['group'] = 'Missing'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cna_df['group'] = [get_split_group(x) for x in list(cna_df.index)]


group
train    2214
test      447
val       317
Name: count, dtype: int64

In [64]:
gene_exp_df.to_csv('preprocessed_gene_expression_v2.csv', index=False)
cna_df.to_csv('preprocessed_cna_v2.csv', index=False)

In [67]:
gene_exp_comb.to_csv('preprocessed_gene_comb_v2.csv', index=True)
cna_comb.to_csv('preprocessed_cna_comb_v2.csv', index=True)