In [177]:
import pandas as pd 
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import chi2
from skfda.preprocessing.dim_reduction import variable_selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [182]:
results_dir = Path('./results')
preprocessed_clinical_file = results_dir/'preprocessed_clinical_data.csv'

In [183]:
clinical_df = pd.read_csv(preprocessed_clinical_file)
clinical_df.index=clinical_df['#Patient Identifier']
clinical_df.head()

Unnamed: 0_level_0,#Patient Identifier,Lymph nodes examined positive,Nottingham prognostic index,Integrative Cluster,Age at Diagnosis,Pam50 + Claudin-low subtype,event,group
#Patient Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MB-5460,MB-5460,1.0,0.561567,10,0.452663,0,0,train
MB-0452,MB-0452,0.0,0.568284,10,0.437063,3,0,train
MB-0351,MB-0351,1.0,0.0,3,0.640129,3,0,train
MB-0543,MB-0543,1.0,0.756343,0,0.322351,0,0,val
MB-5013,MB-5013,1.0,0.567164,3,0.627084,2,0,train


In [184]:
gene_exp_df = pd.read_csv(results_dir/'gene_expression.csv')
gene_exp_df.shape

(20603, 1983)

In [185]:
cna_df = pd.read_csv(results_dir/'cna.csv')
cna_df.shape

(22544, 1983)

## Get the same samples as the preprocessed clinical data

In [186]:
gene_exp_patients = set(gene_exp_df.columns[2:])
cna_patients = set(cna_df.columns[2:])
clinical_patients = set(clinical_df.index)

common_patients = gene_exp_patients.intersection(cna_patients, clinical_patients)

gene_exp_columns_to_drop = [col for col in gene_exp_df.columns[2:] if col not in common_patients]
gene_exp_df = gene_exp_df.drop(columns=gene_exp_columns_to_drop)

cna_columns_to_drop = [col for col in cna_df.columns[2:] if col not in common_patients]
cna_df = cna_df.drop(columns=cna_columns_to_drop)

In [187]:
cna_df.shape

(22544, 993)

In [188]:
gene_exp_df.shape

(20603, 993)

In [189]:
gene_exp_df.head()

Unnamed: 0.1,Unnamed: 0,Hugo_Symbol,MB-0362,MB-0346,MB-0574,MB-0185,MB-0503,MB-0189,MB-0658,MB-0605,...,MB-5552,MB-6189,MB-6122,MB-6192,MB-4820,MB-5527,MB-5465,MB-5453,MB-5127,MB-4823
0,0,RERE,8.676978,9.653589,8.814855,8.736406,9.274265,8.922748,8.293003,8.576128,...,9.072979,9.522033,8.756024,8.804947,9.991215,9.595923,8.131637,9.606915,8.858622,9.902776
1,1,RNF165,6.075331,6.687887,5.62874,6.392422,5.908698,6.461321,6.725088,6.058684,...,5.887199,7.823506,7.101087,5.601472,7.10316,6.418987,9.101942,7.427494,6.55045,7.338318
2,2,PHF7,5.83827,5.600876,5.849428,5.542133,5.964661,5.536764,5.659488,6.034527,...,5.790621,5.689935,5.653481,5.922894,6.181493,5.992153,5.644587,5.927409,5.936371,5.875759
3,3,CIDEA,6.397503,5.246319,6.116868,5.184098,7.828171,7.196945,7.176816,8.255079,...,5.736294,5.736615,6.221834,6.488136,5.612846,6.24116,5.611189,5.927031,5.963092,5.596873
4,4,TENT2,7.906217,8.267256,9.206376,8.162845,8.706646,8.385897,8.177391,8.098588,...,8.094144,8.255405,8.172465,8.534527,8.203819,8.813103,7.798269,8.520545,9.320207,8.436732


## Transpose and remove some columns for future work

In [190]:
gene_exp_df = gene_exp_df.reset_index(drop=True).drop(gene_exp_df.columns[0],axis=1)
gene_exp_df.rename(columns={gene_exp_df.columns[0]: '#Patient Identifier' }, inplace = True)
gene_exp_df = gene_exp_df.set_index('#Patient Identifier').transpose()
gene_exp_df.head()

#Patient Identifier,RERE,RNF165,PHF7,CIDEA,TENT2,SLC17A3,SDS,ATP6V1C2,F3,FAM71C,...,SBF2-AS1,VN1R4,TRPV5,UGGT1,CR590356,VPS72,CSMD3,CC2D1A,IGSF9,FAM71A
MB-0362,8.676978,6.075331,5.83827,6.397503,7.906217,5.702379,6.930741,5.332863,5.275676,5.443896,...,5.335488,5.309543,5.449278,7.688492,5.783457,8.084979,5.161796,6.353215,7.304643,5.049591
MB-0346,9.653589,6.687887,5.600876,5.246319,8.267256,5.521794,6.141689,7.563477,5.376381,5.319857,...,5.471839,5.579198,5.657015,7.804165,6.487002,8.349115,5.197392,6.132355,7.933324,5.31679
MB-0574,8.814855,5.62874,5.849428,6.116868,9.206376,5.43913,6.430102,5.398675,5.409761,5.512298,...,5.520733,5.419803,5.450339,7.744562,6.032705,8.310019,5.780062,6.424048,6.903654,5.22713
MB-0185,8.736406,6.392422,5.542133,5.184098,8.162845,5.464326,6.105427,5.026018,5.33858,5.430874,...,5.542151,5.340978,5.616935,7.613439,5.67884,8.161977,5.327687,6.252966,6.848395,5.057761
MB-0503,9.274265,5.908698,5.964661,7.828171,8.706646,5.417484,6.684893,5.266674,5.490693,5.363378,...,5.492591,5.324644,5.329602,7.701394,6.073769,8.137014,5.498185,6.214301,6.839417,5.027476


In [191]:
cna_df.head()

Unnamed: 0.1,Unnamed: 0,Hugo_Symbol,MB-0046,MB-0053,MB-0079,MB-0083,MB-0093,MB-0095,MB-0097,MB-0100,...,MB-5255,MB-5325,MB-5349,MB-5387,MB-5394,MB-5460,MB-5585,MB-5625,MB-6020,MB-7148
0,0,A1BG,0,0,-1,-1,-1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,A1BG-AS1,0,0,-1,-1,-1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2,A1CF,0,0,2,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
3,3,A2M,-1,0,1,0,-1,0,1,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,A2M-AS1,-1,0,1,0,-1,0,1,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [192]:
cna_df = cna_df.reset_index(drop=True).drop(cna_df.columns[0],axis=1)
cna_df.rename(columns={cna_df.columns[0]: '#Patient Identifier' }, inplace = True)
cna_df = cna_df.set_index('#Patient Identifier').transpose()+1 #Adding 1 to avoid negative values
cna_df.head()

#Patient Identifier,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
MB-0046,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
MB-0053,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,1.0,1.0,0.0,0.0,2.0,2.0,2.0,1.0,1.0,1.0
MB-0079,0.0,0.0,3.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,...,2.0,3.0,3.0,3.0,1.0,1.0,1.0,2.0,0.0,1.0
MB-0083,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
MB-0093,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0


In [193]:
# Needed to remove negative values in cna data for future work by setting max neg value as 0
min_values = cna_df.min(axis=1)
cna_df_non_negative = cna_df.sub(min_values, axis=0).clip(lower=0)
cna_df = cna_df_non_negative.add(abs(min_values), axis=0)
cna_df.head()

#Patient Identifier,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
MB-0046,3.0,3.0,3.0,2.0,2.0,2.0,2.0,3.0,2.0,3.0,...,3.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0
MB-0053,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,...,3.0,3.0,2.0,2.0,4.0,4.0,4.0,3.0,3.0,3.0
MB-0079,2.0,2.0,5.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,...,4.0,5.0,5.0,5.0,3.0,3.0,3.0,4.0,2.0,3.0
MB-0083,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
MB-0093,2.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,2.0,2.0,3.0,2.0,3.0


In [194]:
gene_exp_df['event'] = clinical_df['event']
gene_exp_df['group'] = clinical_df['group']
gene_exp_df.head()

#Patient Identifier,RERE,RNF165,PHF7,CIDEA,TENT2,SLC17A3,SDS,ATP6V1C2,F3,FAM71C,...,TRPV5,UGGT1,CR590356,VPS72,CSMD3,CC2D1A,IGSF9,FAM71A,event,group
MB-0362,8.676978,6.075331,5.83827,6.397503,7.906217,5.702379,6.930741,5.332863,5.275676,5.443896,...,5.449278,7.688492,5.783457,8.084979,5.161796,6.353215,7.304643,5.049591,1,val
MB-0346,9.653589,6.687887,5.600876,5.246319,8.267256,5.521794,6.141689,7.563477,5.376381,5.319857,...,5.657015,7.804165,6.487002,8.349115,5.197392,6.132355,7.933324,5.31679,1,test
MB-0574,8.814855,5.62874,5.849428,6.116868,9.206376,5.43913,6.430102,5.398675,5.409761,5.512298,...,5.450339,7.744562,6.032705,8.310019,5.780062,6.424048,6.903654,5.22713,0,train
MB-0185,8.736406,6.392422,5.542133,5.184098,8.162845,5.464326,6.105427,5.026018,5.33858,5.430874,...,5.616935,7.613439,5.67884,8.161977,5.327687,6.252966,6.848395,5.057761,1,test
MB-0503,9.274265,5.908698,5.964661,7.828171,8.706646,5.417484,6.684893,5.266674,5.490693,5.363378,...,5.329602,7.701394,6.073769,8.137014,5.498185,6.214301,6.839417,5.027476,0,train


In [195]:
cna_df['event'] = clinical_df['event']
cna_df['group'] = clinical_df['group']
cna_df.head()

#Patient Identifier,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,event,group
MB-0046,3.0,3.0,3.0,2.0,2.0,2.0,2.0,3.0,2.0,3.0,...,2.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,1,val
MB-0053,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,...,2.0,2.0,4.0,4.0,4.0,3.0,3.0,3.0,0,train
MB-0079,2.0,2.0,5.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,...,5.0,5.0,3.0,3.0,3.0,4.0,2.0,3.0,1,train
MB-0083,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0,train
MB-0093,2.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,...,3.0,3.0,3.0,2.0,2.0,3.0,2.0,3.0,0,train


## Imputing missing rows using median values

In [196]:
X = gene_exp_df.drop(columns=['event', 'group']) 
y = gene_exp_df['event'] 
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
gene_exp_df = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)
gene_exp_df['event'] = clinical_df['event']
gene_exp_df['group'] = clinical_df['group']
gene_exp_df.head()

#Patient Identifier,RERE,RNF165,PHF7,CIDEA,TENT2,SLC17A3,SDS,ATP6V1C2,F3,FAM71C,...,TRPV5,UGGT1,CR590356,VPS72,CSMD3,CC2D1A,IGSF9,FAM71A,event,group
MB-0362,8.676978,6.075331,5.83827,6.397503,7.906217,5.702379,6.930741,5.332863,5.275676,5.443896,...,5.449278,7.688492,5.783457,8.084979,5.161796,6.353215,7.304643,5.049591,1,val
MB-0346,9.653589,6.687887,5.600876,5.246319,8.267256,5.521794,6.141689,7.563477,5.376381,5.319857,...,5.657015,7.804165,6.487002,8.349115,5.197392,6.132355,7.933324,5.31679,1,test
MB-0574,8.814855,5.62874,5.849428,6.116868,9.206376,5.43913,6.430102,5.398675,5.409761,5.512298,...,5.450339,7.744562,6.032705,8.310019,5.780062,6.424048,6.903654,5.22713,0,train
MB-0185,8.736406,6.392422,5.542133,5.184098,8.162845,5.464326,6.105427,5.026018,5.33858,5.430874,...,5.616935,7.613439,5.67884,8.161977,5.327687,6.252966,6.848395,5.057761,1,test
MB-0503,9.274265,5.908698,5.964661,7.828171,8.706646,5.417484,6.684893,5.266674,5.490693,5.363378,...,5.329602,7.701394,6.073769,8.137014,5.498185,6.214301,6.839417,5.027476,0,train


In [197]:
X = cna_df.drop(columns=['event', 'group']) 
y = cna_df['event'] 
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
cna_df = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)
cna_df['event'] = clinical_df['event']
cna_df['group'] = clinical_df['group']
cna_df.head()

#Patient Identifier,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,event,group
MB-0046,3.0,3.0,3.0,2.0,2.0,2.0,2.0,3.0,2.0,3.0,...,2.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,1,val
MB-0053,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,...,2.0,2.0,4.0,4.0,4.0,3.0,3.0,3.0,0,train
MB-0079,2.0,2.0,5.0,4.0,4.0,4.0,4.0,3.0,3.0,3.0,...,5.0,5.0,3.0,3.0,3.0,4.0,2.0,3.0,1,train
MB-0083,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0,train
MB-0093,2.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,...,3.0,3.0,3.0,2.0,2.0,3.0,2.0,3.0,0,train


## Selecting top 400 features using Chi-Square method

In [198]:
X = gene_exp_df.drop(columns=['event', 'group']) 
y = gene_exp_df['event'] 

k=400
chi2_selector = SelectKBest(score_func=chi2, k=k)
X_selected = chi2_selector.fit_transform(X, y)
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_features = X.columns[selected_feature_indices]
selected_features

Index(['TTYH1', 'GAL', 'GLRB', 'AKR7A3', 'FGFR4', 'CGNL1', 'ASS1', 'SUSD2',
       'CDCA5', 'TMPRSS2',
       ...
       'IGFBP4', 'CEP55', 'NELL2', 'HMGB3', 'MCM10', 'TFF3', 'PCDHB2', 'CDH3',
       'SLC39A6', 'SHISA2'],
      dtype='object', name='#Patient Identifier', length=400)

In [199]:
gene_exp_df = gene_exp_df[selected_features]
gene_exp_df['event'] = clinical_df['event']
gene_exp_df['group'] = clinical_df['group']
gene_exp_df.head()

#Patient Identifier,TTYH1,GAL,GLRB,AKR7A3,FGFR4,CGNL1,ASS1,SUSD2,CDCA5,TMPRSS2,...,NELL2,HMGB3,MCM10,TFF3,PCDHB2,CDH3,SLC39A6,SHISA2,event,group
MB-0362,5.472703,5.264142,7.210677,9.389657,5.643984,10.358713,6.672313,7.85761,9.061243,6.212845,...,11.347538,7.740359,6.467056,14.28249,7.740255,6.660659,10.567694,9.55439,1,val
MB-0346,6.029769,7.86945,5.660398,12.137391,10.917248,9.774449,7.778699,11.863747,10.22192,7.127564,...,6.30154,8.258731,7.15849,13.223147,7.905112,8.200274,7.4207,8.356079,1,test
MB-0574,5.565192,5.632154,8.251696,7.543094,6.034698,9.189665,7.883977,6.861732,7.581227,5.809794,...,6.586016,7.605322,6.10478,12.26059,6.796958,6.182236,10.929505,6.965049,0,train
MB-0185,5.465966,6.145348,6.360203,8.198283,5.768222,7.313842,8.959244,9.089724,10.213103,6.366633,...,6.026236,7.991045,7.481652,10.40371,7.93605,8.077851,8.3568,10.565691,1,test
MB-0503,5.320194,5.670347,7.908132,10.133659,5.682765,8.943139,7.599522,9.152212,7.655903,6.964962,...,7.278149,6.002129,5.708727,13.730393,6.453811,8.043682,10.209835,9.687987,0,train


In [200]:
X = cna_df.drop(columns=['event', 'group']) 
y = cna_df['event'] 

k=400
chi2_selector = SelectKBest(score_func=chi2, k=k)
X_selected = chi2_selector.fit_transform(X, y)
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_features = X.columns[selected_feature_indices]
selected_features

Index(['AARD', 'ABCC11', 'ABCC12', 'ABHD16B', 'ABRA', 'ADCY7', 'ADRM1', 'AIM1',
       'AKTIP', 'AMFR',
       ...
       'ZFP64', 'ZFPM2', 'ZGPAT', 'ZHX1', 'ZHX2', 'ZNF217', 'ZNF423',
       'ZNF512B', 'ZNF572', 'ZNF706'],
      dtype='object', name='#Patient Identifier', length=400)

In [201]:
cna_df = cna_df[selected_features]
cna_df['event'] = clinical_df['event']
cna_df['group'] = clinical_df['group']
cna_df.head()

#Patient Identifier,AARD,ABCC11,ABCC12,ABHD16B,ABRA,ADCY7,ADRM1,AIM1,AKTIP,AMFR,...,ZGPAT,ZHX1,ZHX2,ZNF217,ZNF423,ZNF512B,ZNF572,ZNF706,event,group
MB-0046,3.0,2.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,...,3.0,3.0,3.0,4.0,2.0,3.0,3.0,3.0,1,val
MB-0053,4.0,2.0,2.0,4.0,4.0,2.0,5.0,3.0,2.0,2.0,...,4.0,4.0,4.0,5.0,2.0,4.0,4.0,3.0,0,train
MB-0079,5.0,3.0,3.0,3.0,5.0,3.0,3.0,2.0,3.0,3.0,...,3.0,5.0,5.0,3.0,3.0,3.0,5.0,5.0,1,train
MB-0083,1.0,1.0,1.0,2.0,1.0,2.0,2.0,0.0,1.0,0.0,...,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,0,train
MB-0093,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,0,train


## Selecting top 20 features using mRMR

In [202]:
n = 20

X = gene_exp_df.drop(columns=['event', 'group']) 
y = gene_exp_df['event'] 

mi_scores = mutual_info_classif(X, y)
selector = SelectKBest(score_func=mutual_info_classif, k=n)
X_selected = selector.fit_transform(X, y)

selected_feature_indices = selector.get_support(indices=True)

selected_features = X.columns[selected_feature_indices]

selected_features

Index(['FUT3', 'STC2', 'LAD1', 'DRC3', 'KIF2C', 'NUSAP1', 'TMSB15A', 'BUB1',
       'C9orf116', 'SOX11', 'KRT7', 'NCCRP1', 'CRIPAK', 'LRP8', 'SLC5A6',
       'KIFC1', 'CD24', 'S100A8', 'SERPINA3', 'NAT1'],
      dtype='object', name='#Patient Identifier')

In [203]:
gene_exp_df = gene_exp_df[selected_features]
gene_exp_df['event'] = clinical_df['event']
gene_exp_df['group'] = clinical_df['group']
gene_exp_df.head()

#Patient Identifier,FUT3,STC2,LAD1,DRC3,KIF2C,NUSAP1,TMSB15A,BUB1,C9orf116,SOX11,...,CRIPAK,LRP8,SLC5A6,KIFC1,CD24,S100A8,SERPINA3,NAT1,event,group
MB-0362,5.601502,10.154267,8.438985,6.827285,7.068882,8.615072,6.796801,6.673336,9.155867,6.2646,...,7.490095,5.843003,8.755151,7.088528,8.861125,7.778289,13.582444,10.628267,1,val
MB-0346,7.821926,8.56781,10.847865,5.192595,7.489031,9.130215,7.161921,7.032191,8.046466,8.916459,...,6.808691,6.804273,10.934018,7.144363,10.336827,9.49108,10.20771,6.639451,1,test
MB-0574,5.685729,13.261676,7.941934,6.442102,6.365602,7.341534,7.957777,5.785531,9.073771,5.949662,...,7.859506,6.12392,8.685687,6.675911,10.607311,6.465733,13.731758,9.170203,0,train
MB-0185,5.589487,8.678581,7.765381,5.690393,7.337626,8.100728,8.592696,7.328139,7.942547,6.460894,...,6.934002,6.771014,8.944125,7.522679,11.671487,7.670585,10.922716,6.64577,1,test
MB-0503,5.366097,12.119527,6.940889,6.465758,6.443545,7.25694,6.106012,5.886902,8.951667,5.638091,...,7.928219,5.462626,9.846843,6.575966,9.610101,6.322715,12.564908,8.658371,0,train


In [204]:
n = 20

X = cna_df.drop(columns=['event', 'group']) 
y = cna_df['event'] 

mi_scores = mutual_info_classif(X, y)
selector = SelectKBest(score_func=mutual_info_classif, k=n)
X_selected = selector.fit_transform(X, y)

selected_feature_indices = selector.get_support(indices=True)

selected_features = X.columns[selected_feature_indices]

selected_features

Index(['APOOP5', 'BAALC', 'CCL17', 'COL20A1', 'COLEC10', 'GNAO1', 'HERPUD1',
       'LINC00919', 'LRP12', 'MRGBP', 'MT3', 'MYC', 'NDUFB9', 'ORC6',
       'SLC2A4RG', 'SOGA3', 'TAAR5', 'LINC02912', 'TPD52L1', 'ZHX1'],
      dtype='object', name='#Patient Identifier')

In [205]:
cna_df = cna_df[selected_features]
cna_df['event'] = clinical_df['event']
cna_df['group'] = clinical_df['group']
cna_df.head()

#Patient Identifier,APOOP5,BAALC,CCL17,COL20A1,COLEC10,GNAO1,HERPUD1,LINC00919,LRP12,MRGBP,...,NDUFB9,ORC6,SLC2A4RG,SOGA3,TAAR5,LINC02912,TPD52L1,ZHX1,event,group
MB-0046,2.0,3.0,2.0,3.0,3.0,2.0,2.0,2.0,3.0,3.0,...,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,1,val
MB-0053,5.0,4.0,2.0,4.0,4.0,2.0,2.0,2.0,4.0,5.0,...,4.0,2.0,4.0,3.0,3.0,4.0,3.0,4.0,0,train
MB-0079,3.0,5.0,2.0,3.0,5.0,3.0,3.0,3.0,5.0,3.0,...,5.0,3.0,3.0,2.0,2.0,5.0,2.0,5.0,1,train
MB-0083,0.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,1.0,2.0,...,1.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,0,train
MB-0093,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,2.0,2.0,3.0,2.0,3.0,0,train


In [206]:
gene_exp_df.to_csv('preprocessed_gene_expression.csv')
cna_df.to_csv('preprocessed_cna.csv')