# Tetrad Data

In [8]:
%pwd
%matplotlib inline

import pandas as pd
import numpy as np
import csv
from imblearn.over_sampling import SMOTE

# Random seed; passed as `random_state` to SMOTE further down so the oversampling is reproducible
r = 42

## Load Data

In [49]:
# Load the expression matrix and transpose it so that rows are samples and
# columns are genes/probes (later cells select columns by gene name).
df = pd.read_csv('Output/expression_matrix.txt', index_col=0).T

# Load the per-sample diagnosis labels as a Series indexed by the first column.
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single remaining column afterwards is the supported equivalent
# and keeps the Series name (1) unchanged.
dfanno = pd.read_csv('Output/metadata.txt', index_col=0, header=None).squeeze('columns')

# Encode the diagnosis strings as integers in one pass. `map` produces an
# integer Series directly instead of relying on `replace` downcasting an
# object column, and the explicit check surfaces unexpected labels here
# rather than letting them flow into the oversampling step.
diagnosis_codes = {'AD': 0, 'LMCI': 1, 'EMCI': 2, 'CN': 3}
unknown_labels = dfanno[~dfanno.isin(list(diagnosis_codes))]
if not unknown_labels.empty:
    raise ValueError(
        "Unexpected diagnosis labels in metadata: "
        f"{sorted(unknown_labels.astype(str).unique())}"
    )
dfanno = dfanno.map(diagnosis_codes)

# Load the ranked gene list as a Series (first line of the file is the header).
genes = pd.read_csv('Output/100_ranked_genes.txt', header=0).squeeze('columns')
genes

0      SUMF1 / 11720732_a_at
1      MYEOV / 11755024_a_at
2       SMOX / 11741507_a_at
3      TMCC2 / 11721929_x_at
4        BMF / 11737438_a_at
               ...          
95     SRP14 / 11718663_s_at
96    ZNF271 / 11750558_s_at
97     LMOD2 / 11736713_a_at
98     PVRL3 / 11758970_a_at
99    ZNF471 / 11751506_a_at
Name: 0, Length: 100, dtype: object

## Select Top 100 Genes

In [50]:
# Keep only the ranked gene columns (in ranked order) and store every value as float
df = df.loc[:, list(genes)].astype(float)
df

Unnamed: 0,SUMF1 / 11720732_a_at,MYEOV / 11755024_a_at,SMOX / 11741507_a_at,TMCC2 / 11721929_x_at,BMF / 11737438_a_at,BCL2L11 / 11763147_x_at,AKR1C2 / 11738244_s_at,SLC6A13 / 11759654_a_at,KANSL3 / 11750658_x_at,BPIFA4P / 11750322_at,...,PPARD / 11735395_at,CYP4A11 || CYP4A22 / 11741459_s_at,TRIM10 / 11758611_s_at,TCF7L2 / 11747829_x_at,AGO3 / 11756925_a_at,SRP14 / 11718663_s_at,ZNF271 / 11750558_s_at,LMOD2 / 11736713_a_at,PVRL3 / 11758970_a_at,ZNF471 / 11751506_a_at
116_S_1249,-1.277643,-0.831099,0.807886,-0.941795,-0.573300,-0.448990,1.874984,0.166623,-0.528348,0.453522,...,-2.318080,-0.278232,0.191825,-1.675184,1.761589,0.500874,0.786248,0.395471,0.045678,1.078144
037_S_4410,-1.347309,-1.193127,-2.097621,-0.679543,-2.129096,-0.515147,-0.343558,-0.404871,-1.711251,-0.203923,...,-1.070732,-1.589246,-1.900826,-0.001264,3.326655,0.612513,1.977041,-0.710868,0.264972,-0.886165
006_S_4153,-0.731320,-0.360772,-0.864382,-0.737287,-2.221913,0.571223,-0.507894,0.752405,-0.198381,0.164246,...,0.726917,0.156812,-0.791592,-0.921778,0.505739,1.238660,-0.037046,1.185712,1.603651,-0.701289
116_S_1232,0.698655,0.369473,1.119981,0.930056,0.067581,-1.660709,-0.407466,-0.258426,0.474006,-1.038878,...,-1.176206,1.314973,1.524941,0.259312,0.798680,0.758128,-0.352046,2.050330,-0.144160,0.746907
099_S_4205,-0.243661,-0.654726,-1.440825,-0.643454,-0.714736,0.069822,2.285825,-0.865638,-0.297993,-0.795624,...,-1.139519,-2.171266,-1.761131,0.318792,0.326719,-0.159251,0.013067,0.953288,0.107866,0.600547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
009_S_2381,-0.390326,-0.651632,0.834670,1.567640,2.273098,3.377678,-1.347836,-0.362009,0.442877,-0.341986,...,-0.465401,1.632438,0.943958,-0.570567,-0.009621,0.563974,-0.242273,-0.562116,-1.256998,-0.169770
053_S_4557,-0.995315,1.152319,0.339744,-0.248873,1.720614,0.188208,-2.215167,0.009462,-1.350154,-0.881091,...,0.172031,-0.372296,1.017968,-1.428769,0.337569,0.850352,0.590566,0.990476,-0.687487,0.007403
073_S_4300,-0.082331,2.009429,1.281851,-0.268121,-0.560041,1.539206,0.240749,-0.512026,-0.291768,-0.177625,...,0.153687,0.832896,1.332514,0.174342,1.748027,0.364966,1.027270,0.878913,-0.550018,-2.018531
041_S_4014,0.566658,-0.382432,-1.105440,-0.330677,2.175861,0.167316,-0.133572,-0.136983,1.389199,1.761838,...,0.227061,0.932839,0.065081,-1.239002,-1.620798,1.374568,-0.449886,-0.729462,-0.913327,-0.239099


## SMOTE Oversampling

In [51]:
# Feature names (columns of the expression matrix) and the actual labels
feature_list = list(df.columns)

# `fit_resample` pairs rows of X with entries of y purely by position, so make
# sure the labels follow the same sample order as the rows of `df` first.
if not dfanno.index.equals(df.index):
    aligned_labels = dfanno.reindex(df.index)
    if aligned_labels.isna().any():
        raise ValueError(
            "Some samples in the expression matrix have no diagnosis label in the metadata"
        )
    dfanno = aligned_labels
print(dfanno.value_counts())

# Oversample the minority classes up to the size of the largest class.
# `n_jobs` is intentionally not passed: it was deprecated in imbalanced-learn
# 0.10 (slated for removal) and only affected parallelism, not the result.
df, dfanno = SMOTE(random_state=r).fit_resample(df, dfanno)
print(dfanno.value_counts())

3    260
1    226
2    215
0     43
Name: 1, dtype: int64
3    260
2    260
1    260
0    260
Name: 1, dtype: int64


## Merge Data

In [61]:
# Append the labels to the feature matrix as a "Diagnosis" column. Naming the
# Series explicitly avoids depending on the name it inherited from the metadata file.
tetdf = pd.concat([df, dfanno.rename("Diagnosis")], axis=1)
# Replace the " / " separator in the column names with "_"; regex=False keeps
# this a literal replacement regardless of the installed pandas default.
# NOTE(review): names such as "CYP4A11 || CYP4A22_..." still contain spaces and
# "|" after this step - confirm the downstream Tetrad loader accepts them.
tetdf.columns = tetdf.columns.str.replace(' / ', '_', regex=False)

In [62]:
# Display the feature matrix as it stands after SMOTE oversampling
df

Unnamed: 0,SUMF1 / 11720732_a_at,MYEOV / 11755024_a_at,SMOX / 11741507_a_at,TMCC2 / 11721929_x_at,BMF / 11737438_a_at,BCL2L11 / 11763147_x_at,AKR1C2 / 11738244_s_at,SLC6A13 / 11759654_a_at,KANSL3 / 11750658_x_at,BPIFA4P / 11750322_at,...,PPARD / 11735395_at,CYP4A11 || CYP4A22 / 11741459_s_at,TRIM10 / 11758611_s_at,TCF7L2 / 11747829_x_at,AGO3 / 11756925_a_at,SRP14 / 11718663_s_at,ZNF271 / 11750558_s_at,LMOD2 / 11736713_a_at,PVRL3 / 11758970_a_at,ZNF471 / 11751506_a_at
0,-1.277643,-0.831099,0.807886,-0.941795,-0.573300,-0.448990,1.874984,0.166623,-0.528348,0.453522,...,-2.318080,-0.278232,0.191825,-1.675184,1.761589,0.500874,0.786248,0.395471,0.045678,1.078144
1,-1.347309,-1.193127,-2.097621,-0.679543,-2.129096,-0.515147,-0.343558,-0.404871,-1.711251,-0.203923,...,-1.070732,-1.589246,-1.900826,-0.001264,3.326655,0.612513,1.977041,-0.710868,0.264972,-0.886165
2,-0.731320,-0.360772,-0.864382,-0.737287,-2.221913,0.571223,-0.507894,0.752405,-0.198381,0.164246,...,0.726917,0.156812,-0.791592,-0.921778,0.505739,1.238660,-0.037046,1.185712,1.603651,-0.701289
3,0.698655,0.369473,1.119981,0.930056,0.067581,-1.660709,-0.407466,-0.258426,0.474006,-1.038878,...,-1.176206,1.314973,1.524941,0.259312,0.798680,0.758128,-0.352046,2.050330,-0.144160,0.746907
4,-0.243661,-0.654726,-1.440825,-0.643454,-0.714736,0.069822,2.285825,-0.865638,-0.297993,-0.795624,...,-1.139519,-2.171266,-1.761131,0.318792,0.326719,-0.159251,0.013067,0.953288,0.107866,0.600547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,-0.513046,0.932485,1.444905,-0.623183,0.283435,0.213393,0.372722,1.985771,0.494685,0.575905,...,1.320209,-0.015292,0.095459,0.336434,0.268781,0.416433,-0.537021,0.535129,-0.266066,-0.664124
1036,1.347022,0.548939,0.651188,0.351867,0.163939,-1.338689,-0.562055,0.268027,1.273523,0.526398,...,-1.006822,0.703510,0.585285,-0.255360,-0.402383,-1.290057,-0.639967,0.204608,2.770609,1.256394
1037,0.576873,0.701692,-0.133977,-0.342164,-1.517907,0.347404,-0.053294,-0.902071,-0.740472,0.015679,...,-0.385098,-1.411396,0.036052,0.298300,-0.455373,-0.028432,-0.204574,-0.335539,-0.330697,1.699875
1038,0.674818,-0.593522,0.762048,0.391041,0.023577,0.344402,-0.205673,0.749034,-0.341878,-0.093041,...,0.442180,-0.003286,0.349556,-0.670281,0.279679,0.388060,1.383316,-0.991345,-0.133772,-0.156115


In [63]:
# Display the integer-encoded diagnosis labels after oversampling
dfanno

0       3
1       3
2       0
3       3
4       2
       ..
1035    2
1036    2
1037    2
1038    2
1039    2
Name: 1, Length: 1040, dtype: int64

In [64]:
# Display the merged table: feature columns plus the Diagnosis column
tetdf

Unnamed: 0,SUMF1_11720732_a_at,MYEOV_11755024_a_at,SMOX_11741507_a_at,TMCC2_11721929_x_at,BMF_11737438_a_at,BCL2L11_11763147_x_at,AKR1C2_11738244_s_at,SLC6A13_11759654_a_at,KANSL3_11750658_x_at,BPIFA4P_11750322_at,...,CYP4A11 || CYP4A22_11741459_s_at,TRIM10_11758611_s_at,TCF7L2_11747829_x_at,AGO3_11756925_a_at,SRP14_11718663_s_at,ZNF271_11750558_s_at,LMOD2_11736713_a_at,PVRL3_11758970_a_at,ZNF471_11751506_a_at,Diagnosis
0,-1.277643,-0.831099,0.807886,-0.941795,-0.573300,-0.448990,1.874984,0.166623,-0.528348,0.453522,...,-0.278232,0.191825,-1.675184,1.761589,0.500874,0.786248,0.395471,0.045678,1.078144,3
1,-1.347309,-1.193127,-2.097621,-0.679543,-2.129096,-0.515147,-0.343558,-0.404871,-1.711251,-0.203923,...,-1.589246,-1.900826,-0.001264,3.326655,0.612513,1.977041,-0.710868,0.264972,-0.886165,3
2,-0.731320,-0.360772,-0.864382,-0.737287,-2.221913,0.571223,-0.507894,0.752405,-0.198381,0.164246,...,0.156812,-0.791592,-0.921778,0.505739,1.238660,-0.037046,1.185712,1.603651,-0.701289,0
3,0.698655,0.369473,1.119981,0.930056,0.067581,-1.660709,-0.407466,-0.258426,0.474006,-1.038878,...,1.314973,1.524941,0.259312,0.798680,0.758128,-0.352046,2.050330,-0.144160,0.746907,3
4,-0.243661,-0.654726,-1.440825,-0.643454,-0.714736,0.069822,2.285825,-0.865638,-0.297993,-0.795624,...,-2.171266,-1.761131,0.318792,0.326719,-0.159251,0.013067,0.953288,0.107866,0.600547,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,-0.513046,0.932485,1.444905,-0.623183,0.283435,0.213393,0.372722,1.985771,0.494685,0.575905,...,-0.015292,0.095459,0.336434,0.268781,0.416433,-0.537021,0.535129,-0.266066,-0.664124,2
1036,1.347022,0.548939,0.651188,0.351867,0.163939,-1.338689,-0.562055,0.268027,1.273523,0.526398,...,0.703510,0.585285,-0.255360,-0.402383,-1.290057,-0.639967,0.204608,2.770609,1.256394,2
1037,0.576873,0.701692,-0.133977,-0.342164,-1.517907,0.347404,-0.053294,-0.902071,-0.740472,0.015679,...,-1.411396,0.036052,0.298300,-0.455373,-0.028432,-0.204574,-0.335539,-0.330697,1.699875,2
1038,0.674818,-0.593522,0.762048,0.391041,0.023577,0.344402,-0.205673,0.749034,-0.341878,-0.093041,...,-0.003286,0.349556,-0.670281,0.279679,0.388060,1.383316,-0.991345,-0.133772,-0.156115,2


## Save Data

In [65]:
# Write the merged table to CSV without the row index; fields are quoted only
# where the CSV format requires it.
tetdf.to_csv(
    'Tetrad/TetradData.csv',
    index=False,
    quoting=csv.QUOTE_MINIMAL,
)