# GSEA DATA

In [30]:
%pwd
%matplotlib inline

import pandas as pd
import numpy as np
import csv
import pickle
from imblearn.over_sampling import SMOTE

# Random seed; passed as `random_state` to SMOTE in the oversampling cell.
r = 42

## Load Data

In [18]:
# Load the expression matrix and transpose it so that rows are samples and
# columns are gene / probe identifiers.
df = pd.read_csv('Output/expression_matrix.txt', index_col=0).T

# Load the per-sample labels as a Series indexed by the file's first column.
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0, so the
# one-column frame is collapsed explicitly with `DataFrame.squeeze("columns")`.
dfanno = pd.read_csv('Output/metadata.txt', index_col=0, header=None).squeeze("columns")

# Load the ranked gene / probe identifiers (headerless, one column) as a Series.
genes = pd.read_csv('Output/2000_ranked_genes.txt', header=None).squeeze("columns")
genes

0          SMOX / 11741507_a_at
1         TMCC2 / 11721929_x_at
2         RBM38 / 11715594_a_at
3       CXORF22 / 11758541_s_at
4         SUMF1 / 11720732_a_at
                 ...           
1995       MEST / 11758268_s_at
1996     MRPS10 / 11758291_s_at
1997       GJB6 / 11733922_s_at
1998      CYTH3 / 11757447_s_at
1999       NOS2 / 11752035_a_at
Name: 0, Length: 2000, dtype: object

## Select Top 2000 Genes

In [19]:
# Keep only the ranked gene columns (in ranked order) and cast the values to float.
df = df[genes].astype(float)
df

Unnamed: 0,SMOX / 11741507_a_at,TMCC2 / 11721929_x_at,RBM38 / 11715594_a_at,CXORF22 / 11758541_s_at,SUMF1 / 11720732_a_at,TRIM10 / 11758611_s_at,SLC6A13 / 11759654_a_at,MKRN3 / 11736971_at,OMG / 11730727_at,RIPK2 / 11724236_a_at,...,SMG8 / 11720182_at,ARHGEF2 / 11749038_a_at,OR52E8 / 11738498_at,ERF / 11718263_a_at,HOXB1 / 11759812_at,MEST / 11758268_s_at,MRPS10 / 11758291_s_at,GJB6 / 11733922_s_at,CYTH3 / 11757447_s_at,NOS2 / 11752035_a_at
116_S_1249,0.807886,-0.941795,-0.165369,-1.571180,-1.277643,0.191825,0.166623,0.064653,2.393200,1.705883,...,0.704779,-1.363409,0.568343,-1.845089,0.188231,-0.341429,0.406144,0.614945,0.618424,-1.625951
037_S_4410,-2.097621,-0.679543,-1.102886,1.949596,-1.347309,-1.900826,-0.404871,1.290969,1.777766,1.349050,...,0.788917,-1.117208,1.997113,-1.152794,-0.472398,-0.581685,0.779213,0.934333,1.302728,-1.972464
006_S_4153,-0.864382,-0.737287,-0.249376,0.076363,-0.731320,-0.791592,0.752405,0.216220,-0.175261,-0.643264,...,-0.664376,0.578845,-0.042758,-0.239555,0.083671,-0.201745,1.372198,-0.981990,1.194465,-0.361822
116_S_1232,1.119981,0.930056,1.533250,1.580968,0.698655,1.524941,-0.258426,0.014131,-1.214029,-0.666392,...,0.417945,0.152096,2.694284,-0.077529,1.804158,1.183916,-1.247145,-0.554336,-0.235424,-0.483743
099_S_4205,-1.440825,-0.643454,-1.400271,-1.112275,-0.243661,-1.761131,-0.865638,2.214151,2.581744,-0.035327,...,0.207600,0.288875,-0.998140,-0.013700,0.121693,0.396101,-0.587400,-1.144391,1.541724,-0.926509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
009_S_2381,0.834670,1.567640,1.212343,-0.848969,-0.390326,0.943958,-0.362009,-0.119065,1.414909,-0.692824,...,-4.179049,0.119269,-0.628037,0.452739,1.162540,-0.609621,0.900952,0.322625,-0.163929,1.178234
053_S_4557,0.339744,-0.248873,0.362194,0.113978,-0.995315,1.017968,0.009462,-1.230557,-0.669743,0.456969,...,-0.109830,-0.520854,-0.731321,0.113957,0.164467,0.731341,0.013439,-0.310736,0.710345,1.531164
073_S_4300,1.281851,-0.268121,1.397159,0.309577,-0.082331,1.332514,-0.512026,-0.766670,0.194712,1.616675,...,0.238196,-1.215688,-1.230530,-0.607797,0.501911,0.787215,-0.744482,0.328038,0.291592,-0.393906
041_S_4014,-1.105440,-0.330677,-0.061201,0.271961,0.566658,0.065081,-0.136983,-0.348712,-0.883188,0.281857,...,-2.557480,0.600729,0.008885,-0.377032,2.003773,-0.011775,1.254386,1.389053,-2.306719,0.196449


## SMOTE Oversampling

In [20]:
# Feature names (gene / probe columns) and class counts before oversampling
feature_list = list(df.columns)
print(dfanno.value_counts())

# Oversample the minority classes with SMOTE, seeded with `r`.
# `n_jobs` is intentionally not passed: the argument was deprecated in
# imbalanced-learn 0.10 and later removed from SMOTE, so passing it breaks on
# current releases.
# NOTE(review): fit_resample pairs the rows of `df` with the entries of `dfanno`
# by position, not by index label - confirm both are in the same sample order.
# The resampled objects come back with a fresh 0..N-1 integer index.
df, dfanno = SMOTE(random_state=r).fit_resample(df, dfanno)
print(dfanno.value_counts())

CN      260
LMCI    226
EMCI    215
AD       43
Name: 1, dtype: int64
EMCI    260
CN      260
LMCI    260
AD      260
Name: 1, dtype: int64


## Sort

In [21]:
# Flip the matrix so that genes / probes are rows and samples are columns.
df = df.transpose()
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1030,1031,1032,1033,1034,1035,1036,1037,1038,1039
SMOX / 11741507_a_at,0.807886,-2.097621,-0.864382,1.119981,-1.440825,0.860290,-0.941241,0.555183,1.505441,0.533057,...,-0.340778,1.454268,-0.221489,-0.535720,0.781523,-1.008846,0.755560,-1.350559,0.658684,-0.315920
TMCC2 / 11721929_x_at,-0.941795,-0.679543,-0.737287,0.930056,-0.643454,-0.020305,-0.953825,0.829005,-0.501501,-0.604958,...,-0.319962,0.424352,0.859460,0.682132,0.390739,-0.092731,0.393596,-0.565275,-0.459336,0.349047
RBM38 / 11715594_a_at,-0.165369,-1.102886,-0.249376,1.533250,-1.400271,0.726784,-0.975196,1.261067,0.750306,0.006005,...,-0.014997,0.637781,1.045923,0.816362,0.560388,-1.138785,0.563743,0.400354,0.872278,0.168388
CXORF22 / 11758541_s_at,-1.571180,1.949596,0.076363,1.580968,-1.112275,-1.029522,-0.194466,-0.397588,0.964080,0.527744,...,-1.217127,3.856393,0.100287,1.624328,-1.197034,0.095782,-1.176729,-1.356769,0.303156,-0.669320
SUMF1 / 11720732_a_at,-1.277643,-1.347309,-0.731320,0.698655,-0.243661,-1.424307,0.221997,-0.492990,0.188998,0.163331,...,-1.741639,0.999676,-0.831184,0.800618,-0.446848,-0.115051,-0.407658,-0.155828,-0.418780,-1.168726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MEST / 11758268_s_at,-0.341429,-0.581685,-0.201745,1.183916,0.396101,0.312291,1.200678,1.792937,-0.654320,-0.084411,...,-0.168221,4.060574,-0.628740,-0.310039,1.032032,0.024361,1.062796,-0.353163,-0.947126,-0.217759
MRPS10 / 11758291_s_at,0.406144,0.779213,1.372198,-1.247145,-0.587400,0.123396,-0.143643,-0.512786,-1.522038,0.771359,...,-0.215190,-2.589472,1.069344,-0.450999,0.744186,1.059798,0.772804,0.098979,-0.101742,-0.273829
GJB6 / 11733922_s_at,0.614945,0.934333,-0.981990,-0.554336,-1.144391,0.008651,0.625772,-1.323031,0.306385,0.885613,...,-0.441502,-0.645649,0.153210,0.732842,-0.345829,0.482597,-0.339107,-0.144384,-0.628022,-0.123261
CYTH3 / 11757447_s_at,0.618424,1.302728,1.194465,-0.235424,1.541724,0.602082,-0.078136,1.088245,0.865591,0.863548,...,-0.476463,-0.803385,-0.234879,1.108332,-0.184533,1.052919,-0.222465,0.798724,0.350076,0.087201


In [22]:
# Keep only the AD and CN samples, ordered AD first, and binarize the labels
# (AD -> 0, CN -> 1).
#
# Each label is mapped to an integer sort code in one step; both MCI groups get a
# code above the cut-off so the filter below discards them. This replaces a chain
# of `replace` calls that relied on pandas implicitly downcasting an object Series
# to int (deprecated since pandas 2.2), contained no-op steps, and silently turned
# CN into 2 when the cell was executed a second time.
sort_codes = {'AD': 0, 'CN': 3, 'EMCI': 11, 'LMCI': 11}
codes = dfanno.map(sort_codes)
if codes.isna().any():
    # Unknown labels (including already-binarized 0/1 values on a re-run) fail loudly
    # instead of being carried through or corrupted.
    raise ValueError(
        'Unexpected diagnosis labels: ' + repr(list(dfanno[codes.isna()].unique()))
    )

# Same sort-then-filter sequence as before, so the sample order is intended to be unchanged.
codes = codes.sort_values()
codes = codes[codes < 4]

# Binarize: AD stays 0, CN (sort code 3) becomes 1.
dfanno = codes.replace(3, 1)

# Restrict and reorder the expression columns to the retained samples.
df = df[dfanno.index]

In [23]:
# Display the expression matrix, now restricted to and ordered by the samples in dfanno.
df

Unnamed: 0,858,900,901,789,788,902,787,786,903,899,...,199,256,201,257,207,222,221,255,204,227
SMOX / 11741507_a_at,-0.534207,-0.836481,-1.131483,-1.668093,0.393497,-0.777390,-1.688704,-0.011357,-1.364594,-0.733269,...,0.727533,-0.239028,-1.572417,0.168558,-0.551122,-0.346165,-0.814307,-0.070171,-0.080652,0.254733
TMCC2 / 11721929_x_at,-1.467301,-0.871943,-0.922696,-0.478792,-0.745354,-0.687640,-0.148263,-0.834096,-1.203366,-0.168416,...,1.796208,0.266006,0.203451,0.643745,-0.496689,0.472920,-0.609770,0.533070,-0.400450,1.596512
RBM38 / 11715594_a_at,-0.390813,-2.118132,-0.923891,-1.038733,-0.061991,-0.186392,1.071192,-0.215659,-0.958510,-0.447204,...,-0.355225,0.362194,-2.275622,0.506686,-0.395548,-1.507799,-1.306183,0.352113,0.311790,0.918319
CXORF22 / 11758541_s_at,-0.645587,0.210253,0.971347,0.975164,0.052644,0.322482,0.724388,-0.194835,1.576400,-0.103648,...,0.783527,2.024827,1.325185,0.347192,-0.284743,0.287008,-1.458335,-0.232081,-0.284743,-1.157414
SUMF1 / 11720732_a_at,-1.672771,-0.096621,-0.446848,-0.489783,-2.057086,-0.949213,-0.454610,-1.688837,-1.634364,0.774337,...,-0.206995,0.500659,0.713322,0.566658,0.053333,-1.570971,2.168963,0.401660,0.511659,1.013983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MEST / 11758268_s_at,0.792757,0.045632,0.098960,-0.393673,-0.956085,-0.277664,-0.743874,-0.692912,-0.568392,-0.489488,...,-1.162769,0.351402,2.597515,-0.268793,1.569443,0.815152,-0.699019,-1.017498,-1.051022,-0.905751
MRPS10 / 11758291_s_at,0.682710,0.513949,0.548920,0.074977,0.325943,-0.789877,-0.248948,-0.146758,-0.009560,-1.957214,...,0.429706,-1.494549,-0.100446,0.637840,1.144429,0.629985,0.044855,-1.156822,-0.131862,-0.775899
GJB6 / 11733922_s_at,-1.086974,-0.731784,-1.358596,-0.604160,1.664739,-0.771049,-0.053875,0.996943,-2.286669,-0.331469,...,-1.041537,0.598705,-0.354043,1.069666,1.767987,-1.793992,-0.245776,2.590815,-1.009057,-0.061722
CYTH3 / 11757447_s_at,0.791200,0.558442,-0.513730,-0.181360,1.629484,-0.697549,0.127242,0.733522,0.414157,-0.132509,...,-1.422231,1.345625,1.445717,-0.490761,0.301806,2.512005,-0.878874,-0.145545,-0.325302,-2.141261


In [24]:
# Display the binarized labels of the retained samples.
dfanno

858    0
900    0
901    0
789    0
788    0
      ..
222    1
221    1
255    1
204    1
227    1
Name: 1, Length: 520, dtype: int64

In [25]:
# Check the class counts after filtering and binarizing.
dfanno.value_counts()

1    260
0    260
Name: 1, dtype: int64

In [26]:
# Split every 'X / Y' row label on ' / ' and use the parts as a two-level
# row index named ('Name', 'Description').
split_labels = df.index.str.split(' / ')
df.index = pd.MultiIndex.from_tuples(
    split_labels.tolist(),
    names=['Name', 'Description'],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,858,900,901,789,788,902,787,786,903,899,...,199,256,201,257,207,222,221,255,204,227
Name,Description,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
SMOX,11741507_a_at,-0.534207,-0.836481,-1.131483,-1.668093,0.393497,-0.777390,-1.688704,-0.011357,-1.364594,-0.733269,...,0.727533,-0.239028,-1.572417,0.168558,-0.551122,-0.346165,-0.814307,-0.070171,-0.080652,0.254733
TMCC2,11721929_x_at,-1.467301,-0.871943,-0.922696,-0.478792,-0.745354,-0.687640,-0.148263,-0.834096,-1.203366,-0.168416,...,1.796208,0.266006,0.203451,0.643745,-0.496689,0.472920,-0.609770,0.533070,-0.400450,1.596512
RBM38,11715594_a_at,-0.390813,-2.118132,-0.923891,-1.038733,-0.061991,-0.186392,1.071192,-0.215659,-0.958510,-0.447204,...,-0.355225,0.362194,-2.275622,0.506686,-0.395548,-1.507799,-1.306183,0.352113,0.311790,0.918319
CXORF22,11758541_s_at,-0.645587,0.210253,0.971347,0.975164,0.052644,0.322482,0.724388,-0.194835,1.576400,-0.103648,...,0.783527,2.024827,1.325185,0.347192,-0.284743,0.287008,-1.458335,-0.232081,-0.284743,-1.157414
SUMF1,11720732_a_at,-1.672771,-0.096621,-0.446848,-0.489783,-2.057086,-0.949213,-0.454610,-1.688837,-1.634364,0.774337,...,-0.206995,0.500659,0.713322,0.566658,0.053333,-1.570971,2.168963,0.401660,0.511659,1.013983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MEST,11758268_s_at,0.792757,0.045632,0.098960,-0.393673,-0.956085,-0.277664,-0.743874,-0.692912,-0.568392,-0.489488,...,-1.162769,0.351402,2.597515,-0.268793,1.569443,0.815152,-0.699019,-1.017498,-1.051022,-0.905751
MRPS10,11758291_s_at,0.682710,0.513949,0.548920,0.074977,0.325943,-0.789877,-0.248948,-0.146758,-0.009560,-1.957214,...,0.429706,-1.494549,-0.100446,0.637840,1.144429,0.629985,0.044855,-1.156822,-0.131862,-0.775899
GJB6,11733922_s_at,-1.086974,-0.731784,-1.358596,-0.604160,1.664739,-0.771049,-0.053875,0.996943,-2.286669,-0.331469,...,-1.041537,0.598705,-0.354043,1.069666,1.767987,-1.793992,-0.245776,2.590815,-1.009057,-0.061722
CYTH3,11757447_s_at,0.791200,0.558442,-0.513730,-0.181360,1.629484,-0.697549,0.127242,0.733522,0.414157,-0.132509,...,-1.422231,1.345625,1.445717,-0.490761,0.301806,2.512005,-0.878874,-0.145545,-0.325302,-2.141261


## Create CLS File

In [27]:
# Write the CLS file: a header line "<number of labels> 2 1", the class-name line
# "# AD CTL", then the labels of dfanno separated by single spaces.
# The label line is built before the file is opened, and the context manager
# guarantees the handle is closed even if the write raises.
cls_labels = ' '.join(map(str, dfanno))
with open('GSEA/Diagnosis_polar.cls', 'w') as f:
    f.write(str(dfanno.size) + ' 2 1\n# AD CTL\n' + cls_labels)


## Create Gene Expression Matrix

In [28]:
# Export the expression matrix as a tab-separated text file with minimal quoting.
df.to_csv(
    'GSEA/Genes_polar.txt',
    sep="\t",
    quoting=csv.QUOTE_MINIMAL,
)