In [1]:
import pandas as pd
import numpy as np
from math import *
import matplotlib.pyplot as plt
import matplotlib
import matplotlib
import scipy.stats

# CES

Load the dataset containing mRNA experession (Z-score) for the CES genes

In [4]:
df_ces = pd.read_table('../data/colorectal/ces.txt')
df_ces.head()

Unnamed: 0,GENE_ID,COMMON,TCGA-A6-2675-01,TCGA-A6-5667-01,TCGA-AA-3655-01,TCGA-AA-3660-01,TCGA-AA-3662-01,TCGA-AA-3663-01,TCGA-AA-3697-01,TCGA-AA-3712-01,...,TCGA-AG-3586-01,TCGA-AG-A00Y-01,TCGA-AG-3599-01,TCGA-AF-6136-01,TCGA-EF-5831-01,TCGA-DY-A1DD-01,TCGA-G4-6317-02,TCGA-NH-A8F7-06,TCGA-NH-A6GC-06,Unnamed: 635
0,1058,CENPA,118.1389,317.965,312.471,462.3188,271.3924,254.9451,266.2539,319.5952,...,,,,145.1005,290.2193,183.3082,173.7452,220.1391,,
1,64105,CENPK,58.7997,249.7085,338.433,225.1208,259.2405,376.9231,190.0929,260.5671,...,,,,370.603,239.0041,104.1751,178.1577,232.2347,,
2,91687,CENPL,118.1389,191.203,216.0408,324.6377,213.1646,232.4176,210.5263,263.9401,...,,,,270.1005,257.4985,128.2156,65.6371,201.5926,,
3,79019,CENPM,135.9407,326.4441,544.7381,520.7729,143.7975,1159.3407,377.0898,428.7973,...,,,,172.1106,167.872,437.7359,184.7766,316.0972,,
4,55839,CENPN,332.2508,650.0689,616.815,647.8068,697.0987,734.6264,599.5046,832.5287,...,,,,882.4937,680.128,602.9287,506.5747,751.9444,,


Calculate CES for each patient

In [5]:
patients = []
for p in df_ces.columns[2:]:
    patients.append([p,df_ces[p].sum()])

patients = pd.DataFrame(patients)
patients.columns = ['id','ces']
patients.head()

Unnamed: 0,id,ces
0,TCGA-A6-2675-01,2249.6452
1,TCGA-A6-5667-01,4956.4514
2,TCGA-AA-3655-01,6215.7117
3,TCGA-AA-3660-01,6201.6134
4,TCGA-AA-3662-01,5130.7139


# MMR deficiency determination

The genes upreg are upregulated in MSS tumors, downreg are downregulated in MSS tumors

In [6]:
upreg = ['ANG','TFF2','SRSF6','SET','EIF5A','RARRES1' ,'CRIP1','CD55','IQGAP2','P4HA1',
         'TYMS','GLRX','NUCB2','FUT8','HNRNPL','PCCB','EIF5A','POLR2L','GTF2A2','GALNT1',
         'NDUFA9','TNNT1','AHR','PRKAR2B','ATP5A1','AMFR','OSTF1','CALM1','MANF','EWSR1',
         'SSR1','MFAP1','GNAQ','PPP2R5E','USP14','GCH1','KIF11','CASP2','SLC1A1','PGGT1B','RAB27B']
downreg = ['FTO','PLAGL1','STK38','DMD','NONO','CSNK1E','TCF7','POFUT1','OCRL','ELF1','LRP1','MLH1',
           'MARCKS','JUN','ELN','RAB32','TSC22D1','YWHAB','SEMA3C','GABRE','NEK3','PRKCI','TGFBR2',
           'FCGRT','CFTR','SERINC3','HSPH1','TDGF1','CYP2B7P']

In [8]:
mmr_patients = pd.read_table('../data/colorectal/mmr_expr.txt')
mmr_patients.head()

Unnamed: 0,GENE_ID,COMMON,TCGA-D5-6531-01,TCGA-DM-A28A-01,TCGA-DC-5869-01,TCGA-F4-6460-01,TCGA-DY-A1DC-01,TCGA-CM-5862-01,TCGA-CK-4947-01,TCGA-AH-6544-01,...,TCGA-QG-A5YW-01,TCGA-AF-6672-01,TCGA-F5-6464-01,TCGA-AA-3663-01,TCGA-G4-6299-01,TCGA-5M-AAT6-01,TCGA-CK-4950-01,TCGA-G4-6314-01,TCGA-F5-6810-01,Unnamed: 384
0,4292,MLH1,-0.6444,0.643,2.5549,0.0455,0.2859,-0.6856,1.2145,-0.7659,...,0.0743,-0.5902,-0.3142,-2.2311,-0.0654,0.0457,0.8024,1.9059,0.6958,
1,27030,MLH3,-0.4025,1.5304,-0.9405,0.0484,-1.0844,0.5904,-0.867,-0.7483,...,-0.2161,3.6458,0.1934,-1.542,-0.1282,-1.2747,-1.7482,-0.1349,-1.2631,
2,4436,MSH2,0.313,0.5116,2.7492,-0.6859,0.042,0.2577,1.5517,1.1196,...,-0.2916,-0.3317,-0.9564,-0.698,1.6207,-0.5009,0.8594,-0.2281,1.6499,
3,4437,MSH3,0.1619,-1.5667,0.46,0.0013,-0.1862,-0.8603,0.8837,0.4701,...,-0.2429,0.249,-0.809,-3.0072,0.6312,-0.7238,-1.0278,0.0033,-0.121,
4,2956,MSH6,1.2449,0.4288,1.2682,-1.2261,-1.1349,-0.6852,0.2927,1.73,...,-0.3467,-0.3567,-0.7693,-0.526,-0.4068,0.8173,1.6196,0.1003,0.6348,


Sum the expression scores for the upregulated and downregulated genes separately

In [9]:
mmr_scores = []
for p in mmr_patients.columns[2:]:
    up_score = 0
    for g in upreg:
        score = mmr_patients[mmr_patients['COMMON']==g][p]
        if not len(score)==1:
            score = np.mean(score)
        else:
            try:
                score = float(score)
            except:
                print (score)
        up_score += score
        
    down_score = 0
    for g in downreg:
        score = mmr_patients[mmr_patients['COMMON']==g][p]
        if not len(score)==1:
            score = np.mean(score)
        else:
            try:
                score = float(score)
            except:
                print(score)
        down_score+= score
        
    mmr_scores.append([p,up_score,down_score])  
    
mmr_scores = pd.DataFrame(mmr_scores)
mmr_scores.columns = ['id','upscore','downscore']
mmr_scores['msi_score'] = mmr_scores['upscore'] - mmr_scores['downscore']
mmr_scores.head()

Unnamed: 0,id,upscore,downscore,msi_score
0,TCGA-D5-6531-01,,,
1,TCGA-DM-A28A-01,,,
2,TCGA-DC-5869-01,,,
3,TCGA-F4-6460-01,,,
4,TCGA-DY-A1DC-01,,,


In [10]:
results = mmr_scores.merge(patients,how='inner',on='id')
results.head()

Unnamed: 0,id,upscore,downscore,msi_score,ces
0,TCGA-D5-6531-01,,,,6016.3981
1,TCGA-DM-A28A-01,,,,6701.2242
2,TCGA-DC-5869-01,,,,6909.5322
3,TCGA-F4-6460-01,,,,4800.8761
4,TCGA-DY-A1DC-01,,,,6244.2262


In [11]:
tmp = results[['ces','msi_score']].dropna()
low = np.percentile(tmp['ces'],25)
high = np.percentile(tmp['ces'],75)
l1  = list(tmp[tmp['ces']<low]['msi_score'])
l2  = list(tmp[tmp['ces']>high]['msi_score'])
plt.boxplot([l1,l2],labels=['Low CES','High CES'])
plt.ylabel('MMR score')
plt.title('Comparison of MMR score in low and high CES patients')
plt.show()


IndexError: cannot do a non-empty take from an empty axes.



Categorization of the patients/tumors. If the category is not obvious, we don't try to guess


In [None]:
for ix,p in results.iterrows():
    if p['upscore']>0 and p['downscore']<0:
        cat = 'msi'
        label = 0
    elif p['upscore']<0 and p['downscore']>0:
        cat = 'mss'
        label = 1
    else:
        cat = 'unknown'
        label = 2
    results = results.set_value(ix,'cat',cat)
    results = results.set_value(ix,'label',label)

results.head()



Only 10% of patients are not classified


In [None]:
float(np.sum(results['label']==2))/float(len(results['label']))

In [None]:
fig, ax = plt.subplots()
colors = ['red','green','blue']
ax.scatter(results['upscore'], results['downscore'],c = results['label'],cmap = matplotlib.colors.ListedColormap(colors))
plt.xlabel('Up score')
plt.ylabel('Down score')
ax.grid(True, which='both')

ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')
plt.show()

RED : MSI , GREEN : MSS

# CNA

Load the dataset

In [None]:
cna_mut = pd.read_table('data/colorectal/Mutation Count vs. CNA.txt').drop(['Patient ID','Group'],axis=1)
cna_mut.head()

In [None]:
results = results.merge(cna_mut, how='outer', left_on='id', right_on='Sample ID')
results.head()



CES / CNA correlation. Colors represent the MMR deficiency classification


In [None]:
fig, ax = plt.subplots()
colors = ['red','green','blue']
ax.scatter(results['ces'], results['CNA'],c = results['label'],cmap = matplotlib.colors.ListedColormap(colors))
ax.grid(True, which='both')

ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')
plt.xlabel('CES')
plt.ylabel('CNA')
plt.show()

RED: MSI, GREEN : MSS

It appears the MSI patients have way less CNA than MSS patients


In [None]:
plt.boxplot([results[results['label']==1]['CNA'].dropna(),results[results['label']==0]['CNA'].dropna()],labels=['MSS','MSI'])
plt.ylabel('CNA')
plt.title('Comparison of CNA distribution in MSS and MSI patients')
plt.show()

In [None]:
plt.boxplot([results[results['cat']=='mss']['ces'],results[results['cat']=='msi']['ces']],labels=['MSS','MSI'])
plt.ylabel('CES')
plt.title('Comparison of CES distribution in MSS and MSI patients')
plt.show()

# Mutations

In [None]:
fig, ax = plt.subplots()
colors = ['red','green','blue']
ax.scatter(results['ces'], results['Mutation Count'],c = results['label'],cmap = matplotlib.colors.ListedColormap(colors))
plt.xlabel('CES')
plt.ylabel('Mutation Count')
plt.show()

In [None]:
fig, ax = plt.subplots()
colors = ['red','green','blue']
ax.scatter(results['CNA'], results['Mutation Count'])#,c = results['label'],cmap = matplotlib.colors.ListedColormap(colors))
plt.xlabel('CNA')
plt.ylabel('Mutation Count')
plt.show()

# MLH1 study

In [None]:
df_expr = pd.read_table('data/colorectal/mmr_expr.txt')
mlh1 = df_expr[df_expr['COMMON']=='MLH1'].transpose()[2:]
mlh1.columns=['mlh1']
mlh1.head()

In [None]:
df_meth = pd.read_table('data/colorectal/mmr_meth.txt')
mlh1_meth = df_meth[df_meth['COMMON']=='MLH1'].transpose()[2:]
mlh1_meth.columns=['mlh1_meth']
mlh1_meth.head()

In [None]:
results = results.merge(mlh1, how='outer', left_on='id', right_index=True)
results = results.merge(mlh1_meth, how='outer', left_on='id', right_index=True)
results.head()

## Comparison CES/MLH1

In [None]:
fig , ax = plt.subplots()
colors=['red','green','blue']
ax.scatter(results['ces'],results['mlh1'],c = results['label'],cmap = matplotlib.colors.ListedColormap(colors))
plt.xlabel('CES')
plt.ylabel('Expression level')
plt.title('CES in function of expression level of MLH1')
plt.show()



RED: MSI, GREEN : MSS, BLUE : unkown


In [None]:
low = np.percentile(results['ces'].dropna(),0.3)
high = np.percentile(results['ces'].dropna(),0.7)
l1  = list(results[results['ces']<low]['msh2'])
l2 = list(results[results['ces']>high]['msh2'])
plt.boxplot([l1,l2],labels=['Low CES','High CES'])
plt.ylabel('Expression level')
plt.title('Comparison of MLH1 expression level in low and high CES patients')
plt.show()
print('Pvalue of t-test on high and low: ', scipy.stats.ttest_ind(l1, l2))

In [None]:
fig, ax = plt.subplots()
colors = ['red','green','blue']
ax.scatter(results['mlh1_meth'],results['mlh1'] , c = results['label'],cmap = matplotlib.colors.ListedColormap(colors))
plt.xlabel('Methylation')
plt.ylabel('Expression level')
plt.title('Expression level in function of methlyation of MLH1')
plt.show()

% of MSI tumors with an expression level of MLH1 under -1

In [None]:
float(np.sum(results[results['cat']=='msi']['mlh1']<-1))/float(len(results[results['cat']=='msi']['mlh1'].dropna()))

In [None]:
l1  = list(results[results['cat']=='mss']['mlh1'])
l2 = list(results[results['cat']=='msi']['mlh1'])
plt.boxplot([l1,l2],labels=['MSS','MSI'])
plt.ylabel('Expression level')
plt.title('Comparison of MLH1 expression level in MSS and MSI patients')
plt.show()

# MSH2 study

In [None]:
df_expr = pd.read_table('data/colorectal/mmr_expr.txt')
msh2 = df_expr[df_expr['COMMON']=='MSH2'].transpose()[2:]
msh2.columns=['msh2']
msh2.head()

In [None]:
df_meth = pd.read_table('data/colorectal/mmr_meth.txt')
msh2_meth = df_meth[df_meth['COMMON']=='MSH2'].transpose()[2:]
msh2_meth.columns=['msh2_meth']
msh2_meth.head()

In [None]:
results = results.merge(msh2, how='outer', left_on='id', right_index=True)
results = results.merge(msh2_meth, how='outer', left_on='id', right_index=True)
results.head()

## Comparison CES/MSH2

In [None]:
fig , ax = plt.subplots()
colors=['red','green','blue']
ax.scatter(results['ces'],results['msh2'],c = results['label'],cmap = matplotlib.colors.ListedColormap(colors))
plt.xlabel('CES')
plt.ylabel('Expression level')
plt.title('CES in function of expression level of MSH2')
plt.show()

In [None]:
import scipy
tmp = results[['ces','msh2']].dropna()
scipy.stats.spearmanr(tmp['ces'],tmp['msh2']).pvalue



RED: MSI, GREEN : MSS, BLUE : unkown


In [None]:
fig, ax = plt.subplots()
colors = ['red','green','blue']
ax.scatter(results['msh2_meth'],results['msh2'] , c = results['label'],cmap = matplotlib.colors.ListedColormap(colors))
plt.xlabel('Methylation')
plt.ylabel('Expression level')
plt.title('Expression level in function of methlyation of MSH2')
plt.show()

% of MSI tumors with an expression level of MLH1 under -1

In [None]:
float(np.sum(results[results['cat']=='msi']['msh2']<-1))/float(len(results[results['cat']=='msi']['msh2'].dropna()))

In [None]:
l1  = list(results[results['cat']=='mss']['msh2'])
l2 = list(results[results['cat']=='msi']['msh2'])
plt.boxplot([l1,l2],labels=['MSS','MSI'])
plt.ylabel('Expression level')
plt.title('Comparison of MSH2 expression level in MSS and MSI patients')
plt.show()

# MSH6 study

In [None]:
df_expr = pd.read_table('data/colorectal/mmr_expr.txt')
msh2 = df_expr[df_expr['COMMON']=='PMS2'].transpose()[2:]
msh2.columns=['pms2']
msh2.head()

In [None]:
df_meth = pd.read_table('data/colorectal/mmr_meth.txt')
msh2_meth = df_meth[df_meth['COMMON']=='PMS2'].transpose()[2:]
msh2_meth.columns=['pms2_meth']
msh2_meth.head()

In [None]:
results = results.merge(msh2, how='outer', left_on='id', right_index=True)
results = results.merge(msh2_meth, how='outer', left_on='id', right_index=True)
results.head()

## Comparison CES/MSH2

In [None]:
fig , ax = plt.subplots()
colors=['red','green','blue']
ax.scatter(results['ces'],results['pms2'],c = results['label'],cmap = matplotlib.colors.ListedColormap(colors))
plt.xlabel('CES')
plt.ylabel('Expression level')
plt.title('CES in function of expression level of PMS2')
plt.show()

In [None]:
import scipy
tmp = results[['ces','pms2']].dropna()
scipy.stats.spearmanr(tmp['ces'],tmp['pms2']).pvalue



RED: MSI, GREEN : MSS, BLUE : unkown


In [None]:
fig, ax = plt.subplots()
colors = ['red','green','blue']
ax.scatter(results['pms2_meth'],results['pms2'] , c = results['label'],cmap = matplotlib.colors.ListedColormap(colors))
plt.xlabel('Methylation')
plt.ylabel('Expression level')
plt.title('Expression level in function of methlyation of PMS2')
plt.show()

% of MSI tumors with an expression level of MLH1 under -1

In [None]:
float(np.sum(results[results['cat']=='msi']['msh2']<-1))/float(len(results[results['cat']=='msi']['msh2'].dropna()))

In [None]:
l1  = list(results[results['cat']=='mss']['pms2'])
l2 = list(results[results['cat']=='msi']['pms2'])
plt.boxplot([l1,l2],labels=['MSS','MSI'])
plt.ylabel('Expression level')
plt.title('Comparison of PMS2 expression level in MSS and MSI patients')
plt.show()

In [None]:
df_ces = pd.read_table('data/broad_colorectal_ rnaseqv2/colorectal.txt')
df_ces.head()

In [None]:
df_ces['Hybridization REF'].unique()

In [None]:
genes = pd.read_csv('data/gene_list.csv')
mmr_genes = genes['MMR pathway'].dropna()
mmr_genes

In [None]:
l = []
for ix in df_ces['Hybridization REF'].unique():
    l.append(ix)

    print(len(pd.Series(l)))
len(pd.Series(l).unique())

In [None]:
l1  = list(results[results['cat']=='mss']['ces'])
l2 = list(results[results['cat']=='msi']['ces'])
plt.boxplot([l1,l2],labels=['MSS','MSI'])
plt.ylabel('CES')
plt.title('Comparison of CES score in MSS and MSI patients')
plt.show()

In [None]:
print('Pvalue of t-test on high and low: ', scipy.stats.ttest_ind(l1, l2))