# Cancer Genes

In [15]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests 

In [3]:
# Load the sequencing table; the first CSV row is used as the header.
DATA_FILE = 'gene_high_throughput_sequencing.csv'
df = pd.read_csv(DATA_FILE, header=0)
print(df.shape)  # (rows, columns)
df.head()

(72, 15750)


Unnamed: 0,Patient_id,Diagnosis,LOC643837,LOC100130417,SAMD11,NOC2L,KLHL17,PLEKHN1,C1orf170,HES4,...,CLIC2,RPS4Y1,ZFY,PRKY,USP9Y,DDX3Y,CD24,CYorf15B,KDM5D,EIF1AY
0,STT5425_Breast_001_normal,normal,1.257614,2.408148,13.368622,9.494779,20.880435,12.722017,9.494779,54.349694,...,4.76125,1.257614,1.257614,1.257614,1.257614,1.257614,23.268694,1.257614,1.257614,1.257614
1,STT5427_Breast_023_normal,normal,4.567931,16.602734,42.477752,25.562376,23.221137,11.622386,14.330573,72.445474,...,6.871902,1.815112,1.815112,1.815112,1.815112,1.815112,10.427023,1.815112,1.815112,1.815112
2,STT5430_Breast_002_normal,normal,2.077597,3.978294,12.863214,13.728915,14.543176,14.141907,6.23279,57.011005,...,7.096343,2.077597,2.077597,2.077597,2.077597,2.077597,22.344226,2.077597,2.077597,2.077597
3,STT5439_Breast_003_normal,normal,2.066576,8.520713,14.466035,7.823932,8.520713,2.066576,10.870009,53.292034,...,5.20077,2.066576,2.066576,2.066576,2.066576,2.066576,49.295538,2.066576,2.066576,2.066576
4,STT5441_Breast_004_normal,normal,2.613616,3.434965,12.682222,10.543189,26.688686,12.484822,1.364917,67.140393,...,11.22777,1.364917,1.364917,1.364917,1.364917,1.364917,23.627911,1.364917,1.364917,1.364917


In [11]:
# One frame per diagnosis group, with the two non-numeric metadata columns
# (patient id and diagnosis label) removed so only the measurements remain.
df_n = df.loc[df['Diagnosis'].eq('normal')].drop(columns=['Patient_id', 'Diagnosis'])
df_en = df.loc[df['Diagnosis'].eq('early neoplasia')].drop(columns=['Patient_id', 'Diagnosis'])
df_c = df.loc[df['Diagnosis'].eq('cancer')].drop(columns=['Patient_id', 'Diagnosis'])

In [16]:
# Count the columns whose mean differs significantly (t-test with
# equal_var=False, uncorrected p <= 0.05) between normal vs. early neoplasia
# and between cancer vs. early neoplasia.
# ttest_ind works along axis 0, so one call per comparison tests every column
# at once instead of issuing two scipy calls per column from a Python loop.
pvals_n_en = stats.ttest_ind(df_n.values, df_en.values, axis=0, equal_var=False)[1]
pvals_c_en = stats.ttest_ind(df_c.values, df_en.values, axis=0, equal_var=False)[1]
diff_n_en = int((pvals_n_en <= 0.05).sum())
diff_c_en = int((pvals_c_en <= 0.05).sum())
print(diff_n_en, diff_c_en)

1575 3490


In [18]:
# Summary table with one row per measurement column of df_n, named in 'Genes'.
df_calculs = pd.DataFrame({'Genes': df_n.columns})

In [35]:
# Per-column mean within each diagnosis group (axis 0 averages over the rows),
# stored in the same column order as df_calculs['Genes'].
for mean_column, group_frame in (
    ('Mean Normal', df_n),
    ('Mean Neoplasia', df_en),
    ('Mean Cancer', df_c),
):
    df_calculs[mean_column] = group_frame.values.mean(axis=0)

In [47]:
def fold_change(a, b):
    """Return the element-wise ratio of the larger to the smaller of a and b.

    The result is symmetric in its arguments: fold_change(a, b) equals
    fold_change(b, a). Inputs may be scalars or NumPy arrays; the usual
    NumPy broadcasting and division rules apply.
    """
    larger = np.maximum(a, b)
    smaller = np.minimum(a, b)
    return larger / smaller
# Fold change of the normal and cancer group means, each against the
# early-neoplasia group mean.
mean_neoplasia = df_calculs['Mean Neoplasia'].values
df_calculs['Fold NN'] = fold_change(df_calculs['Mean Normal'].values, mean_neoplasia)
df_calculs['Fold NC'] = fold_change(df_calculs['Mean Cancer'].values, mean_neoplasia)

In [49]:
# t-test with equal_var=False for every column: normal vs. early neoplasia and
# cancer vs. early neoplasia. ttest_ind operates along axis 0, so one call per
# comparison tests all columns at once instead of looping over them in Python.
# df_n, df_en and df_c share the column order used to build df_calculs, so the
# resulting arrays line up with its rows.
p_value_n_en = stats.ttest_ind(df_n.values, df_en.values, axis=0, equal_var=False)[1]
p_value_c_en = stats.ttest_ind(df_c.values, df_en.values, axis=0, equal_var=False)[1]
df_calculs['P-value NN'] = p_value_n_en
df_calculs['P-value CN'] = p_value_c_en
# Number of columns with an uncorrected p-value at or below 0.05, per comparison.
print(int((df_calculs['P-value NN'] <= 0.05).sum()),
      int((df_calculs['P-value CN'] <= 0.05).sum()))

1575 3490


In [60]:
# Multiple-testing correction of both p-value columns, first with Holm and
# then with Benjamini-Hochberg; element [1] of the multipletests result is the
# array of corrected p-values.
for corrected_column, raw_column, correction in (
    ('PV-Holm NN', 'P-value NN', 'holm'),
    ('PV-Holm CN', 'P-value CN', 'holm'),
    ('PV-BH NN', 'P-value NN', 'fdr_bh'),
    ('PV-BH CN', 'P-value CN', 'fdr_bh'),
):
    correction_result = multipletests(df_calculs[raw_column].values, alpha=0.025, method=correction)
    df_calculs[corrected_column] = correction_result[1]

In [63]:
# Rows that pass both filters: fold change of at least 1.5 and a corrected
# p-value of at most 0.025. Printed as Holm (NN, CN) followed by BH (NN, CN).
fold_nn_ok = df_calculs['Fold NN'] >= 1.5
fold_nc_ok = df_calculs['Fold NC'] >= 1.5
selected_counts = [
    int((fold_ok & (df_calculs[pv_column] <= 0.025)).sum())
    for fold_ok, pv_column in (
        (fold_nn_ok, 'PV-Holm NN'),
        (fold_nc_ok, 'PV-Holm CN'),
        (fold_nn_ok, 'PV-BH NN'),
        (fold_nc_ok, 'PV-BH CN'),
    )
]
print(*selected_counts)

2 77 4 524
