In [6]:
import pandas as pd
from scipy.stats import ttest_rel,ttest_ind,wilcoxon,mannwhitneyu
from statsmodels.stats.multitest import multipletests

# Load the two tab-separated tables into DataFrames.
# NOTE(review): the file names suggest paired normal/tumour LUSC expression
# data from TCGA -- confirm against the data source.
healthy_path = 'data/lusc-rsem-fpkm-tcga_paired.txt'
cancer_path = 'data/lusc-rsem-fpkm-tcga-t_paired.txt'
healthy = pd.read_csv(healthy_path, sep='\t')
cancer = pd.read_csv(cancer_path, sep='\t')


In [7]:
# Table sizes as loaded.
print(healthy.shape, cancer.shape, sep='\n')

# Keep only the rows in which fewer than `zero_limit` cells are exactly zero.
zero_limit = 25
healthy = healthy.loc[healthy.eq(0).sum(axis=1) < zero_limit]
cancer = cancer.loc[cancer.eq(0).sum(axis=1) < zero_limit]
print(healthy.shape, cancer.shape, sep='\n')

# Restrict both tables to the gene symbols that survived the filter in each
# of them. Row order is left untouched, so code that compares the tables row
# by row relies on both files listing their genes in the same order.
intersected_genes = set(healthy['Hugo_Symbol']) & set(cancer['Hugo_Symbol'])
healthy = healthy.loc[healthy['Hugo_Symbol'].isin(intersected_genes)]
cancer = cancer.loc[cancer['Hugo_Symbol'].isin(intersected_genes)]
print(healthy.shape, cancer.shape, sep='\n')


(19648, 52)
(19648, 52)
(17626, 52)
(17717, 52)
(17275, 52)
(17275, 52)


In [8]:
from scipy.stats import shapiro

# For each table, count the rows whose Shapiro-Wilk p-value is below 0.05.
# The test is applied to the columns from position 2 onward; the first two
# columns are skipped.
row_count = cancer.shape[0]
healthy_nonNormal = sum(
    1 for row in range(row_count) if shapiro(healthy.iloc[row, 2:])[1] < 0.05
)
cancer_nonNormal = sum(
    1 for row in range(row_count) if shapiro(cancer.iloc[row, 2:])[1] < 0.05
)
print(healthy_nonNormal, cancer_nonNormal)

10281 15247


In [9]:
def _fdr_table(symbols, p_values, p_values_fdr, alpha=0.05):
    """Build the per-gene result table for one statistical test.

    Parameters
    ----------
    symbols : list
        Gene symbols, one per tested row.
    p_values : sequence of float
        Raw p-values, in the same order as ``symbols``.
    p_values_fdr : sequence of float
        FDR-adjusted p-values, in the same order as ``symbols``.
    alpha : float, default 0.05
        Adjusted p-values strictly below this value are flagged significant.

    Returns
    -------
    pandas.DataFrame
        Columns 'Hugo_Symbol', 'p_values', 'p_values_fdr' and the boolean
        flag column 'significance:p_vlaue_fdr'.
    """
    table = pd.DataFrame({'Hugo_Symbol': symbols, 'p_values': p_values, 'p_values_fdr': p_values_fdr})
    # The flag column name is kept exactly as it was (spelling included),
    # because other cells may select it by name.
    table['significance:p_vlaue_fdr'] = table['p_values_fdr'] < alpha
    return table


# Both tests below compare row i of `healthy` with row i of `cancer`, so the
# two tables must list the same gene symbols in the same order. Fail loudly
# instead of silently comparing different genes.
if not healthy['Hugo_Symbol'].reset_index(drop=True).equals(
        cancer['Hugo_Symbol'].reset_index(drop=True)):
    raise ValueError("healthy and cancer tables do not list the same Hugo_Symbol values in the same order")

gene_symbols = healthy['Hugo_Symbol'].tolist()

# Pull the value columns (everything from position 2 onward) out once,
# rather than slicing the DataFrames on every loop iteration.
healthy_values = healthy.iloc[:, 2:].to_numpy(dtype=float)
cancer_values = cancer.iloc[:, 2:].to_numpy(dtype=float)

rel_val = []
ind_val = []
for healthy_row, cancer_row in zip(healthy_values, cancer_values):
    # Paired test: Wilcoxon signed-rank on the element-wise differences.
    # NOTE(review): this assumes column k of both tables belongs to the same
    # subject -- confirm against the input files.
    rel_val.append(wilcoxon(healthy_row, cancer_row).pvalue)
    # Independent-samples test: Mann-Whitney U. The two-sided alternative is
    # stated explicitly so the result does not depend on the installed
    # SciPy version's default.
    ind_val.append(mannwhitneyu(healthy_row, cancer_row, alternative='two-sided').pvalue)

# Benjamini-Hochberg FDR correction; element [1] of the result holds the
# adjusted p-values.
p_relval_fdr = multipletests(rel_val, alpha=0.05, method='fdr_bh')[1]
p_indval_fdr = multipletests(ind_val, alpha=0.05, method='fdr_bh')[1]

# Per-gene result tables (raw and adjusted p-values) and the subsets that are
# significant after FDR correction, for the paired test ...
sign_paired = _fdr_table(gene_symbols, rel_val, p_relval_fdr)
DEG_paired_fdr = sign_paired[sign_paired['significance:p_vlaue_fdr']]

# ... and for the independent-samples test.
sign_indg = _fdr_table(gene_symbols, ind_val, p_indval_fdr)
DEG_ind_fdr = sign_indg[sign_indg['significance:p_vlaue_fdr']]



In [10]:
# Number of genes flagged significant after FDR correction:
# paired test first, independent-samples test second.
print(len(DEG_paired_fdr["Hugo_Symbol"]), len(DEG_ind_fdr["Hugo_Symbol"]), sep='\n')

13141
13234
