In [None]:
!pip install -q cptac

In [None]:
# import modules
import cptac
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
from scipy.stats import shapiro

import warnings
# Notebook-wide settings: suppress every warning from here on and draw
# seaborn axes and figures on a white background.
white_background = {'axes.facecolor':'white', 'figure.facecolor':'white'}
warnings.filterwarnings("ignore")
sns.set(rc=white_background)


In [None]:
# Download the endometrial dataset through cptac
cptac.download(dataset="Endometrial")

In [None]:
# Create an instance of cptac's Endometrial class
endo = cptac.Endometrial()

# Load the proteomics dataframe, print its shape and preview the first rows
df_prot = endo.get_proteomics()
print(df_prot.shape)
df_prot.head()


In [None]:
# Read in the desired gene list from the text file (make sure it's in the
# working directory). The file has no header row; the gene names are taken
# from its first column.
des_genes = pd.read_csv('filtered_genes_list.txt', header = None)

# Only retain the desired genes in the DF
df_prot_edit = df_prot[des_genes[0]]
# Print the shape explicitly: a bare `df_prot_edit.shape` that is not the last
# expression of the cell is evaluated and then silently discarded.
print(df_prot_edit.shape)
df_prot_edit.head()

In [None]:
# Export the selected gene levels per patient to an Excel workbook
genes_workbook = "patients_genes.xlsx"
df_prot_edit.to_excel(genes_workbook)
# this sheet is provided as a link to google sheets in the write up

In [None]:
# Fetch the clinical characteristics and export them to an Excel workbook
df_clin = endo.get_clinical()
clinical_workbook = "patients_clinical.xlsx"
df_clin.to_excel(clinical_workbook)
# this sheet is provided as a link to google sheets in the write up

In [None]:
# Apply the Shapiro-Wilk test to each gene to check if it is normally distributed.
# Significance level used for hypothesis testing = 0.1 (Royston, P. (1995). Remark AS R94: A Remark on Algorithm AS 181: The W-test for Normality. Appl. Stat. 44, 547.)

p = 0.1


def _shapiro_non_missing(values):
  """Run the Shapiro-Wilk test on the non-missing values of one gene column.

  scipy's `shapiro` does not skip missing values by default, so a single NaN
  would invalidate the result for the whole column; they are dropped first.

  Returns a (W statistic, p-value) tuple, or (nan, nan) when fewer than three
  observations remain, since the test needs at least three data points.
  """
  observed = values.dropna()
  if len(observed) < 3:
    return (np.nan, np.nan)
  outcome = shapiro(observed)
  return (outcome[0], outcome[1])


# One (W, p-value) pair per gene column
res = df_prot_edit.apply(_shapiro_non_missing, axis=0)

# The results are stored as formatted strings because they are concatenated
# straight into plot titles later on.
re1 = res.apply(lambda x: '{:<20.3e}'.format(x[0]))
re2 = res.apply(lambda x: '{:<20.3e}'.format(x[1]))
# 'True' when p-value > p, i.e. normality is NOT rejected for that gene
# (a NaN p-value compares as False).
re3 = res.apply(lambda x: '{:}'.format(x[1] > p))

# NOTE: the 't-test' column actually holds the Shapiro-Wilk W statistic; the
# column name is kept unchanged so that existing references keep working.
total_df = pd.DataFrame({
  'Name': df_prot_edit.columns.to_list(),
  't-test': np.asarray(re1),
  'p-value': np.asarray(re2),
  'Sig': np.asarray(re3),
})

total_df.head()

In [None]:
# Count how many genes fall into each 'Sig' category
total_df['Sig'].value_counts()
# Result recorded by the author: 56 are not normally distributed and 22 are

In [None]:
# Plot the distribution of each gene, six panels per figure, and title each
# panel with the result of its Shapiro test.
# The number of figures is derived from the number of genes instead of being
# hard-coded, so the cell neither raises IndexError nor skips genes when the
# gene list does not contain exactly 13 * 6 entries.
panels_per_row = 6
cols_todo = df_prot_edit.columns.to_list()  # loop-invariant, built once

for row_start in range(0, len(cols_todo), panels_per_row):
  fig, ax = plt.subplots(1, panels_per_row, figsize=(18, 4), sharey = True)
  row_genes = cols_todo[row_start:row_start + panels_per_row]

  for i, gene in enumerate(row_genes):
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept here so the figures stay unchanged.
    sns.distplot(df_prot_edit[gene], ax = ax[i])
    # Look the gene up once; 'p-value' and 'Sig' are stored as strings
    gene_result = total_df.loc[total_df['Name'] == gene].iloc[0]
    str_title = 'Shapiro test \n p-val = '+ gene_result['p-value'] + '\n' + gene_result['Sig']
    ax[i].set_title(str_title)

  # Hide the panels left over when the last figure is only partially filled
  for spare_ax in ax[len(row_genes):]:
    spare_ax.set_visible(False)

  plt.show()
  # Release the figure so that many open figures do not pile up in memory
  plt.close(fig)

In [None]:
# Heatmap of the pairwise Spearman correlations between the selected genes
spearman_corr = df_prot_edit.corr(method = 'spearman')
sns.heatmap(spearman_corr, cmap="PiYG")

In [None]:
# Make a clustermap of the Spearman correlations.
# clustermap builds its own figure, so the size has to be passed through its
# `figsize` argument; a preceding plt.figure(figsize=...) has no effect on it
# and only leaves an empty extra figure behind.
sns.clustermap(df_prot_edit.corr(method = 'spearman'), cmap="PiYG",  xticklabels=True, yticklabels=True, linewidths=0.5, figsize=(16, 16))

In [None]:
# Draw the Spearman clustermap again, this time keeping the returned grid in `g`
spearman_corr = df_prot_edit.corr(method = 'spearman')
g = sns.clustermap(
    spearman_corr,
    cmap="PiYG",
    xticklabels=True,
    yticklabels=True,
    linewidths=0.5,
)

In [None]:
# Gene names arranged in the order given by the clustermap's column dendrogram
column_order = g.dendrogram_col.reordered_ind
df_prot_edit.columns[column_order]