In [None]:
import pandas as pd
# Load the clinical table from the mmc1 workbook (read_excel's default sheet).
clinical_df = pd.read_excel(io="../input/1-s2.0-S0092867420301070-mmc1.xlsx")


In [None]:
# sheet_name=None makes read_excel return a dict of DataFrames keyed by sheet name.
proteomics_dfs = pd.read_excel(
    '../input/1-s2.0-S0092867420301070-mmc2.xlsx',
    sheet_name=None,
)

# Pull out the two sheets the later cells work with.
global_df, phospho_df = (
    proteomics_dfs[sheet_name]
    for sheet_name in ('A-global-proteomics', 'B-phospho-proteomics')
)

In [None]:
# Keep only the cases that were NOT excluded from the study and whose
# histologic type is Endometrioid or Serous.
print("Total number of cases:", len(clinical_df))
is_included = clinical_df['Case_excluded'] == 'No'
is_tumor_type = clinical_df['Histologic_type'].isin(['Endometrioid','Serous'])
case_df = clinical_df[is_included & is_tumor_type]
print("Tumor cases:", len(case_df))

# Split the retained cases by histologic type.
histologic_type = case_df['Histologic_type']
serous_df = case_df[histologic_type == 'Serous']
endometrioid_df = case_df[histologic_type == 'Endometrioid']
print("Serous cases:", len(serous_df))
print("Endometrioid cases:", len(endometrioid_df))

Total number of cases: 153
Tumor cases: 95
Serous cases: 12
Endometrioid cases: 83


Q2-easy: What is the average age of patients with serous tumor samples analyzed in the study?

In [None]:
# Mean of the Age column over the serous cases selected above.
serous_age = serous_df.loc[:, 'Age'].mean()
print("Average age of serous cases:", serous_age)

Average age of serous cases: 68.5


Q1-hard: What is the correlation between the abundance of the protein PLK1 and the abundance of CHEK2-S163 in tumor samples? Exclude samples that are not in the study or that have missing values.

In [None]:
from scipy.stats import spearmanr
import numpy as np

# Restrict both proteomics tables to the tumor cases kept in case_df.
# "idx" (the row label column) is requested last, so it stays the final column.
case_ids = case_df['idx'].tolist()
global_df = global_df.filter(items=case_ids+["idx"], axis=1)
phospho_df = phospho_df.filter(items=case_ids+["idx"], axis=1)

# DataFrame.filter silently drops requested columns that a table does not have,
# so the two tables may not share the same sample columns. Pair the values by
# sample id rather than by position so each PLK1 value is matched with the
# CHEK2-S163 value of the same sample.
shared_ids = [
    case_id for case_id in dict.fromkeys(case_ids)
    if case_id in global_df.columns and case_id in phospho_df.columns
]

# .iloc[0] raises IndexError if the requested row label is absent.
plk1_row = global_df.loc[global_df['idx'] == 'PLK1', shared_ids].iloc[0]
chek2_row = phospho_df.loc[phospho_df['idx'] == 'CHEK2-S163', shared_ids].iloc[0]

# Drop every sample where EITHER measurement is missing; a NaN left in one of
# the inputs makes spearmanr return nan under its default nan_policy.
paired = (
    pd.DataFrame({'PLK1': plk1_row, 'CHEK2-S163': chek2_row})
    .astype(float)
    .dropna()
)
plk1 = paired['PLK1'].to_numpy()
chek2 = paired['CHEK2-S163'].to_numpy()

# Calculate the Spearman correlation
spearman_corr, p_value = spearmanr(plk1, chek2)
print("Spearman correlation coefficient:", spearman_corr)
print("P-value:", p_value)

Spearman correlation coefficient: 0.4764813684787027
P-value: 0.0004084391725761019


Q3-hard: What is the age of the patient associated with the lowest APM-Z score?

In [None]:
# Passing the sheet name as a string returns that sheet's DataFrame directly.
tmb_df = pd.read_excel(
    '../input/1-s2.0-S0092867420301070-mmc7.xlsx',
    sheet_name="B-APM subtypes",
)

# Case id ('idx') of the row with the MINIMUM APP_Z_score.
min_idx = tmb_df.loc[tmb_df['APP_Z_score'].idxmin(), 'idx']

# Age recorded in the clinical table for that case.
case_df.loc[case_df['idx'] == min_idx, 'Age']

18    60.0
Name: Age, dtype: float64

Q5-hard: What is the median number of variants per Mbp for the serous tumor samples in the study? (Note: the variable is expressed in log2 in the input table)


In [None]:
import numpy as np

# Select the rows belonging to the serous cases. The ids come from serous_df,
# which the case-filtering cell defines. The result gets its own name so
# tmb_df keeps the full table and the cell can be re-run safely.
serous_tmb_df = tmb_df[tmb_df['idx'].isin(serous_df['idx'])]

# Ignore samples without a recorded value; a single NaN would make the median nan.
log2_vpm = serous_tmb_df['Log2_variant_per_Mbp'].dropna().values
if len(log2_vpm) == 0:
    # np.median of an empty array only warns and returns nan; fail loudly instead.
    raise ValueError("No serous cases with a Log2_variant_per_Mbp value were found in tmb_df")

# The column is stored in log2, so convert back to variants per Mbp first.
vpm = 2**log2_vpm
np.median(vpm)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


np.float64(nan)

Q6-easy: What is the most common FIGO stage among the samples taken from patients aged 70 or above?

In [None]:
# Cases with a recorded age of at least 70, then the count of each FIGO stage
# among them (value_counts lists the most frequent stage first).
above_70 = case_df.loc[case_df['Age'].ge(70)]
above_70['FIGO_stage'].value_counts()

FIGO_stage
IA       12
IB        4
IIIA      2
IVB       2
II        2
IIIC1     1
Name: count, dtype: int64

Q7-hard: How many genes are significant by acetylproteomics? (parse the value from the spreadsheet README)

In [None]:
# Load every sheet of the mmc3 workbook; the README sheet maps sheet names to descriptions.
dfs = pd.read_excel('../input/1-s2.0-S0092867420301070-mmc3.xlsx', sheet_name=None)
gene_readme = dfs['README']

# Find the sheet whose description mentions acetylproteomics.
# na=False treats blank descriptions as non-matches; without it the mask would
# contain NaN and .loc would refuse to index with it.
is_acetyl = gene_readme["Description"].str.contains("acetylproteomics", case=False, na=False)
# Takes the first match; raises IndexError if no description matches.
acetyl_sheet = gene_readme.loc[is_acetyl, "Sheet"].to_list()[0]

# Since the sheet is technically a list, the first item gets considered a header and we have to add 1
num_genes = len(dfs[acetyl_sheet])+1
display(dfs[acetyl_sheet].head())
print("Number of genes in acetyl sheet:", num_genes)

Unnamed: 0,BRD8
0,DHX15
1,SSB
2,FUS
3,PARP1
4,TRIM33


Number of genes in acetyl sheet: 16


Hard question: Which proteins found in both comparisons are targeted by FDA-approved drugs?

Q9-easy: What is the difference between the average false discovery rate (FDR) for the CBX3 gene and the average FDR for the rest of the genes?

In [None]:
fdr_df = dfs['F-SS-phospho']

# Per-gene mean of FDR.phos; the CBX3 entry is the mean over all CBX3 rows.
gene_mean = fdr_df.groupby('Gene')['FDR.phos'].mean()
cbx3 = gene_mean.loc['CBX3']

# Mean of FDR.phos over every row that belongs to any gene other than CBX3.
other_genes = fdr_df.loc[fdr_df['Gene'].ne('CBX3')]
other = np.mean(other_genes['FDR.phos'])

fdr_difference = cbx3 - other
print("Mean FDR for CBX3:", cbx3)
print("Mean FDR for other genes:", other)
print("Difference in mean FDR:", fdr_difference)

Mean FDR for CBX3: 0.016840668972299792
Mean FDR for other genes: 0.02484464434616611
Difference in mean FDR: -0.008003975373866316


Hard question: 