In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dash_bio
from math import log2, log10
from scipy import stats

In [2]:
#create pandas dataframe
path = "./RawData/tidy.csv"
pathMet = "./RawData/metadata.csv"
tidy = pd.read_csv(path, sep=",")
tidyMet = pd.read_csv(pathMet, sep=";", index_col=0)

#remove samples which are not in the metadata index column (quality controle etc)
tidy = tidy[ (tidy["R.FileName"].isin(tidyMet.index)) ]
tidyMer    = pd.merge(tidy, tidyMet, how="left", on="R.FileName")
tidySub = tidyMer[["R.FileName", "uniprot", "meanAbu", "Cancer"]]
tidySub.Cancer.value_counts()

Breast Cancer        84630
Lung Cancer          81900
Pancreatic Cancer    81900
Colorectal Cancer    81900
Healthy              81900
Prostate Cancer      81900
Name: Cancer, dtype: int64

In [3]:
tidySub.head()

Unnamed: 0,R.FileName,uniprot,meanAbu,Cancer
0,J_D201211_MDIA_P705_SExp01-BGSID-1_R01,A0A075B6H9,11601.398438,Lung Cancer
1,J_D201211_MDIA_P705_SExp01-BGSID-1_R01,A0A075B6I0,3827.466797,Lung Cancer
2,J_D201211_MDIA_P705_SExp01-BGSID-1_R01,A0A075B6I1,114115.796875,Lung Cancer
3,J_D201211_MDIA_P705_SExp01-BGSID-1_R01,A0A075B6I7,4293.524902,Lung Cancer
4,J_D201211_MDIA_P705_SExp01-BGSID-1_R01,A0A075B6J1,5969.785645,Lung Cancer


For the pvalues, we need data stored in a list for each protein and cancertype 

In [4]:
#important for pvalues!
ValueDF = tidySub.groupby(['uniprot','Cancer'])['meanAbu'].apply(list)
ValueDF = ValueDF.reset_index()

For the log2 values, we use MeanDF

In [5]:
MeanDF = tidySub.groupby(['uniprot','Cancer'])['meanAbu'].mean()
MeanDF = pd.DataFrame(MeanDF)
MeanDF = MeanDF.reset_index()

In [6]:
# each protein has a mean abu
MeanDF[MeanDF.uniprot == 'A0A075B6H9']

Unnamed: 0,uniprot,Cancer,meanAbu
0,A0A075B6H9,Breast Cancer,13187.140263
1,A0A075B6H9,Colorectal Cancer,11832.107357
2,A0A075B6H9,Healthy,39325.718099
3,A0A075B6H9,Lung Cancer,19166.234025
4,A0A075B6H9,Pancreatic Cancer,30563.667741
5,A0A075B6H9,Prostate Cancer,12818.901481


In [7]:
pivMeanDF = MeanDF.pivot_table(values='meanAbu', index=['uniprot'], columns='Cancer')
pivMeanDF = pivMeanDF.reset_index()

In [8]:
d = {}
for items in pivMeanDF.columns[1:]:
    if items == 'Healthy':
        next
    else:
        print(items)
        name = (str(items.replace(" ", "")).lower())
        d[name] = pd.DataFrame(pivMeanDF[['uniprot', items, 'Healthy']])
        
        d[name] = d[name].set_index('uniprot')
        
        for protein in d[name].index:
        
            x = ValueDF[(ValueDF.uniprot == protein) & (ValueDF.Cancer == items)].meanAbu.values[0]
            y = ValueDF[(ValueDF.uniprot == protein) & (ValueDF.Cancer == 'Healthy')].meanAbu.values[0]
            d[name].at[protein, 'pvalue']  = stats.ttest_ind(x, y).pvalue
            d[name].pvalue.apply((log10))
            d[name].pvalue.multiply(-1)
            
            d[name]['L2F'] = (d[name][items].apply(log2) / d[name]['Healthy'].apply(log2))
            

Breast Cancer
Colorectal Cancer
Lung Cancer
Pancreatic Cancer
Prostate Cancer


In [9]:
d['breastcancer'].reset_index()

Cancer,uniprot,Breast Cancer,Healthy,pvalue,L2F
0,A0A075B6H9,1.318714e+04,3.932572e+04,0.021226,0.896723
1,A0A075B6I0,3.631717e+04,1.762420e+05,0.004225,0.869237
2,A0A075B6I1,7.822352e+03,1.490596e+04,0.054360,0.932902
3,A0A075B6I7,4.667722e+03,7.371171e+03,0.248206,0.948693
4,A0A075B6J1,4.698463e+03,1.183982e+04,0.037755,0.901460
...,...,...,...,...,...
2725,Q9Y6L6,4.778257e+03,5.230566e+03,0.339706,0.989437
2726,Q9Y6N7,2.217552e+04,2.968121e+04,0.041360,0.971692
2727,Q9Y6R7,5.230733e+06,4.056345e+06,0.118213,1.016711
2728,Q9Y6Y9,1.095217e+04,1.094369e+04,0.995484,1.000083


In [10]:
print(d['breastcancer'])
print(d['lungcancer'])

Cancer      Breast Cancer       Healthy    pvalue       L2F
uniprot                                                    
A0A075B6H9   1.318714e+04  3.932572e+04  0.021226  0.896723
A0A075B6I0   3.631717e+04  1.762420e+05  0.004225  0.869237
A0A075B6I1   7.822352e+03  1.490596e+04  0.054360  0.932902
A0A075B6I7   4.667722e+03  7.371171e+03  0.248206  0.948693
A0A075B6J1   4.698463e+03  1.183982e+04  0.037755  0.901460
...                   ...           ...       ...       ...
Q9Y6L6       4.778257e+03  5.230566e+03  0.339706  0.989437
Q9Y6N7       2.217552e+04  2.968121e+04  0.041360  0.971692
Q9Y6R7       5.230733e+06  4.056345e+06  0.118213  1.016711
Q9Y6Y9       1.095217e+04  1.094369e+04  0.995484  1.000083
Q9Y6Z7       1.646895e+05  1.536092e+05  0.709064  1.005832

[2730 rows x 4 columns]
Cancer       Lung Cancer       Healthy    pvalue       L2F
uniprot                                                   
A0A075B6H9  1.916623e+04  3.932572e+04  0.115261  0.932065
A0A075B6I0  2.6958

In [11]:
import joblib
joblib.dump(d, 'volcanoplot_data')

['volcanoplot_data']

In [16]:

dash_bio.VolcanoPlot(
    dataframe=d['breastcancer'].reset_index(),
    effect_size='L2F',
    p='pvalue',
    snp = 'uniprot',
    gene=None,
    logp=True)


In [14]:

dash_bio.VolcanoPlot(
    dataframe=d['colorectalcancer'].reset_index(),
    effect_size='L2F',
    p='pvalue',
    snp = 'uniprot',
    gene=None,
    logp=True)


In [18]:

dash_bio.VolcanoPlot(
    dataframe=d['pancreaticcancer'].reset_index(),
    effect_size='L2F',
    p='pvalue',
    snp = 'uniprot',
    gene=None,
    logp=True)


In [17]:

dash_bio.VolcanoPlot(
    dataframe=d['lungcancer'].reset_index(),
    effect_size='L2F',
    p='pvalue',
    snp = 'uniprot',
    gene=None,
    logp=True)


In [19]:

dash_bio.VolcanoPlot(
    dataframe=d['prostatecancer'].reset_index(),
    effect_size='L2F',
    p='pvalue',
    snp = 'uniprot',
    gene=None,
    logp=True)


In [19]:
for frames in d:
    print(frames)

breastcancer
colorectalcancer
lungcancer
pancreaticcancer
prostatecancer
