In [176]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dash_bio

In [183]:
#create pandas dataframe
path = "./RawData/tidy.csv"
pathMet = "./RawData/metadata.csv"
tidy = pd.read_csv(path, sep=",")
tidyMet = pd.read_csv(pathMet, sep=";", index_col=0)

#remove samples which are not in the metadata index column (quality controle etc)
tidy = tidy[ (tidy["R.FileName"].isin(tidyMet.index)) ]
tidyMer    = pd.merge(tidy, tidyMet, how="left", on="R.FileName")
tidySub = tidyMer[["R.FileName", "uniprot", "meanAbu", "Cancer"]]
tidySub.Cancer.value_counts()

Breast Cancer        84630
Lung Cancer          81900
Pancreatic Cancer    81900
Colorectal Cancer    81900
Healthy              81900
Prostate Cancer      81900
Name: Cancer, dtype: int64

In [185]:
tidySub.head()

Unnamed: 0,R.FileName,uniprot,meanAbu,Cancer
0,J_D201211_MDIA_P705_SExp01-BGSID-1_R01,A0A075B6H9,11601.398438,Lung Cancer
1,J_D201211_MDIA_P705_SExp01-BGSID-1_R01,A0A075B6I0,3827.466797,Lung Cancer
2,J_D201211_MDIA_P705_SExp01-BGSID-1_R01,A0A075B6I1,114115.796875,Lung Cancer
3,J_D201211_MDIA_P705_SExp01-BGSID-1_R01,A0A075B6I7,4293.524902,Lung Cancer
4,J_D201211_MDIA_P705_SExp01-BGSID-1_R01,A0A075B6J1,5969.785645,Lung Cancer


In [191]:
tidySub[(tidySub.uniprot == 'A0A075B6H9')&(tidySub.Cancer == 'Healthy')].head()

Unnamed: 0,R.FileName,uniprot,meanAbu,Cancer
19110,J_D201211_MDIA_P705_SExp01-BGSID-105_R01,A0A075B6H9,168020.15625,Healthy
30030,J_D201211_MDIA_P705_SExp01-BGSID-109_R01,A0A075B6H9,9181.585938,Healthy
46410,J_D201211_MDIA_P705_SExp01-BGSID-114_R01,A0A075B6H9,5460.259277,Healthy
60060,J_D201211_MDIA_P705_SExp01-BGSID-119_R01,A0A075B6H9,36868.132812,Healthy
65520,J_D201211_MDIA_P705_SExp01-BGSID-13_R01,A0A075B6H9,4813.011719,Healthy


For the pvalues, we need data stored in a list for each protein and cancertype 

In [232]:
ValueDF = tidySub.groupby(['uniprot','Cancer'])['meanAbu'].apply(list)

In [235]:
ValueDF = ValueDF.reset_index()

In [249]:
ValueDF[(ValueDF.uniprot == 'A0A075B6H9') & (ValueDF.Cancer == 'Healthy')].meanAbu

2    [168020.15625, 9181.5859375, 5460.25927734375,...
Name: meanAbu, dtype: object

In [256]:
ValueDF

Unnamed: 0,uniprot,Cancer,meanAbu
0,A0A075B6H9,Breast Cancer,"[35277.67578125, 34095.4765625, 3512.60546875,..."
1,A0A075B6H9,Colorectal Cancer,"[38555.5703125, 37487.71875, 28599.8984375, 42..."
2,A0A075B6H9,Healthy,"[168020.15625, 9181.5859375, 5460.25927734375,..."
3,A0A075B6H9,Lung Cancer,"[11601.3984375, 4950.046875, 4343.33154296875,..."
4,A0A075B6H9,Pancreatic Cancer,"[14946.3359375, 4416.61767578125, 30662.833984..."
...,...,...,...
16375,Q9Y6Z7,Colorectal Cancer,"[125766.78125, 101316.0546875, 92774.53125, 81..."
16376,Q9Y6Z7,Healthy,"[218686.546875, 66921.515625, 133171.328125, 7..."
16377,Q9Y6Z7,Lung Cancer,"[256844.078125, 99598.9375, 163073.59375, 1547..."
16378,Q9Y6Z7,Pancreatic Cancer,"[186635.421875, 242902.84375, 137199.046875, 5..."


In [269]:
from scipy import stats
x = ValueDF.iloc[2].meanAbu
y = ValueDF.iloc[0].meanAbu
f = stats.ttest_ind(x, y)

For the log2 values, we use MeanDF

In [201]:
MeanDF = tidySub.groupby(['uniprot','Cancer'])['meanAbu'].mean()

In [209]:
MeanDF = pd.DataFrame(MeanDF)
MeanDF = MeanDF.reset_index()
MeanDF['uniprot']

0        A0A075B6H9
1        A0A075B6H9
2        A0A075B6H9
3        A0A075B6H9
4        A0A075B6H9
            ...    
16375        Q9Y6Z7
16376        Q9Y6Z7
16377        Q9Y6Z7
16378        Q9Y6Z7
16379        Q9Y6Z7
Name: uniprot, Length: 16380, dtype: object

In [213]:
MeanDF[MeanDF.uniprot == 'A0A075B6H9']

Unnamed: 0,uniprot,Cancer,meanAbu
0,A0A075B6H9,Breast Cancer,13187.140263
1,A0A075B6H9,Colorectal Cancer,11832.107357
2,A0A075B6H9,Healthy,39325.718099
3,A0A075B6H9,Lung Cancer,19166.234025
4,A0A075B6H9,Pancreatic Cancer,30563.667741
5,A0A075B6H9,Prostate Cancer,12818.901481


In [219]:
pivMeanDF = MeanDF.pivot_table(values='meanAbu', index=['uniprot'], columns='Cancer')

In [216]:
MeanDF = MeanDF.reset_index()

In [225]:
pivMeanDF = pivMeanDF.reset_index()

In [228]:
breastcancer = pd.DataFrame(pivMeanDF[['uniprot', 'Breast Cancer', 'Healthy']]
                            )

In [283]:
ValueDF[(ValueDF.uniprot == 'A0A075B6H9') & (ValueDF.Cancer == 'Breast Cancer')]
ValueDF[(ValueDF.uniprot == 'A0A075B6H9') & (ValueDF.Cancer == 'Healthy')]

Unnamed: 0,uniprot,Cancer,meanAbu
2,A0A075B6H9,Healthy,"[168020.15625, 9181.5859375, 5460.25927734375,..."


In [315]:
len(x[:30])

30

In [328]:
stats.ttest_ind(x[:30], y).pvalue

0.019731415599478077

In [387]:
breastcancer = breastcancer.set_index('uniprot')

In [364]:
masks = (breastcancer['uniprot'] == 'A0A075B6H9')
breastcancer.loc[masks].pvalue.mask(masks, 1)
print(breastcancer.loc[masks].pvalue.values)

[None]


In [389]:
breastcancer.at['A0A075B6H9', 'pvalue'] = 'False'

In [393]:
from scipy import stats

x = ValueDF[(ValueDF.uniprot == protein) & (ValueDF.Cancer == 'Breast Cancer')].meanAbu.values[0]
y = ValueDF[(ValueDF.uniprot == protein) & (ValueDF.Cancer == 'Healthy')].meanAbu.values[0]
print(y)
breastcancer.at[protein, 'pvalue']  = stats.ttest_ind(x, y).pvalue

[168020.15625, 9181.5859375, 5460.25927734375, 36868.1328125, 4813.01171875, 3942.99755859375, 3867.3271484375, 6215.5244140625, 5632.58642578125, 4430.26953125, 4153.984375, 18559.1953125, 4144.66064453125, 9355.5283203125, 5622.3095703125, 5443.46875, 21316.87109375, 38519.265625, 16850.087890625, 102725.84375, 8596.5517578125, 2620.28100585938, 4085.49926757813, 54102.7109375, 63219.26171875, 89984.1875, 54763.09765625, 273791.625, 42764.18359375, 110721.078125]


In [397]:
for protein in breastcancer.index:
    x = ValueDF[(ValueDF.uniprot == protein) & (ValueDF.Cancer == 'Breast Cancer')].meanAbu.values[0]
    y = ValueDF[(ValueDF.uniprot == protein) & (ValueDF.Cancer == 'Healthy')].meanAbu.values[0]
    breastcancer.at[protein, 'pvalue']  = stats.ttest_ind(x, y).pvalue.float()

In [440]:
#type(stats.ttest_ind(x, y).pvalue)
type(breastcancer.at['A0A075B6H9', 'pvalue'])

numpy.float64

In [398]:
breastcancer.head()

Cancer,Breast Cancer,Healthy,L2F,pvalue
uniprot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A0A075B6H9,13187.140263,39325.718099,1.115172,0.021226
A0A075B6I0,36317.173757,176242.021566,1.150434,0.004225
A0A075B6I1,7822.352153,14905.960042,1.071924,0.05436
A0A075B6I7,4667.722373,7371.170549,1.054082,0.248206
A0A075B6J1,4698.46345,11839.818555,1.109312,0.037755


In [None]:
from math import log2
breastcancer['L2F'] = (breastcancer['Healthy'].apply(log2) / breastcancer['Breast Cancer'].apply(log2))

In [277]:
cancer

Unnamed: 0_level_0,Healthy,Cancer,L2F
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A0A075B6H9,3.932572e+04,1.751361e+04,1.082788
A0A075B6I0,1.762420e+05,3.702229e+04,1.148331
A0A075B6I1,1.490596e+04,1.110723e+04,1.031579
A0A075B6I7,7.371171e+03,4.779749e+03,1.051131
A0A075B6J1,1.183982e+04,4.709529e+03,1.109004
...,...,...,...
Q9Y6L6,5.230566e+03,4.947878e+03,1.006531
Q9Y6N7,2.968121e+04,2.049143e+04,1.037320
Q9Y6R7,4.056345e+06,4.759298e+06,0.989606
Q9Y6Y9,1.094369e+04,1.141427e+04,0.995494


In [25]:
cancer.L2F

variable
A0A075B6H9    1.082788
A0A075B6I0    1.148331
A0A075B6I1    1.031579
A0A075B6I7    1.051131
A0A075B6J1    1.109004
                ...   
Q9Y6L6        1.006531
Q9Y6N7        1.037320
Q9Y6R7        0.989606
Q9Y6Y9        0.995494
Q9Y6Z7        1.004126
Name: L2F, Length: 2730, dtype: float64

In [65]:
cancer.head()

Unnamed: 0_level_0,Healthy,Cancer,L2F
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A0A075B6H9,39325.718099,17513.610173,1.082788
A0A075B6I0,176242.021566,37022.292504,1.148331
A0A075B6I1,14905.960042,11107.229688,1.031579
A0A075B6I7,7371.170549,4779.749079,1.051131
A0A075B6J1,11839.818555,4709.528706,1.109004


In [108]:
# test if we only have two rows per protein
bdata[bdata['variable']=='A0A075B6H9']

Unnamed: 0,HasCancer,variable,value
0,False,A0A075B6H9,39325.718099
2730,True,A0A075B6H9,17484.958055


In [113]:
bdata.head()

Cancer         HasCancer  variable  
Breast Cancer  True       A0A075B6H9    13187.140263
                          A0A075B6I0    36317.173757
                          A0A075B6I1     7822.352153
                          A0A075B6I7     4667.722373
                          A0A075B6J1     4698.463450
Name: value, dtype: float64

In [110]:
bdata = data.melt(id_vars='Cancer')
bdata['HasCancer'] = bdata['Cancer']!='Healthy'

In [112]:
bdata = bdata.groupby(['Cancer','HasCancer', 'variable'])['value'].mean()

In [116]:
bdata=bdata.reset_index()
bdata = bdata.groupby(['HasCancer', 'variable'])['value'].apply(list)

In [122]:
bdata = bdata.reset_index()

In [124]:
bdata[bdata.variable=='A0A075B6H9']

Unnamed: 0,HasCancer,variable,value
0,False,A0A075B6H9,[39325.718098958336]
2730,True,A0A075B6H9,"[13187.140262726814, 11832.107356770834, 19166..."


In [53]:
del bdata['Cancer']

In [126]:
bdata = bdata[['variable', 'HasCancer', 'value']]

In [128]:
bdata[bdata.variable=='A0A075B6I1']

Unnamed: 0,variable,HasCancer,value
2,A0A075B6I1,False,[14905.960042317709]
2732,A0A075B6I1,True,"[7822.352153162802, 9022.135489908855, 11532.6..."


In [None]:
bdata.pivot_table(index= 'variable',
                   columns='HasCancer',
                   values='value')

In [142]:
from scipy import stats
print(stats.ttest_1samp([int(cancer.iloc[0][0])], popmean=int(cancer.iloc[0][1])))

Ttest_1sampResult(statistic=nan, pvalue=nan)


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


In [414]:
from math import log10
breastcancer.pvalue.apply((log10))
breastcancer.pvalue.multiply(-1)

uniprot
A0A075B6H9   -0.021226
A0A075B6I0   -0.004225
A0A075B6I1    -0.05436
A0A075B6I7   -0.248206
A0A075B6J1   -0.037755
                ...   
Q9Y6L6       -0.339706
Q9Y6N7        -0.04136
Q9Y6R7       -0.118213
Q9Y6Y9       -0.995484
Q9Y6Z7       -0.709064
Name: pvalue, Length: 2730, dtype: object

In [444]:
breastcancer.pvalue.values

array([0.021226129911409028, 0.004225028876737513, 0.05435961919879539,
       ..., 0.11821295573629942, 0.9954835192088747, 0.7090635563859037],
      dtype=object)

In [None]:
from bioinfokit import analys, visuz
visuz.GeneExpression.volcano(df=indexres, lfc='L2F', pv='pvalue')

In [432]:
breastcancer.head()
type(breastcancer.pvalue[[1]])
breastcancer.pvalue[[1]]

uniprot
A0A075B6I0    0.004225
Name: pvalue, dtype: object

In [458]:
breastcancer = breastcancer.reset_index()

In [467]:
import joblib
joblib.dump(breastcancer, 'breastcancerdataframe')

['breastcancerdataframe']

In [468]:
dash_bio.VolcanoPlot(
    dataframe=breastcancer,
    effect_size='L2F',
    p='pvalue',
    snp = 'uniprot',
    gene=None
)

In [454]:
breastcancer = breastcancer.astype({"pvalue": float}, errors='raise') 

In [456]:
breastcancer.dtypes

Cancer
Breast Cancer    float64
Healthy          float64
L2F              float64
pvalue           float64
dtype: object