In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, mannwhitneyu, hypergeom
import warnings
warnings.filterwarnings('ignore')

In [2]:
from rpy2 import robjects
from rpy2.robjects import Formula

from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.packages import importr

base = importr("base")
stats = importr("stats")
DESeq2 = importr("DESeq2")

# Все 35 трехпетельных белков человека 

In [3]:
upar = pd.read_csv('human_upar.tsv', index_col=0, sep='\t')
upar = set(upar['Approved symbol'])
print(len(upar))

35


# Функция подсчета дифференциальной экспрессии 

In [4]:
def deseq(meta, counts, formula, ref):
    meta["Tissue"] = stats.relevel(robjects.vectors.FactorVector(meta["Tissue"]), ref=ref)

    # Calculate normalization factors
    dds = DESeq2.DESeqDataSetFromMatrix(countData=counts, colData=meta, design=Formula(formula))
    dds = DESeq2.DESeq(dds)

    res = DESeq2.results(dds, name=f"Tissue_AD_vs_{ref}")
    res = DESeq2.lfcShrink(dds, coef=f"Tissue_AD_vs_{ref}", type="apeglm")
    res = pd.DataFrame(base.as_data_frame(res))
    res.index = counts.index
    res = res.sort_values("padj")
    res = res.loc[res["padj"] < 0.05]
    res = res.loc[res["log2FoldChange"].abs() > 0.5]

    return res

# Проект GSE159699 

In [5]:
counts = pd.read_csv('GSE159699/GSE159699_summary_count.star.txt', index_col=0, sep='\t').reindex(upar).dropna()
print(counts.shape)
counts

(33, 30)


Unnamed: 0_level_0,21-1A-AD,20-1T-AD,23-2A-AD,22-2T-AD,26-3A-AD,24-3T-AD,27-5A-AD,25-5T-AD,29-6T-AD,31-7T-AD,...,10-8A-Old,17-9A-Old,2-12A-Young,4-13A-Young,6-14A-Young,8-15A-Young,9-16A-Young,3-17T-Young,5-18T-Young,7-19T-Young
refGene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LY6E,693.0,1139.0,961.0,719.0,1111.0,908.0,819.0,527.0,1124.0,1082.0,...,1638.0,1278.0,2206.0,1965.0,1605.0,292.0,1785.0,404.0,643.0,1463.0
PSCA,8.0,12.0,9.0,18.0,5.0,4.0,5.0,7.0,14.0,22.0,...,11.0,10.0,10.0,13.0,3.0,7.0,8.0,10.0,16.0,16.0
LY6G6D,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,3.0,...,0.0,0.0,3.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0
LYPD2,0.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,5.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0
LY6G6E,4.0,0.0,2.0,2.0,0.0,0.0,3.0,6.0,5.0,6.0,...,0.0,2.0,1.0,1.0,0.0,0.0,1.0,4.0,1.0,1.0
CD177,69.0,20.0,88.0,5.0,5.0,4.0,75.0,62.0,19.0,31.0,...,7.0,17.0,20.0,25.0,74.0,5.0,31.0,7.0,25.0,65.0
LYPD8,18.0,12.0,11.0,7.0,7.0,8.0,10.0,1.0,10.0,19.0,...,92.0,9.0,13.0,19.0,17.0,1.0,22.0,1.0,5.0,15.0
LY6G5B,34.0,54.0,47.0,75.0,42.0,64.0,31.0,67.0,43.0,32.0,...,61.0,42.0,37.0,27.0,33.0,39.0,67.0,55.0,72.0,56.0
LY6K,1.0,3.0,4.0,10.0,3.0,1.0,2.0,5.0,5.0,3.0,...,2.0,2.0,11.0,7.0,4.0,3.0,9.0,3.0,1.0,0.0
PATE1,10.0,13.0,9.0,2.0,7.0,6.0,5.0,5.0,3.0,6.0,...,12.0,9.0,9.0,5.0,14.0,4.0,10.0,1.0,4.0,5.0


In [6]:
print('Генов белков, которых нет в проекте', set(upar) - set(counts.index))

Генов белков, которых нет в проекте {'LY6L', 'SLURP2'}


In [7]:
counts.columns.str.split('-').str[-1].value_counts()

AD       12
Old      10
Young     8
dtype: int64

In [8]:
old_df = counts.iloc[:, :22]
meta = pd.DataFrame({"Tissue": ['AD']*12 + ['Old']*10}, index=old_df.columns)
old = deseq(meta, old_df, "~ Tissue", 'Old')
old.columns = 'AD_Old_' + old.columns
old = old[['AD_Old_log2FoldChange', 'AD_Old_padj']]
old

R[write to console]: converting counts to integer mode

R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895



Unnamed: 0_level_0,AD_Old_log2FoldChange,AD_Old_padj
refGene,Unnamed: 1_level_1,Unnamed: 2_level_1
LYNX1,-0.627832,0.000355
PINLYP,0.729596,0.008559


In [9]:
young_df = pd.concat([counts.iloc[:, :12], counts.iloc[:, -8:]], axis=1)
meta = pd.DataFrame({"Tissue": ['AD']*12 + ['Young']*8}, index=young_df.columns)
young = deseq(meta, young_df, "~ Tissue", 'Young')
young.columns = 'AD_Young_' + young.columns
young = young[['AD_Young_log2FoldChange', 'AD_Young_padj']]
young

R[write to console]: converting counts to integer mode

R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895



Unnamed: 0_level_0,AD_Young_log2FoldChange,AD_Young_padj
refGene,Unnamed: 1_level_1,Unnamed: 2_level_1
LYNX1,-0.513935,0.042246
LY6G6F,1.481236,0.047474
SLURP1,1.136846,0.047474
LY6G6C,1.191178,0.049386
GPIHBP1,-1.145622,0.049386


In [10]:
alls_df = counts.copy()
meta = pd.DataFrame({"Tissue": ['AD']*12 + ['Normal']*18}, index=alls_df.columns)
alls = deseq(meta, alls_df, "~ Tissue", 'Normal')
alls.columns = 'AD_Young_Old_' + alls.columns
alls = alls[['AD_Young_Old_log2FoldChange', 'AD_Young_Old_padj']]
alls

R[write to console]: converting counts to integer mode

R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 1 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

R[write to console]: using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895



Unnamed: 0_level_0,AD_Young_Old_log2FoldChange,AD_Young_Old_padj
refGene,Unnamed: 1_level_1,Unnamed: 2_level_1
LYNX1,-0.579286,0.000472
PINLYP,0.651884,0.002305


In [11]:
control_df = counts.iloc[:, 12:]
meta = pd.DataFrame({"Tissue": ['AD']*10 + ['control']*8}, index=control_df.columns)
control = deseq(meta, control_df, "~ Tissue", 'control')
control.columns = 'Young_Old_' + control.columns
control = control[['Young_Old_log2FoldChange', 'Young_Old_padj']]
control

R[write to console]: converting counts to integer mode

R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895



Unnamed: 0_level_0,Young_Old_log2FoldChange,Young_Old_padj
refGene,Unnamed: 1_level_1,Unnamed: 2_level_1
GPIHBP1,-3.264609,0.000158


In [12]:
rv = hypergeom(35, 2, 33)
pval = sum([rv.pmf(i) for i in range(2)])
print("p-value on two sets intersection: {}".format(pval))

p-value on two sets intersection: 0.11260504201680659


In [13]:
counts.shape, len(counts)

((33, 30), 33)

In [14]:
result = alls.join([old, young, control], how='outer')
# result.to_csv('results/GSE159699_result.tsv', sep='\t')
result

Unnamed: 0_level_0,AD_Young_Old_log2FoldChange,AD_Young_Old_padj,AD_Old_log2FoldChange,AD_Old_padj,AD_Young_log2FoldChange,AD_Young_padj,Young_Old_log2FoldChange,Young_Old_padj
refGene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LYNX1,-0.579286,0.000472,-0.627832,0.000355,-0.513935,0.042246,,
PINLYP,0.651884,0.002305,0.729596,0.008559,,,,
LY6G6F,,,,,1.481236,0.047474,,
SLURP1,,,,,1.136846,0.047474,,
LY6G6C,,,,,1.191178,0.049386,,
GPIHBP1,,,,,-1.145622,0.049386,-3.264609,0.000158


# Проект http://aging.brain-map.org 

fpkm = pd.read_csv('atlas/fpkm_table_unnormalized.csv', index_col=0)
fpkm

In [15]:
# Загрузка дополнительных данных для разделения пациентов на группы
descr = pd.read_csv('atlas/DonorInformation.csv', index_col=0)
donor_id = pd.read_csv('atlas/columns-samples.csv', index_col=0)
genes = pd.read_csv('atlas/rows-genes.csv', index_col=0).gene_symbol.to_dict()

col = 'dsm_iv_clinical_diagnosis'
descr = descr[descr[col].isin(['No Dementia', "Alzheimer's Disease Type"])][col].to_dict()
donor_id['info'] = donor_id.donor_id.apply(descr.get)
donor_id = (donor_id['info'] + ' ' + donor_id.structure_acronym).to_dict()

In [16]:
fpkm = pd.read_csv('atlas/fpkm_table_normalized.csv', index_col=0)
fpkm.index = fpkm.index.to_series().apply(genes.get)
fpkm.columns = fpkm.columns.to_series().apply(lambda x: donor_id.get(int(x)))
list_col = fpkm.columns.value_counts().index.tolist()
fpkm = fpkm.reindex(upar).dropna()[list_col]
print(fpkm.shape)
fpkm

(33, 303)


Unnamed: 0_level_0,No Dementia TCx,No Dementia TCx,No Dementia TCx,No Dementia TCx,No Dementia TCx,No Dementia TCx,No Dementia TCx,No Dementia TCx,No Dementia TCx,No Dementia TCx,...,Alzheimer's Disease Type HIP,Alzheimer's Disease Type HIP,Alzheimer's Disease Type HIP,Alzheimer's Disease Type HIP,Alzheimer's Disease Type HIP,Alzheimer's Disease Type HIP,Alzheimer's Disease Type HIP,Alzheimer's Disease Type HIP,Alzheimer's Disease Type HIP,Alzheimer's Disease Type HIP
gene_id \ rnaseq_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LY6E,22.918028,27.240175,23.991783,27.453322,33.140609,32.492562,21.142665,33.265643,16.71051,22.758431,...,43.195189,37.067208,54.950306,23.891853,57.16227,25.881895,36.740634,53.764085,33.837886,32.681924
PSCA,0.269912,0.07409,0.136017,0.153479,0.051768,0.0,0.0,0.301602,0.0,0.233485,...,0.0,0.405437,0.112111,0.271024,0.094993,0.144299,0.24492,0.132796,0.083741,0.0
LY6G6D,0.0,0.0,0.0,0.0,0.187253,0.0,0.0,0.0,0.204145,0.0,...,0.177076,0.0,0.266843,0.0,0.0,0.441102,0.0,0.0,0.0,0.0
LYPD2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LY6G6E,0.0,0.183509,0.177435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.111791,0.0,0.0,0.0,0.100845,0.104409,0.0
CD177,0.071813,0.230368,0.429323,0.81277,0.76497,0.0,0.106813,0.204424,0.0,1.15757,...,0.372084,0.160759,0.180664,0.160328,0.658789,0.150611,0.045172,1.058162,0.165173,0.0
LYPD8,3.923255,4.854096,3.73229,2.661808,3.624684,2.641943,2.272594,4.398042,4.186323,3.105917,...,1.198632,2.353861,3.523385,1.276924,1.626367,2.442905,1.495296,3.066478,2.018164,2.237088
LY6G5B,2.663793,2.183498,2.142098,3.279771,2.257528,6.0673,1.974146,2.386403,1.399098,3.525474,...,1.188258,2.920764,1.178813,2.410148,2.657672,2.05411,2.508792,1.284646,2.034928,1.317973
LY6K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177126,0.0,0.0,...,0.229196,0.07467,0.185037,0.0,0.117071,0.120363,0.0,0.0,0.382235,0.094116
PATE1,0.257974,0.361722,0.242484,0.312346,0.329072,0.207149,0.394759,0.26128,0.453124,0.432517,...,0.0,0.126699,0.197645,0.191386,0.513541,0.104598,0.15448,0.27655,0.190553,0.277453


In [17]:
for name in fpkm.columns.str.split().str[-1].unique():
    for i in fpkm.index:
        print(i, name)
        data = fpkm.loc[i][[f"Alzheimer's Disease Type {name}", f"No Dementia {name}"]].reset_index()
        sns.histplot(data=data, x=i, #alpha=0.6,
                     hue='index', bins=10, stat='density',
                     kde=True)
        plt.tight_layout()
        plt.savefig(f'figures/{i}_{name}.png')
        plt.close()

LY6E TCx
PSCA TCx
LY6G6D TCx
LYPD2 TCx
LY6G6E TCx
CD177 TCx
LYPD8 TCx
LY6G5B TCx
LY6K TCx
PATE1 TCx
SPACA4 TCx
LY6D TCx
PINLYP TCx
PLAUR TCx
GML TCx
PATE4 TCx
LY6H TCx
LY6G5C TCx
SLURP1 TCx
LY6G6F TCx
LYPD4 TCx
PATE2 TCx
LYPD6 TCx
TEX101 TCx
GPIHBP1 TCx
LYPD1 TCx
CD59 TCx
LYPD3 TCx
PATE3 TCx
LYNX1 TCx
LY6G6C TCx
LYPD5 TCx
LYPD6B TCx
LY6E HIP
PSCA HIP
LY6G6D HIP
LYPD2 HIP
LY6G6E HIP
CD177 HIP
LYPD8 HIP
LY6G5B HIP
LY6K HIP
PATE1 HIP
SPACA4 HIP
LY6D HIP
PINLYP HIP
PLAUR HIP
GML HIP
PATE4 HIP
LY6H HIP
LY6G5C HIP
SLURP1 HIP
LY6G6F HIP
LYPD4 HIP
PATE2 HIP
LYPD6 HIP
TEX101 HIP
GPIHBP1 HIP
LYPD1 HIP
CD59 HIP
LYPD3 HIP
PATE3 HIP
LYNX1 HIP
LY6G6C HIP
LYPD5 HIP
LYPD6B HIP
LY6E FWM
PSCA FWM
LY6G6D FWM
LYPD2 FWM
LY6G6E FWM
CD177 FWM
LYPD8 FWM
LY6G5B FWM
LY6K FWM
PATE1 FWM
SPACA4 FWM
LY6D FWM
PINLYP FWM
PLAUR FWM
GML FWM
PATE4 FWM
LY6H FWM
LY6G5C FWM
SLURP1 FWM
LY6G6F FWM
LYPD4 FWM
PATE2 FWM
LYPD6 FWM
TEX101 FWM
GPIHBP1 FWM
LYPD1 FWM
CD59 FWM
LYPD3 FWM
PATE3 FWM
LYNX1 FWM
LY6G6C FWM
LYPD5 FWM
LYPD6B

In [18]:
print('Генов белков, которых нет в проекте', set(upar) - set(fpkm.index))

Генов белков, которых нет в проекте {'LY6L', 'SLURP2'}


In [19]:
# Группы пациентов по болезни и области мозга
pval=0.06
inds = fpkm.columns.value_counts()
inds

No Dementia TCx                 50
No Dementia HIP                 50
No Dementia FWM                 47
No Dementia PCx                 46
Alzheimer's Disease Type TCx    29
Alzheimer's Disease Type PCx    28
Alzheimer's Disease Type FWM    28
Alzheimer's Disease Type HIP    25
dtype: int64

In [20]:
descr = pd.read_csv('atlas/DonorInformation.csv', index_col=0)
descr.head()

Unnamed: 0_level_0,name,age,sex,apo_e4_allele,education_years,age_at_first_tbi,longest_loc_duration,cerad,num_tbi_w_loc,dsm_iv_clinical_diagnosis,control_set,nincds_arda_diagnosis,ever_tbi_w_loc,race,hispanic,act_demented,braak,nia_reagan
donor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
326765665,H14.09.078,87,M,N,16,0,Unknown or N/A,0,0,No Dementia,31,No Dementia,N,White,Not Hispanic,No Dementia,1,1
326765656,H14.09.069,95-99,M,N,17,12,1-2 min,2,1,No Dementia,26,No Dementia,Y,White,Not Hispanic,No Dementia,5,2
326765654,H14.09.067,85,M,Y,10,72,< 10 sec,3,1,Vascular,25,"Dementia, Type Unknown",Y,White,Not Hispanic,Dementia,4,2
467056391,H15.09.103,90-94,F,N,11,87,< 10 sec,0,1,No Dementia,52,No Dementia,Y,White,Not Hispanic,No Dementia,4,0
309335447,H14.09.010,100+,M,Y,16,0,Unknown or N/A,3,0,Alzheimer's Disease Type,28,Possible Alzheimer'S Disease,N,White,Not Hispanic,Dementia,4,2


In [21]:
pd.read_csv('atlas/columns-samples.csv', index_col=0)

Unnamed: 0_level_0,donor_id,donor_name,specimen_id,specimen_name,rna_well_id,polygon_id,structure_id,structure_acronym,structure_color,structure_name,hemisphere
rnaseq_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
488395315,309335467,H14.09.030,309357843,H14.09.030.TCx.01,395325172,320817998,10235,TCx,#ebbfd0,temporal neocortex,left
496100277,309335441,H14.09.004,309357624,H14.09.004.PCx.01,320630866,310967169,10557,FWM,#f2f1f0,white matter of forebrain,right
496100278,309335438,H14.09.001,309357596,H14.09.001.PCx.01,320630834,310790571,10557,FWM,#f2f1f0,white matter of forebrain,left
496100279,309335438,H14.09.001,309357599,H14.09.001.TCx.01,320630838,310790522,10235,TCx,#ebbfd0,temporal neocortex,left
496100281,309335439,H14.09.002,309357603,H14.09.002.HIP.01,320630842,310790372,10294,HIP,#bfb5d5,hippocampus (hippocampal formation),right
...,...,...,...,...,...,...,...,...,...,...,...
496100667,467056391,H15.09.103,467179071,H15.09.103.TCx.01,482655826,480366830,10235,TCx,#ebbfd0,temporal neocortex,right
496100669,467056391,H15.09.103,467179068,H15.09.103.PCx.01,482655822,480363830,10557,FWM,#f2f1f0,white matter of forebrain,right
496100670,467056406,H15.09.107,467179104,H15.09.107.TCx.01,482655780,480363840,10235,TCx,#ebbfd0,temporal neocortex,right
496100671,467056391,H15.09.103,467179065,H15.09.103.HIP.01,482655820,480366825,10294,HIP,#bfb5d5,hippocampus (hippocampal formation),right


# t-test 

In [22]:
df = fpkm[['No Dementia HIP', "Alzheimer's Disease Type HIP"]]
n = 50
ttest = pd.DataFrame([ttest_ind(df.loc[gene].iloc[:n], df.loc[gene].iloc[n:]) for gene in df.index], index=df.index, columns=["t-test", 'p-value'])
ttest[ttest['p-value'] < pval].sort_values('p-value')

Unnamed: 0_level_0,t-test,p-value
gene_id \ rnaseq_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PSCA,-2.375157,0.02017
PATE1,1.944189,0.055726


In [23]:
df = fpkm[['No Dementia TCx', "Alzheimer's Disease Type TCx"]]
n = 50
ttest = pd.DataFrame([ttest_ind(df.loc[gene].iloc[:n], df.loc[gene].iloc[n:]) for gene in df.index], index=df.index, columns=["t-test", 'p-value'])
ttest[ttest['p-value'] < pval].sort_values('p-value')

Unnamed: 0_level_0,t-test,p-value
gene_id \ rnaseq_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
TEX101,2.681051,0.008974
LYPD5,-2.47085,0.015689
SPACA4,-2.138316,0.035662
LY6K,-1.997161,0.049341
LY6G5B,1.977067,0.051614


In [24]:
df = fpkm[['No Dementia FWM', "Alzheimer's Disease Type FWM"]]
n = 47
ttest = pd.DataFrame([ttest_ind(df.loc[gene].iloc[:n], df.loc[gene].iloc[n:]) for gene in df.index], index=df.index, columns=["t-test", 'p-value'])
ttest[ttest['p-value'] < pval].sort_values('p-value')

Unnamed: 0_level_0,t-test,p-value
gene_id \ rnaseq_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
GML,-3.316151,0.001424
LYPD8,-2.48514,0.015241
PATE3,-2.46897,0.015891
LYPD5,-2.425325,0.017768
PATE4,-2.399699,0.018961
LY6G6F,-2.19693,0.0312
LYPD4,2.094322,0.039703
LY6G6E,-1.994868,0.049791
GPIHBP1,1.940284,0.056208


In [25]:
df = fpkm[['No Dementia PCx', "Alzheimer's Disease Type PCx"]]
n = 46
ttest = pd.DataFrame([ttest_ind(df.loc[gene].iloc[:n], df.loc[gene].iloc[n:]) for gene in df.index],
                     index=df.index, columns=["t-test", 'p-value'])
ttest[ttest['p-value'] < pval].sort_values('p-value')

Unnamed: 0_level_0,t-test,p-value
gene_id \ rnaseq_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
LYPD5,-2.429738,0.017605


# mannwhitneyu 

In [26]:
df = fpkm[['No Dementia HIP', "Alzheimer's Disease Type HIP"]]
n = 50
mw = pd.DataFrame([mannwhitneyu(df.loc[gene].iloc[:n], df.loc[gene].iloc[n:]) for gene in df.index],
                  index=df.index, columns=["mw_hip", 'p-value_hip'])
hip = mw[mw['p-value_hip'] < pval].sort_values('p-value_hip')
hip

Unnamed: 0_level_0,mw_hip,p-value_hip
gene_id \ rnaseq_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PSCA,447.0,0.021471
LYPD8,454.0,0.027666
PATE1,468.0,0.039295
LYPD1,472.0,0.043269


In [27]:
df = fpkm[['No Dementia TCx', "Alzheimer's Disease Type TCx"]]
n = 50
mw = pd.DataFrame([mannwhitneyu(df.loc[gene].iloc[:n], df.loc[gene].iloc[n:]) for gene in df.index],
                  index=df.index, columns=["mw_tcx", 'p-value_tcx'])
tcx = mw[mw['p-value_tcx'] < pval].sort_values('p-value_tcx')
tcx

Unnamed: 0_level_0,mw_tcx,p-value_tcx
gene_id \ rnaseq_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
TEX101,463.5,0.002637
LYPD5,486.0,0.007638
SPACA4,605.0,0.033923
LY6G5B,547.0,0.03551
LYNX1,555.0,0.042356
LYPD6,556.0,0.043282
LY6K,570.5,0.044648


In [28]:
df = fpkm[['No Dementia FWM', "Alzheimer's Disease Type FWM"]]
n = 47
mw = pd.DataFrame([mannwhitneyu(df.loc[gene].iloc[:n], df.loc[gene].iloc[n:]) for gene in df.index], index=df.index,
                  columns=["mw_fwm", 'p-value_fwm'])
fwm = mw[mw['p-value_fwm'] < pval].sort_values('p-value_fwm')
fwm

Unnamed: 0_level_0,mw_fwm,p-value_fwm
gene_id \ rnaseq_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
GML,414.0,0.000377
LYPD5,414.0,0.003824
LYPD8,437.0,0.007841
PATE3,439.0,0.008315
LY6G6E,479.0,0.009106
PATE4,451.0,0.011851
LYPD4,451.0,0.011851
LYNX1,453.0,0.012545
LY6G6F,524.0,0.015725
GPIHBP1,462.0,0.016117


In [29]:
df = fpkm[['No Dementia PCx', "Alzheimer's Disease Type PCx"]]
n = 46
mw = pd.DataFrame([mannwhitneyu(df.loc[gene].iloc[:n], df.loc[gene].iloc[n:]) for gene in df.index],
                  index=df.index, columns=["mw_pcx", 'p-value_pcx'])
pcx = mw[mw['p-value_pcx'] < pval].sort_values('p-value_pcx')
pcx

Unnamed: 0_level_0,mw_pcx,p-value_pcx
gene_id \ rnaseq_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
LYPD5,441.0,0.012005
GPIHBP1,496.0,0.050091


In [30]:
res.columns

NameError: name 'res' is not defined

In [None]:
res = hip.join([fwm, tcx, pcx], how='outer')
print(res.shape)
# res.to_csv('results/atlas_result.tsv', sep='\t')
cols = ['mw_hip', 'mw_fwm', 'mw_tcx', 'mw_pcx',
        'AD_Young_Old_log2FoldChange', 'AD_Young_Old_padj',
       'AD_Old_log2FoldChange', 'AD_Young_log2FoldChange',
       'Young_Old_log2FoldChange',
       'Young_Old_padj']
res = res.join(result, how='inner').drop(cols, axis=1)
res.index