In [1]:
import os
import pickle as pkl

from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from pydeseq2.utils import load_example_data

import pandas as pd
import numpy as np

# Load data

In [2]:
data_file = 'GSE152800_exons.counts.tsv'
df_count = pd.read_csv(data_file, sep='\t', index_col=0)
count_df = df_count.transpose()
count_df.head()

Geneid,ENSMUSG00000051951.5,ENSMUSG00000102851.1,ENSMUSG00000103377.1,ENSMUSG00000104017.1,ENSMUSG00000103025.1,ENSMUSG00000103201.1,ENSMUSG00000103147.1,ENSMUSG00000103161.1,ENSMUSG00000102331.1,ENSMUSG00000102348.1,...,ENSMUSG00000064363.1,ENSMUSG00000064364.1,ENSMUSG00000064365.1,ENSMUSG00000064366.1,ENSMUSG00000064367.1,ENSMUSG00000064368.1,ENSMUSG00000064369.1,ENSMUSG00000064370.1,ENSMUSG00000064371.1,ENSMUSG00000064372.1
MBD725,6901,121,1443,1087,282,612,468,1679,1771,57,...,130517,35,30,27,195867,29760,876,298981,70,1282
MBD726,5073,89,977,723,281,535,302,1258,1273,50,...,68153,10,26,12,107045,16756,311,172418,41,755
MBD731,5072,77,1091,909,322,504,308,1342,1310,38,...,98206,13,19,6,145749,21671,249,237163,28,977
MBD732,5675,94,1131,969,363,565,366,1329,1455,47,...,113934,20,33,17,176437,26418,332,271334,50,1055
MBD743,4195,84,816,717,178,393,220,836,814,11,...,96585,25,56,17,160838,24645,286,213127,33,699


In [3]:
sample_info = pd.read_csv('sample_info.csv')
sample_info.head()

Unnamed: 0,sample,age,genotype
0,MBD725,15,MM2
1,MBD726,15,WT
2,MBD731,15,MM2
3,MBD732,15,WT
4,MBD743,12,WT


In [4]:
count_df['sample_name'] = count_df.index.values

count_df_ko = count_df[count_df['sample_name'].str.startswith('MM')].copy()
count_df_ko.drop(columns='sample_name', inplace=True)

count_df_mm2 = count_df[count_df['sample_name'].str.startswith('MBD')].copy()
count_df_mm2.drop(columns='sample_name', inplace=True)

In [5]:
# gene ID to gene
idmap = pd.read_excel('idmap.xlsx', index_col=0)

id_to_gene = {gene_id:gene_symbol for (gene_id, gene_symbol) in 
             zip(idmap.index.values, idmap['symbol'].values)}

# Calculate DEG for KO

In [6]:
clinical_df_ko = sample_info[sample_info['sample'].str.startswith('MM')].copy()
clinical_df_ko = clinical_df_ko.loc[:, ['sample', 'genotype']].copy()
clinical_df_ko.set_index('sample', inplace=True)
clinical_df_ko

Unnamed: 0_level_0,genotype
sample,Unnamed: 1_level_1
MM180,KO
MM181,KO
MM360,WT
MM361,WT
MM362,WT
MM363,KO
MM464,KO


In [7]:
### filter by gene count
count_thresh = 100
gene_average = np.average(count_df_ko, axis=0)
sel_genes = count_df_ko.columns[np.where(gene_average>=count_thresh)]
count_df_filtered = count_df_ko.loc[:, sel_genes]

dds = DeseqDataSet(
            counts=count_df_filtered,
            clinical=clinical_df_ko,
            design_factors="genotype",  # compare samples based on the "condition"
            refit_cooks=True,
            n_cpus=16,
        )

dds.deseq2()
stat_res = DeseqStats(dds, n_cpus=16)

stat_res.summary()
result_df = stat_res.results_df

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 5.62 seconds.

Fitting dispersion trend curve...
... done in 3.97 seconds.

Fitting MAP dispersions...
... done in 6.54 seconds.

Fitting LFCs...
... done in 0.64 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 0.71 seconds.

Log2 fold change & Wald test p-value: genotype WT vs KO


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSMUSG00000051951.5,4596.516113,0.034803,0.076375,0.455683,0.648618,0.819599
ENSMUSG00000103377.1,877.362244,-0.097277,0.060957,-1.595841,0.110524,0.287010
ENSMUSG00000104017.1,769.458984,-0.009866,0.095561,-0.103249,0.917766,0.964744
ENSMUSG00000103025.1,235.795822,0.068847,0.131009,0.525513,0.599227,0.790626
ENSMUSG00000103201.1,407.578339,0.250048,0.153107,1.633156,0.102436,0.272549
...,...,...,...,...,...,...
ENSMUSG00000064367.1,165491.828125,-0.118229,0.045565,-2.594738,0.009466,0.048458
ENSMUSG00000064368.1,22839.111328,-0.193450,0.067116,-2.882303,0.003948,0.024492
ENSMUSG00000064369.1,699.643921,-0.480432,0.153563,-3.128569,0.001757,0.012877
ENSMUSG00000064370.1,204708.515625,0.010465,0.037681,0.277720,0.781228,0.896732


In [8]:
result_df['log2FoldChange'] = -result_df['log2FoldChange']

In [9]:
# calculate gene to rp correspondance
df_gene = pd.read_csv(r'../scRNA_Greenberg/gene_info.csv', index_col=0)
df_gene['position'] = (df_gene['start'] + df_gene['end'])/2

df_rp = pd.read_csv(r"E:\DNA_analysis\Postanalysis_MeCP2\radial_position\all_rp_bulk_600pts_MOp_data.csv", 
                    index_col=0)
df_rp['chr'] = df_rp['loci_name'].apply(lambda x: x.split('_')[0])
df_rp['start'] = df_rp['loci_name'].apply(lambda x: int(x.split('_')[1]))
df_rp['end'] = df_rp['loci_name'].apply(lambda x: int(x.split('_')[2]))
df_rp['position'] = (df_rp['start'] + df_rp['end'])/2

rps = []

for i, row in df_gene.iterrows():
    df = df_rp[df_rp['chr']==row['chr']].copy()
    if len(df)>0:
        df['distance'] = np.abs(df['position']-row['position'])
        df.sort_values('distance', inplace=True)
        if df['distance'].values[0]>=3000000:
            rps.append(-1)
        else:
            rps.append(df['norm_RP'].values[0])
    else:
        rps.append(-1)
        
df_gene['radial_position'] = rps

gene_to_rp = {gene:rp for (gene, rp) in zip(df_gene['gene'].values, df_gene['radial_position'].values)}

In [10]:
result_df['gene_id'] = result_df.index.values
result_df['gene_id'] = result_df['gene_id'].apply(lambda x: x.split('.')[0])
result_df['gene_symbol'] = result_df['gene_id'].apply(lambda x: id_to_gene[x])

result_df['radial_position'] = result_df['gene_symbol'].apply(lambda x: gene_to_rp[x] if x in gene_to_rp.keys() else -1)
result_df = result_df[result_df.radial_position!=-1].copy()
result_df

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene_id,gene_symbol,radial_position
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSMUSG00000051951.5,4596.516113,-0.034803,0.076375,0.455683,0.648618,0.819599,ENSMUSG00000051951,Xkr4,0.816454
ENSMUSG00000102331.1,1054.419067,-0.064543,0.091055,0.708838,0.478425,0.702964,ENSMUSG00000102331,Gm19938,0.816454
ENSMUSG00000025902.13,137.862686,0.148995,0.234180,-0.636238,0.524621,0.739653,ENSMUSG00000025902,Sox17,0.816454
ENSMUSG00000033845.13,1123.310181,0.014854,0.032770,-0.453284,0.650344,0.820574,ENSMUSG00000033845,Mrpl15,0.816454
ENSMUSG00000025903.14,780.722717,0.097769,0.071962,-1.358608,0.174271,0.387185,ENSMUSG00000025903,Lypla1,0.816454
...,...,...,...,...,...,...,...,...,...
ENSMUSG00000087201.1,246.728470,0.031221,0.140724,-0.221858,0.824424,0.919110,ENSMUSG00000087201,Gm15261,0.813251
ENSMUSG00000031352.10,880.621155,0.080394,0.084533,-0.951033,0.341588,0.580268,ENSMUSG00000031352,Hccs,0.813251
ENSMUSG00000087159.7,186.388474,0.065399,0.171091,-0.382245,0.702279,0.854276,ENSMUSG00000087159,Gm15246,0.813251
ENSMUSG00000035299.16,1022.197205,0.982708,0.572896,-1.715334,0.086284,0.241777,ENSMUSG00000035299,Mid1,0.797202


In [11]:
result_df.to_csv(r'resources\Tillotson_KO_deg_rp.csv')

# Calculate DEG for MM2

In [12]:
clinical_df_mm2 = sample_info[sample_info['sample'].str.startswith('MBD')].copy()
clinical_df_mm2 = clinical_df_mm2.loc[:, ['sample', 'genotype']].copy()
clinical_df_mm2.set_index('sample', inplace=True)
clinical_df_mm2

Unnamed: 0_level_0,genotype
sample,Unnamed: 1_level_1
MBD725,MM2
MBD726,WT
MBD731,MM2
MBD732,WT
MBD743,WT
MBD744,MM2
MBD745,WT
MBD746,MM2
MBD755,WT
MBD756,MM2


In [13]:
### filter by gene count
count_thresh = 100
gene_average = np.average(count_df_mm2, axis=0)
sel_genes = count_df_mm2.columns[np.where(gene_average>=count_thresh)]
count_df_filtered = count_df_mm2.loc[:, sel_genes]

dds = DeseqDataSet(
            counts=count_df_filtered,
            clinical=clinical_df_mm2,
            design_factors="genotype",  # compare samples based on the "condition"
            refit_cooks=True,
            n_cpus=16,
        )

dds.deseq2()
stat_res = DeseqStats(dds, n_cpus=16)

stat_res.summary()
result_df = stat_res.results_df

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 5.79 seconds.

Fitting dispersion trend curve...
... done in 3.83 seconds.

Fitting MAP dispersions...
... done in 6.54 seconds.

Fitting LFCs...
... done in 0.64 seconds.

Refitting 3 outliers.

Fitting dispersions...
... done in 0.00 seconds.

Fitting MAP dispersions...
... done in 0.02 seconds.

Fitting LFCs...
... done in 0.00 seconds.

Running Wald tests...
... done in 0.65 seconds.

Log2 fold change & Wald test p-value: genotype WT vs MM2


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSMUSG00000051951.5,4090.121826,-0.053989,0.041385,-1.304546,0.192047,0.472019
ENSMUSG00000103377.1,843.138550,-0.056735,0.042888,-1.322844,0.185887,0.463352
ENSMUSG00000104017.1,691.141968,-0.083131,0.051945,-1.600369,0.109517,0.339696
ENSMUSG00000103025.1,220.277527,-0.099667,0.113323,-0.879494,0.379133,0.683098
ENSMUSG00000103201.1,363.525726,0.022826,0.097988,0.232941,0.815807,0.936834
...,...,...,...,...,...,...
ENSMUSG00000064367.1,135905.953125,0.075661,0.102571,0.737645,0.460730,0.748575
ENSMUSG00000064368.1,19861.871094,0.096163,0.105602,0.910618,0.362497,0.668473
ENSMUSG00000064369.1,434.716187,-0.111393,0.309667,-0.359719,0.719057,0.900577
ENSMUSG00000064370.1,175136.187500,0.080332,0.062284,1.289767,0.197132,0.479197


In [14]:
result_df['log2FoldChange'] = -result_df['log2FoldChange']

result_df['gene_id'] = result_df.index.values
result_df['gene_id'] = result_df['gene_id'].apply(lambda x: x.split('.')[0])
result_df['gene_symbol'] = result_df['gene_id'].apply(lambda x: id_to_gene[x] if x in id_to_gene.keys() else 'Not_available')

result_df['radial_position'] = result_df['gene_symbol'].apply(lambda x: gene_to_rp[x] if x in gene_to_rp.keys() else -1)
result_df = result_df[result_df.radial_position!=-1].copy()
result_df

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene_id,gene_symbol,radial_position
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSMUSG00000051951.5,4090.121826,0.053989,0.041385,-1.304546,0.192047,0.472019,ENSMUSG00000051951,Xkr4,0.816454
ENSMUSG00000102331.1,954.116760,0.116306,0.084319,-1.379358,0.167784,0.436185,ENSMUSG00000102331,Gm19938,0.816454
ENSMUSG00000025902.13,126.702423,-0.193505,0.154297,1.254112,0.209801,0.498010,ENSMUSG00000025902,Sox17,0.816454
ENSMUSG00000033845.13,955.503296,-0.057657,0.038656,1.491560,0.135814,0.384387,ENSMUSG00000033845,Mrpl15,0.816454
ENSMUSG00000025903.14,636.016479,-0.067686,0.083605,0.809586,0.418178,0.715875,ENSMUSG00000025903,Lypla1,0.816454
...,...,...,...,...,...,...,...,...,...
ENSMUSG00000087201.1,239.363037,0.320776,0.132009,-2.429962,0.015100,0.086485,ENSMUSG00000087201,Gm15261,0.813251
ENSMUSG00000031352.10,794.040833,-0.023109,0.079700,0.289954,0.771852,0.919911,ENSMUSG00000031352,Hccs,0.813251
ENSMUSG00000087159.7,148.259903,0.144424,0.073435,-1.966687,0.049219,0.201466,ENSMUSG00000087159,Gm15246,0.813251
ENSMUSG00000035299.16,1377.652344,0.110472,0.369593,-0.298902,0.765015,0.917703,ENSMUSG00000035299,Mid1,0.797202


In [15]:
result_df.to_csv(r'resources\Tillotson_MM2_deg_with_rp.csv')