In [1]:
import os
import pickle as pkl

from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from pydeseq2.utils import load_example_data

import pandas as pd
import numpy as np

# Load data

In [2]:
ko_file = 'GSE128178_10WT_10MeCP2_KO_whole_cell_RNAseq_exon_counts.txt'
df_count = pd.read_csv(ko_file, sep='\t', index_col=0)
count_df = df_count.transpose()
count_df

Gene_name,Xkr4,Rp1,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,Oprk1,Npbwr1,...,Gm20871,Gm20823,Gm20736,Gm20852,Ssty2,Gm20816,Gm20867,Gm20806,Gm20917,Gm20854
MeCP2_WT_1,863,3,48,511,360,193,392,3577,94,24,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_2,648,0,69,456,374,239,325,3117,136,33,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_3,887,2,61,549,409,279,346,3547,167,41,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_4,741,0,43,570,287,217,352,3023,101,19,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_5,788,1,46,499,291,222,311,2930,115,35,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_6,723,1,69,478,299,230,298,2828,146,17,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_7,703,3,68,607,323,188,366,3016,92,23,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_8,827,0,83,566,325,194,376,3354,117,28,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_9,739,1,77,566,286,212,367,3041,102,27,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_10,886,1,63,507,347,270,416,3580,143,39,...,0,0,0,0,0,0,0,0,0,0


In [3]:
samples = count_df.index.values

genotype = []
for sample in samples:
    genotype.append(sample.split('_')[1])

clinical_df = pd.DataFrame({'sample':samples, 'genotype':genotype})
clinical_df.set_index('sample', inplace=True)
clinical_df

Unnamed: 0_level_0,genotype
sample,Unnamed: 1_level_1
MeCP2_WT_1,WT
MeCP2_WT_2,WT
MeCP2_WT_3,WT
MeCP2_WT_4,WT
MeCP2_WT_5,WT
MeCP2_WT_6,WT
MeCP2_WT_7,WT
MeCP2_WT_8,WT
MeCP2_WT_9,WT
MeCP2_WT_10,WT


# Calculate DEG

In [6]:
# generate count df in terms of counts-per-million
count_df_cpm = count_df.div(count_df.sum(axis=1), axis=0)*1000000
count_df_cpm.head()

Gene_name,Xkr4,Rp1,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,Oprk1,Npbwr1,...,Gm20871,Gm20823,Gm20736,Gm20852,Ssty2,Gm20816,Gm20867,Gm20806,Gm20917,Gm20854
MeCP2_WT_1,46.206309,0.160624,2.569992,27.359703,19.274938,10.333508,20.988266,191.517924,5.0329,1.284996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MeCP2_WT_2,39.070684,0.0,4.160304,27.494185,22.550056,14.41033,19.595637,187.937228,8.20002,1.989711,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MeCP2_WT_3,46.708927,0.105319,3.212226,28.910035,21.537713,14.691985,18.220168,186.783048,8.794127,2.159037,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MeCP2_WT_4,38.887223,0.0,2.256613,29.913248,15.061583,11.388026,18.472743,158.645175,5.300418,0.997108,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MeCP2_WT_5,44.086323,0.055947,2.573567,27.917608,16.280609,12.420259,17.399551,163.925034,6.433918,1.958149,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# filter out lowly expressed genes.
# a gene is expressed if it has >=1 cpm count in all collected samples
sel_genes = []
for col in count_df_cpm.columns:
    gene_counts = count_df_cpm[col].values
    if np.count_nonzero(gene_counts>=1)>=len(count_df_cpm):
        sel_genes.append(col)
print(len(sel_genes))

13071


In [11]:
### filter by gene count
count_df_filtered = count_df.loc[:, sel_genes]

In [12]:
dds = DeseqDataSet(
            counts=count_df_filtered,
            clinical=clinical_df,
            design_factors="genotype",  # compare samples based on the "condition"
            refit_cooks=True,
            n_cpus=16,
        )

In [13]:
dds.deseq2()
stat_res = DeseqStats(dds, n_cpus=16)

stat_res.summary()
result_df = stat_res.results_df

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 4.74 seconds.

Fitting dispersion trend curve...
... done in 2.90 seconds.

Fitting MAP dispersions...
... done in 5.11 seconds.

Fitting LFCs...
... done in 0.54 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 0.55 seconds.

Log2 fold change & Wald test p-value: genotype WT vs KO


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Xkr4,765.814575,0.061628,0.046262,1.332174,0.182803,0.356529
Sox17,67.241524,-0.180470,0.121491,-1.485460,0.137422,0.294675
Mrpl15,542.504883,-0.054500,0.037004,-1.472822,0.140799,0.299883
Lypla1,343.346130,-0.094819,0.089846,-1.055349,0.291266,0.484800
Tcea1,231.288177,-0.070463,0.082619,-0.852864,0.393735,0.583637
...,...,...,...,...,...,...
Tmlhe,48.115120,-0.012826,0.092888,-0.138083,0.890175,0.943519
Kdm5d,332.390289,-0.112665,0.066207,-1.701715,0.088809,0.217138
Eif2s3y,152.142609,-0.246132,0.093660,-2.627931,0.008591,0.039326
Uty,378.127228,-0.080866,0.073088,-1.106416,0.268546,0.461006


In [14]:
result_df['log2FoldChange'] = -result_df['log2FoldChange']

# Add radial position

In [17]:
df_gene = pd.read_csv(r'resources/gene_info.csv', index_col=0)
df_gene['position'] = (df_gene['start'] + df_gene['end'])/2
df_gene.head()

Unnamed: 0,gene,chr,start,end,position
0,0610006L08Rik,chr7,74818817,74853813,74836315.0
1,0610007P14Rik,chr12,85815447,85824550,85819998.5
2,0610009B22Rik,chr11,51685385,51688874,51687129.5
3,0610009E02Rik,chr2,26445695,26459390,26452542.5
4,0610009L18Rik,chr11,120348677,120351190,120349933.5


In [18]:
df_rp = pd.read_csv(r"E:\DNA_analysis\Postanalysis_MeCP2\radial_position\all_rp_bulk_600pts_MOp_data.csv", 
                    index_col=0)
df_rp['chr'] = df_rp['loci_name'].apply(lambda x: x.split('_')[0])
df_rp['start'] = df_rp['loci_name'].apply(lambda x: int(x.split('_')[1]))
df_rp['end'] = df_rp['loci_name'].apply(lambda x: int(x.split('_')[2]))
df_rp['gene_position'] = (df_rp['start'] + df_rp['end'])/2
df_rp.head()

Unnamed: 0,loci_name,norm_RP,chr,start,end,gene_position
0,chr1_3742742_3759944,0.816454,chr1,3742742,3759944,3751343.0
1,chr1_6245958_6258969,0.800965,chr1,6245958,6258969,6252463.5
2,chr1_8740008_8759916,0.816597,chr1,8740008,8759916,8749962.0
3,chr1_9627926_9637875,0.79397,chr1,9627926,9637875,9632900.5
4,chr1_9799472_9811359,0.786175,chr1,9799472,9811359,9805415.5


In [19]:
rps = []

for i, row in df_gene.iterrows():
    df = df_rp[df_rp['chr']==row['chr']].copy()
    if len(df)>0:
        df['distance'] = np.abs(df['gene_position']-row['position'])
        df.sort_values('distance', inplace=True)
        if df['distance'].values[0]>=3000000:
            rps.append(-1)
        else:
            rps.append(df['norm_RP'].values[0])
    else:
        rps.append(-1)
        
df_gene['radial_position'] = rps

In [20]:
gene_to_rp = {gene:rp for (gene, rp) in zip(df_gene['gene'].values, df_gene['radial_position'].values)}

result_df['gene'] = result_df.index.values
result_df['radial_position'] = result_df['gene'].apply(lambda x: gene_to_rp[x] if x in gene_to_rp.keys() else -1)
result_df

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene,radial_position
Gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Xkr4,765.814575,-0.061628,0.046262,1.332174,0.182803,0.356529,Xkr4,0.816454
Sox17,67.241524,0.180470,0.121491,-1.485460,0.137422,0.294675,Sox17,0.816454
Mrpl15,542.504883,0.054500,0.037004,-1.472822,0.140799,0.299883,Mrpl15,0.816454
Lypla1,343.346130,0.094819,0.089846,-1.055349,0.291266,0.484800,Lypla1,0.816454
Tcea1,231.288177,0.070463,0.082619,-0.852864,0.393735,0.583637,Tcea1,0.816454
...,...,...,...,...,...,...,...,...
Tmlhe,48.115120,0.012826,0.092888,-0.138083,0.890175,0.943519,Tmlhe,-1.000000
Kdm5d,332.390289,0.112665,0.066207,-1.701715,0.088809,0.217138,Kdm5d,-1.000000
Eif2s3y,152.142609,0.246132,0.093660,-2.627931,0.008591,0.039326,Eif2s3y,-1.000000
Uty,378.127228,0.080866,0.073088,-1.106416,0.268546,0.461006,Uty,-1.000000


In [21]:
result_df = result_df[result_df.radial_position!=-1].copy()
result_df.to_csv(r'resources\Boxer_Mecp2_KO_result_df_allRP.csv')
result_df

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene,radial_position
Gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Xkr4,765.814575,-0.061628,0.046262,1.332174,0.182803,0.356529,Xkr4,0.816454
Sox17,67.241524,0.180470,0.121491,-1.485460,0.137422,0.294675,Sox17,0.816454
Mrpl15,542.504883,0.054500,0.037004,-1.472822,0.140799,0.299883,Mrpl15,0.816454
Lypla1,343.346130,0.094819,0.089846,-1.055349,0.291266,0.484800,Lypla1,0.816454
Tcea1,231.288177,0.070463,0.082619,-0.852864,0.393735,0.583637,Tcea1,0.816454
...,...,...,...,...,...,...,...,...
Frmpd4,1590.203369,-0.044001,0.042223,1.042102,0.297364,0.491260,Frmpd4,0.813251
Msl3,310.976654,-0.036868,0.050208,0.734307,0.462762,0.644719,Msl3,0.813251
Arhgap6,130.857559,0.222379,0.084980,-2.616831,0.008875,0.040364,Arhgap6,0.813251
Hccs,271.296936,-0.030968,0.076085,0.407018,0.683995,0.815366,Hccs,0.813251
