In [1]:
import os
import pickle as pkl

from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from pydeseq2.utils import load_example_data

import pandas as pd
import numpy as np

# Load data

In [2]:
ko_file = 'GSE128178_10WT_10MeCP2_KO_whole_cell_RNAseq_exon_counts.txt'
df_count = pd.read_csv(ko_file, sep='\t', index_col=0)
count_df = df_count.transpose()
count_df

Gene_name,Xkr4,Rp1,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,Oprk1,Npbwr1,...,Gm20871,Gm20823,Gm20736,Gm20852,Ssty2,Gm20816,Gm20867,Gm20806,Gm20917,Gm20854
MeCP2_WT_1,863,3,48,511,360,193,392,3577,94,24,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_2,648,0,69,456,374,239,325,3117,136,33,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_3,887,2,61,549,409,279,346,3547,167,41,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_4,741,0,43,570,287,217,352,3023,101,19,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_5,788,1,46,499,291,222,311,2930,115,35,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_6,723,1,69,478,299,230,298,2828,146,17,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_7,703,3,68,607,323,188,366,3016,92,23,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_8,827,0,83,566,325,194,376,3354,117,28,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_9,739,1,77,566,286,212,367,3041,102,27,...,0,0,0,0,0,0,0,0,0,0
MeCP2_WT_10,886,1,63,507,347,270,416,3580,143,39,...,0,0,0,0,0,0,0,0,0,0


In [3]:
samples = count_df.index.values

genotype = []
for sample in samples:
    genotype.append(sample.split('_')[1])

clinical_df = pd.DataFrame({'sample':samples, 'genotype':genotype})
clinical_df.set_index('sample', inplace=True)
clinical_df

Unnamed: 0_level_0,genotype
sample,Unnamed: 1_level_1
MeCP2_WT_1,WT
MeCP2_WT_2,WT
MeCP2_WT_3,WT
MeCP2_WT_4,WT
MeCP2_WT_5,WT
MeCP2_WT_6,WT
MeCP2_WT_7,WT
MeCP2_WT_8,WT
MeCP2_WT_9,WT
MeCP2_WT_10,WT


# Calculate DEG

In [4]:
### filter by gene count
count_thresh = 100
gene_average = np.average(count_df, axis=0)
sel_genes = count_df.columns[np.where(gene_average>=count_thresh)]
count_df_filtered = count_df.loc[:, sel_genes]

In [5]:
dds = DeseqDataSet(
            counts=count_df_filtered,
            clinical=clinical_df,
            design_factors="genotype",  # compare samples based on the "condition"
            refit_cooks=True,
            n_cpus=16,
        )

In [6]:
dds.deseq2()
stat_res = DeseqStats(dds, n_cpus=16)

stat_res.summary()
result_df = stat_res.results_df

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 3.92 seconds.

Fitting dispersion trend curve...
... done in 2.57 seconds.

Fitting MAP dispersions...
... done in 4.32 seconds.

Fitting LFCs...
... done in 0.47 seconds.

Refitting 0 outliers.

Running Wald tests...
... done in 0.48 seconds.

Log2 fold change & Wald test p-value: genotype WT vs KO


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Xkr4,766.238342,0.064206,0.046337,1.385620,1.658631e-01,3.270039e-01
Mrpl15,542.768860,-0.051870,0.042414,-1.222941,2.213518e-01,3.976817e-01
Lypla1,343.567902,-0.092329,0.090516,-1.020035,3.077119e-01,4.937652e-01
Tcea1,231.435577,-0.067757,0.089919,-0.753530,4.511312e-01,6.285329e-01
Rgs20,318.898376,0.335675,0.043549,7.707946,1.278592e-14,1.111879e-12
...,...,...,...,...,...,...
Spry3,220.166351,0.124207,0.090891,1.366548,1.717670e-01,3.356937e-01
Kdm5d,332.571289,-0.110288,0.066622,-1.655428,9.783770e-02,2.256462e-01
Eif2s3y,152.227768,-0.243633,0.094224,-2.585685,9.718575e-03,4.178727e-02
Uty,378.359924,-0.078291,0.079218,-0.988303,3.230045e-01,5.099126e-01


In [7]:
result_df['log2FoldChange'] = -result_df['log2FoldChange']

# Add radial position

In [8]:
df_gene = pd.read_csv(r'../scRNA_Greenberg/gene_info.csv', index_col=0)
df_gene['position'] = (df_gene['start'] + df_gene['end'])/2
df_gene.head()

Unnamed: 0,gene,chr,start,end,position
0,0610006L08Rik,chr7,74818817,74853813,74836315.0
1,0610007P14Rik,chr12,85815447,85824550,85819998.5
2,0610009B22Rik,chr11,51685385,51688874,51687129.5
3,0610009E02Rik,chr2,26445695,26459390,26452542.5
4,0610009L18Rik,chr11,120348677,120351190,120349933.5


In [11]:
df_rp = pd.read_csv(r"E:\DNA_analysis\Postanalysis_MeCP2\radial_position\all_rp_bulk_600pts_MOp_data.csv", 
                    index_col=0)
df_rp['chr'] = df_rp['loci_name'].apply(lambda x: x.split('_')[0])
df_rp['start'] = df_rp['loci_name'].apply(lambda x: int(x.split('_')[1]))
df_rp['end'] = df_rp['loci_name'].apply(lambda x: int(x.split('_')[2]))
df_rp['gene_position'] = (df_rp['start'] + df_rp['end'])/2
df_rp.head()

Unnamed: 0,loci_name,norm_RP,chr,start,end,gene_position
0,chr1_3742742_3759944,0.816454,chr1,3742742,3759944,3751343.0
1,chr1_6245958_6258969,0.800965,chr1,6245958,6258969,6252463.5
2,chr1_8740008_8759916,0.816597,chr1,8740008,8759916,8749962.0
3,chr1_9627926_9637875,0.79397,chr1,9627926,9637875,9632900.5
4,chr1_9799472_9811359,0.786175,chr1,9799472,9811359,9805415.5


In [12]:
rps = []

for i, row in df_gene.iterrows():
    df = df_rp[df_rp['chr']==row['chr']].copy()
    if len(df)>0:
        df['distance'] = np.abs(df['gene_position']-row['position'])
        df.sort_values('distance', inplace=True)
        if df['distance'].values[0]>=3000000:
            rps.append(-1)
        else:
            rps.append(df['norm_RP'].values[0])
    else:
        rps.append(-1)
        
df_gene['radial_position'] = rps

In [13]:
gene_to_rp = {gene:rp for (gene, rp) in zip(df_gene['gene'].values, df_gene['radial_position'].values)}

result_df['gene'] = result_df.index.values
result_df['radial_position'] = result_df['gene'].apply(lambda x: gene_to_rp[x] if x in gene_to_rp.keys() else -1)
result_df

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene,radial_position
Gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Xkr4,766.238342,-0.064206,0.046337,1.385620,1.658631e-01,3.270039e-01,Xkr4,0.816454
Mrpl15,542.768860,0.051870,0.042414,-1.222941,2.213518e-01,3.976817e-01,Mrpl15,0.816454
Lypla1,343.567902,0.092329,0.090516,-1.020035,3.077119e-01,4.937652e-01,Lypla1,0.816454
Tcea1,231.435577,0.067757,0.089919,-0.753530,4.511312e-01,6.285329e-01,Tcea1,0.816454
Rgs20,318.898376,-0.335675,0.043549,7.707946,1.278592e-14,1.111879e-12,Rgs20,0.816454
...,...,...,...,...,...,...,...,...
Spry3,220.166351,-0.124207,0.090891,1.366548,1.717670e-01,3.356937e-01,Spry3,-1.000000
Kdm5d,332.571289,0.110288,0.066622,-1.655428,9.783770e-02,2.256462e-01,Kdm5d,-1.000000
Eif2s3y,152.227768,0.243633,0.094224,-2.585685,9.718575e-03,4.178727e-02,Eif2s3y,-1.000000
Uty,378.359924,0.078291,0.079218,-0.988303,3.230045e-01,5.099126e-01,Uty,-1.000000


In [14]:
result_df = result_df[result_df.radial_position!=-1].copy()
result_df.to_csv(r'resources\Boxer_Mecp2_KO_result_df_allRP.csv')
result_df

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene,radial_position
Gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Xkr4,766.238342,-0.064206,0.046337,1.385620,1.658631e-01,3.270039e-01,Xkr4,0.816454
Mrpl15,542.768860,0.051870,0.042414,-1.222941,2.213518e-01,3.976817e-01,Mrpl15,0.816454
Lypla1,343.567902,0.092329,0.090516,-1.020035,3.077119e-01,4.937652e-01,Lypla1,0.816454
Tcea1,231.435577,0.067757,0.089919,-0.753530,4.511312e-01,6.285329e-01,Tcea1,0.816454
Rgs20,318.898376,-0.335675,0.043549,7.707946,1.278592e-14,1.111879e-12,Rgs20,0.816454
...,...,...,...,...,...,...,...,...
Frmpd4,1591.109131,-0.046597,0.041763,1.115752,2.645281e-01,4.464316e-01,Frmpd4,0.813251
Msl3,311.147705,-0.039393,0.054687,0.720337,4.713178e-01,6.453367e-01,Msl3,0.813251
Arhgap6,130.925293,0.219666,0.086181,-2.548902,1.080625e-02,4.523304e-02,Arhgap6,0.813251
Hccs,271.465881,-0.033653,0.081761,0.411600,6.806328e-01,8.083837e-01,Hccs,0.813251
