In [1]:
import os
import pickle as pkl

from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from pydeseq2.utils import load_example_data

import pandas as pd
import numpy as np

# Load data

In [2]:
# Tab-separated exon-count table; the first column is used as the row index.
ko_file = 'GSE128178_10WT_10R306C_whole_cell_RNAseq_exon_counts.txt'
df_count = pd.read_csv(ko_file, index_col=0, sep='\t')

# Flip the table so that samples are the rows and genes are the columns
# (this is the orientation later handed to DeseqDataSet).
count_df = df_count.T
count_df

Gene_name,Xkr4,Rp1,Sox17,Mrpl15,Lypla1,Tcea1,Rgs20,Atp6v1h,Oprk1,Npbwr1,...,Gm20871,Gm20823,Gm20736,Gm20852,Ssty2,Gm20816,Gm20867,Gm20806,Gm20917,Gm20854
WT_1,698,1,41,502,180,123,290,2675,74,12,...,0,0,0,0,0,0,0,0,0,0
WT_2,578,1,36,419,233,117,234,2580,65,31,...,0,0,0,0,0,0,0,0,0,0
WT_3,728,2,66,601,278,176,360,3177,89,37,...,0,0,0,0,0,0,0,0,0,0
WT_4,765,1,64,562,266,169,335,3191,89,38,...,0,0,0,0,0,0,0,0,0,0
WT_5,906,2,62,527,304,195,265,2936,99,20,...,0,0,0,0,0,0,0,0,0,0
WT_6,718,2,34,480,241,138,244,2575,97,21,...,0,0,0,0,0,0,0,0,0,0
WT_7,868,1,53,568,351,208,387,3827,100,35,...,0,0,0,0,0,0,0,0,0,0
WT_8,932,3,61,525,317,210,336,3639,110,34,...,0,0,0,0,0,0,0,0,0,0
WT_9,873,1,73,490,325,223,363,3475,143,44,...,0,0,0,0,0,0,0,0,0,0
WT_10,1009,0,77,563,428,222,360,3690,145,35,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Sample labels have the form "<genotype>_<replicate>" (e.g. WT_1), so the
# text before the first underscore is taken as the genotype.
samples = count_df.index.values
genotype = [name.split('_')[0] for name in samples]

# One-column metadata table, indexed by sample label.
clinical_df = pd.DataFrame({'sample': samples, 'genotype': genotype}).set_index('sample')
clinical_df

Unnamed: 0_level_0,genotype
sample,Unnamed: 1_level_1
WT_1,WT
WT_2,WT
WT_3,WT
WT_4,WT
WT_5,WT
WT_6,WT
WT_7,WT
WT_8,WT
WT_9,WT
WT_10,WT


# Calculate DEG

In [4]:
# Keep only genes whose mean count over all samples reaches the threshold.
count_thresh = 100
gene_average = count_df.to_numpy().mean(axis=0)   # one mean per gene (column)
is_expressed = gene_average >= count_thresh
sel_genes = count_df.columns[is_expressed]
count_df_filtered = count_df.loc[:, sel_genes]

In [5]:
# Build the DESeq2 dataset from the filtered counts and the per-sample
# metadata table; "genotype" is the only design factor.
dds = DeseqDataSet(
    counts=count_df_filtered,
    clinical=clinical_df,
    design_factors="genotype",  # metadata column that defines the groups being compared
    refit_cooks=True,
    n_cpus=16,
)

In [6]:
# Run the DESeq2 fitting pipeline on dds (each step is logged below the cell).
dds.deseq2()
# Set up the statistics object. No contrast is passed here, so the direction
# of the comparison is left to pydeseq2; the summary header printed below
# reads "genotype WT vs R306C".
stat_res = DeseqStats(dds, n_cpus=16)

# Run the Wald tests and print the per-gene results table.
stat_res.summary()
# Plain assignment, no copy is requested here.
# NOTE(review): result_df is presumably the same object as
# stat_res.results_df, so later in-place edits would affect both — confirm.
result_df = stat_res.results_df

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 4.15 seconds.

Fitting dispersion trend curve...
... done in 2.60 seconds.

Fitting MAP dispersions...
... done in 4.58 seconds.

Fitting LFCs...
... done in 0.50 seconds.

Refitting 4 outliers.

Fitting dispersions...
... done in 0.01 seconds.

Fitting MAP dispersions...
... done in 0.01 seconds.

Fitting LFCs...
... done in 0.00 seconds.

Running Wald tests...
... done in 0.51 seconds.

Log2 fold change & Wald test p-value: genotype WT vs R306C


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
Gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Xkr4,751.030518,0.270700,0.066422,4.075448,0.000046,0.001015
Mrpl15,551.861938,-0.084139,0.046197,-1.821317,0.068559,0.262252
Lypla1,308.190002,-0.108699,0.116376,-0.934031,0.350288,0.661352
Tcea1,181.791443,-0.027427,0.100940,-0.271719,0.785838,0.926273
Rgs20,293.572327,0.286871,0.082687,3.469370,0.000522,0.007408
...,...,...,...,...,...,...
Spry3,208.366119,0.096276,0.104554,0.920831,0.357138,0.667065
Kdm5d,321.656860,-0.114856,0.056458,-2.034354,0.041916,0.192379
Eif2s3y,129.211060,0.128836,0.104560,1.232172,0.217885,0.517453
Uty,349.299622,-0.036228,0.075395,-0.480509,0.630865,0.854889


In [7]:
# The summary above is reported as "genotype WT vs R306C"; the fold change is
# negated here, presumably so that positive values mean higher in R306C.
# The flipped table is built from stat_res.results_df instead of negating
# result_df in place: an in-place `x = -x` flips the sign back every time the
# cell is re-run and also silently edits the object held by stat_res.
# NOTE(review): `stat` keeps its original (WT vs R306C) sign — confirm whether
# it should be negated together with log2FoldChange.
result_df = stat_res.results_df.assign(
    log2FoldChange=lambda res: -res['log2FoldChange']
)

# Add radial position

In [8]:
# Per-gene coordinates; `position` is the midpoint between start and end.
df_gene = pd.read_csv(r'../scRNA_Greenberg/gene_info.csv', index_col=0)
df_gene = df_gene.assign(position=(df_gene['start'] + df_gene['end'])/2)
df_gene.head()

Unnamed: 0,gene,chr,start,end,position
0,0610006L08Rik,chr7,74818817,74853813,74836315.0
1,0610007P14Rik,chr12,85815447,85824550,85819998.5
2,0610009B22Rik,chr11,51685385,51688874,51687129.5
3,0610009E02Rik,chr2,26445695,26459390,26452542.5
4,0610009L18Rik,chr11,120348677,120351190,120349933.5


In [9]:
# Radial-position table: one row per locus, with loci named "<chr>_<start>_<end>".
# NOTE(review): absolute, machine-specific path — this cell only runs on a
# machine that has this drive layout.
df_rp = pd.read_csv(
    r"E:\DNA_analysis\Postanalysis_MeCP2\radial_position\all_rp_bulk_600pts_MOp_data.csv",
    index_col=0,
)

# Split each locus name once and reuse the pieces for all three columns.
loci_parts = df_rp['loci_name'].str.split('_')
df_rp['chr'] = loci_parts.str[0]
df_rp['start'] = loci_parts.str[1].astype('int64')
df_rp['end'] = loci_parts.str[2].astype('int64')

# Midpoint of the locus, used later to measure the distance to each gene.
df_rp['gene_position'] = (df_rp['start'] + df_rp['end'])/2
df_rp.head()

Unnamed: 0,loci_name,norm_RP,chr,start,end,gene_position
0,chr1_3742742_3759944,0.816454,chr1,3742742,3759944,3751343.0
1,chr1_6245958_6258969,0.800965,chr1,6245958,6258969,6252463.5
2,chr1_8740008_8759916,0.816597,chr1,8740008,8759916,8749962.0
3,chr1_9627926_9637875,0.79397,chr1,9627926,9637875,9632900.5
4,chr1_9799472_9811359,0.786175,chr1,9799472,9811359,9805415.5


In [10]:
# Assign to every gene the norm_RP of the nearest locus on the same chromosome.
# Genes with no locus on their chromosome, or whose nearest locus midpoint is
# MAX_LOCUS_DISTANCE or further away, get the sentinel value -1.
MAX_LOCUS_DISTANCE = 3000000  # same units as the start/end coordinates

# Split the loci by chromosome once, instead of filtering and copying df_rp
# again for every single gene.
loci_by_chr = {
    chrom: (group['gene_position'].to_numpy(), group['norm_RP'].to_numpy())
    for chrom, group in df_rp.groupby('chr', sort=False)
}

rps = []
for chrom, position in zip(df_gene['chr'].values, df_gene['position'].values):
    loci = loci_by_chr.get(chrom)
    if loci is None:
        # No measured locus on this chromosome.
        rps.append(-1)
        continue

    locus_positions, locus_rps = loci
    distances = np.abs(locus_positions - position)
    # argmin finds the closest locus without sorting the whole table; if two
    # loci are exactly equidistant, the first one in df_rp order is used.
    nearest = distances.argmin()
    if distances[nearest] >= MAX_LOCUS_DISTANCE:
        rps.append(-1)
    else:
        rps.append(locus_rps[nearest])

df_gene['radial_position'] = rps

In [11]:
# Lookup table from gene name to the radial position assigned to it.
gene_to_rp = dict(zip(df_gene['gene'].values, df_gene['radial_position'].values))

# Copy the gene names out of the index, then annotate every gene; names that
# are missing from the lookup table get the sentinel -1.
result_df['gene'] = result_df.index.values
result_df['radial_position'] = result_df['gene'].map(lambda symbol: gene_to_rp.get(symbol, -1))
result_df

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene,radial_position
Gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Xkr4,751.030518,-0.270700,0.066422,4.075448,0.000046,0.001015,Xkr4,0.816454
Mrpl15,551.861938,0.084139,0.046197,-1.821317,0.068559,0.262252,Mrpl15,0.816454
Lypla1,308.190002,0.108699,0.116376,-0.934031,0.350288,0.661352,Lypla1,0.816454
Tcea1,181.791443,0.027427,0.100940,-0.271719,0.785838,0.926273,Tcea1,0.816454
Rgs20,293.572327,-0.286871,0.082687,3.469370,0.000522,0.007408,Rgs20,0.816454
...,...,...,...,...,...,...,...,...
Spry3,208.366119,-0.096276,0.104554,0.920831,0.357138,0.667065,Spry3,-1.000000
Kdm5d,321.656860,0.114856,0.056458,-2.034354,0.041916,0.192379,Kdm5d,-1.000000
Eif2s3y,129.211060,-0.128836,0.104560,1.232172,0.217885,0.517453,Eif2s3y,-1.000000
Uty,349.299622,0.036228,0.075395,-0.480509,0.630865,0.854889,Uty,-1.000000


In [12]:
# Drop the genes that carry the -1 sentinel in radial_position.
has_rp = result_df['radial_position'] != -1
result_df = result_df.loc[has_rp].copy()
result_df

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene,radial_position
Gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Xkr4,751.030518,-0.270700,0.066422,4.075448,0.000046,0.001015,Xkr4,0.816454
Mrpl15,551.861938,0.084139,0.046197,-1.821317,0.068559,0.262252,Mrpl15,0.816454
Lypla1,308.190002,0.108699,0.116376,-0.934031,0.350288,0.661352,Lypla1,0.816454
Tcea1,181.791443,0.027427,0.100940,-0.271719,0.785838,0.926273,Tcea1,0.816454
Rgs20,293.572327,-0.286871,0.082687,3.469370,0.000522,0.007408,Rgs20,0.816454
...,...,...,...,...,...,...,...,...
Frmpd4,1551.674072,-0.102684,0.081466,1.260459,0.207504,0.504719,Frmpd4,0.813251
Msl3,287.475006,-0.042306,0.071778,0.589403,0.555591,0.816044,Msl3,0.813251
Arhgap6,112.292542,0.124587,0.101002,-1.233504,0.217388,0.517194,Arhgap6,0.813251
Hccs,248.675049,-0.065991,0.115512,0.571289,0.567804,0.822618,Hccs,0.813251


In [22]:
# Build the output path with os.path.join instead of a hard-coded backslash.
# Outside Windows a backslash is not a path separator, so 'resources\...'
# would be written into the working directory as a single file whose name
# contains a literal backslash.
out_dir = 'resources'
os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing folders
result_df.to_csv(os.path.join(out_dir, 'Boxer_Mecp2_R306C_result_df_all_Rp.csv'))