In [None]:
# Script calculates the correlation between gene expression and the mean
# normalized coverage / WPS at NDRs (-150 to +50 from TSS)
# of highly expressed & unexpressed genes in bone marrow & lymph nodes

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from itertools import chain

In [10]:
samples = ['HV01', 'HV03', 'HV04', 'HV05', 'HV06', 'HV07', 'HV08']

# Expression table; each tissue column is ranked to pick the gene sets.
EXPRESSION_FILE = '/mnt/DATA1/resources/protein_atlas/RNAtable.parquet'
# Number of genes taken from each end of the expression ranking.
N_GENES = 100
# Flanking regions used as the normalisation baseline: the first
# FLANK_UP_END rows and every row from position FLANK_DOWN_START onwards.
FLANK_UP_END = 2000
FLANK_DOWN_START = 4000
# Row labels of the NDR window; .loc slicing includes both end points.
# NOTE(review): presumably -150..+50 around the TSS -- confirm the exact
# offsets against how the *_3000.parquet profiles are indexed.
NDR_START = 2849
NDR_END = 3049


def normalize_by_flanks(signal_df):
    """Scale every column by the mean of its two flanking regions.

    Parameters
    ----------
    signal_df : pandas.DataFrame
        One column per gene, one row per position (as read from the
        per-sample parquet files).

    Returns
    -------
    pandas.DataFrame
        The scaled frame with NaN replaced by 0 and with columns that are
        zero everywhere removed.
    """
    up_mean = signal_df.iloc[:FLANK_UP_END].mean(axis=0).to_numpy()
    down_mean = signal_df.iloc[FLANK_DOWN_START:].mean(axis=0).to_numpy()
    flank_mean = (up_mean + down_mean) / 2

    # The array is broadcast across columns, i.e. one divisor per column.
    norm_df = signal_df.div(flank_mean).fillna(0)
    return norm_df.loc[:, (norm_df != 0).any(axis=0)]


def extreme_expression(expression_df, tissue, n_genes=N_GENES):
    """Return the expression of the top and bottom ``n_genes`` of a tissue.

    Parameters
    ----------
    expression_df : pandas.DataFrame
        Expression table with one column per tissue.
    tissue : str
        Column of ``expression_df`` to rank.
    n_genes : int, optional
        How many entries to take from each end of the ranking.

    Returns
    -------
    pandas.Series
        The ``n_genes`` highest values followed by the ``n_genes`` lowest,
        keeping the index labels of ``expression_df``.
    """
    # Two independent sorts (rather than head/tail of one) so that ties are
    # resolved exactly as before.
    highest = expression_df[tissue].sort_values(ascending=False).head(n_genes)
    lowest = expression_df[tissue].sort_values(ascending=True).head(n_genes)
    return pd.concat([highest, lowest])


def ndr_expression_correlation(norm_df, tissue_expression):
    """Correlate expression with the mean normalised signal in the NDR window.

    Parameters
    ----------
    norm_df : pandas.DataFrame
        Output of ``normalize_by_flanks``.
    tissue_expression : pandas.Series
        Output of ``extreme_expression``; its index selects the columns of
        ``norm_df`` that take part.

    Returns
    -------
    float
        Off-diagonal entry of ``DataFrame.corr()`` for the two columns.
    """
    in_gene_set = norm_df.columns.isin(tissue_expression.index)
    ndr_mean = norm_df.loc[NDR_START:NDR_END, in_gene_set].mean()

    # Left join: genes without a signal column get NaN and are ignored by
    # the pairwise correlation.
    paired = tissue_expression.to_frame().join(ndr_mean.to_frame())
    paired.columns = ['expression', 'ndr_mean']
    return paired.corr().iloc[0, 1]


# The expression table and the gene sets do not depend on the sample, so
# they are loaded and ranked once instead of once per loop iteration.
expression_df = pd.read_parquet(EXPRESSION_FILE)
bm_ex = extreme_expression(expression_df, 'bone_marrow')
ln_ex = extreme_expression(expression_df, 'lymph_node')

# One row per sample:
# [sample, bone marrow coverage, bone marrow WPS, lymph node coverage, lymph node WPS]
correlation_values = []
for sample in samples:
    norm_cov_df = normalize_by_flanks(pd.read_parquet(f'{sample}_cov_3000.parquet'))
    norm_wps_df = normalize_by_flanks(pd.read_parquet(f'{sample}_wps_3000.parquet'))

    correlation_values.append([
        sample,
        ndr_expression_correlation(norm_cov_df, bm_ex),
        ndr_expression_correlation(norm_wps_df, bm_ex),
        ndr_expression_correlation(norm_cov_df, ln_ex),
        ndr_expression_correlation(norm_wps_df, ln_ex),
    ])

In [12]:
# Assemble the per-sample correlations into a table indexed by sample name.
summary_columns = [
    'Sample',
    'Bone marrow coverage',
    'Bone marrow WPS',
    'Lymph node coverage',
    'Lymph node WPS',
]
df = pd.DataFrame(correlation_values, columns=summary_columns).set_index('Sample')
df

Unnamed: 0_level_0,Bone marrow coverage,Bone marrow WPS,Lymph node coverage,Lymph node WPS
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HV01,-0.139639,-0.188245,-0.190909,-0.230405
HV03,-0.125717,-0.16828,-0.231903,-0.248139
HV04,-0.113334,-0.139627,-0.229407,-0.214912
HV05,-0.044189,-0.135252,-0.186396,-0.19325
HV06,-0.080972,-0.153601,-0.139838,-0.106486
HV07,-0.172259,-0.208925,-0.197126,-0.191562
HV08,-0.190339,-0.210753,-0.20224,-0.202041
