In [1]:
#@Author: Zhuowen Li
#@LastEdit: 2021/10/13 上午10:17:24
#@Version: v2
#@Description: 
#based on the single-read methylation file generated by nanopolish
#batch process

# Generate the methylation count for each chimeric fragment of a read

In [1]:
import pandas as pd

In [3]:
# Per-read methylation calls written by nanopolish, stored as a tab-separated table.
met_read = pd.read_csv(
    '/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/rep2_part.tsv',
    sep='\t',
)

In [37]:
from pathlib import Path
from pathlib import PurePath
import pyarrow
import pyarrow.parquet as pq
import numpy as np
from concurrent import futures
porec_dir = '/public/home/lizw/task/pore_c/porec_1000_filter_mainchr_result'
prefix = 'DpnII_run06'
# Number of reads to subsample, and the seed that makes the subsample repeatable
# across kernel restarts.
sample_size = 10000
sample_seed = 0

align_dir = Path(porec_dir) / 'align_table'
# Sort the matches so the concatenation order (and therefore the seeded sample)
# does not depend on filesystem listing order.
align_files = sorted(align_dir.glob(prefix + "*pore_c.parquet"))
if not align_files:
    # pyarrow.concat_tables fails on an empty list with an unhelpful message.
    raise FileNotFoundError(f'no "{prefix}*pore_c.parquet" files found in {align_dir}')
align_batch_porec_merge_df = pyarrow.concat_tables([pq.read_table(path) for path in align_files]).to_pandas()

# Keep only alignments flagged as passing the filter, reduced to the five
# columns the downstream script unpacks (in this order).
align_batch_porec_merge_df_pass = align_batch_porec_merge_df.query('pass_filter==True')
align_batch_porec_merge_df_pass_reindex = align_batch_porec_merge_df_pass.reindex(['chrom','start','end','strand','read_name'],axis=1)

met_read = pd.read_feather('/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/rep2_part.feather')
unique_reads = met_read['read_name'].drop_duplicates()
# Cap the request at the number of available reads: Series.sample raises when
# asked for more rows than exist (without replacement).
sample_reads = list(unique_reads.sample(min(sample_size, len(unique_reads)), random_state=sample_seed))

# Write the alignment rows and the methylation calls of the sampled reads.
sample_read_table = align_batch_porec_merge_df_pass_reindex[align_batch_porec_merge_df_pass_reindex['read_name'].isin(sample_reads)]
sample_read_table.to_csv('/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/sample_read_table.csv',index=False)
sample_met_table = met_read[met_read['read_name'].isin(sample_reads)]
sample_met_table.to_csv('/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/sample_met_table.csv',index=False)

In [47]:
%%writefile "/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/sample_fragment_met_level.py"
from concurrent import futures

import numpy as np
import pandas as pd

# This cell is written out as a standalone script, so it must import everything
# it uses itself (np and futures are used below and are not inherited from the notebook kernel).
sample_read_table = pd.read_csv('/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/sample_read_table.csv')
sample_met_table = pd.read_csv('/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/sample_met_table.csv')

# A positive value in the log_lik_ratio column indicates support for methylation.
# All calls are in CG context, so the context does not need to be identified.
# Some calls cover a single CpG (single-nucleotide resolution) and others cover
# a group of CpGs; a call applies to every CpG included in its sequence.
# Calls that span two fragments are discarded.

cpu_count = 30
# Guard against a zero batch size (and the resulting division by zero) when the
# table has fewer rows than cpu_count.
batch_size = max(1, len(sample_read_table) // cpu_count)
# Map batch number -> consecutive slice of sample_read_table of at most batch_size rows.
batch_dict = dict(tuple(sample_read_table.groupby(np.arange(len(sample_read_table))//batch_size)))
#process or thread?
def multi_thread(func,iter_object,max_workers):
    """Apply ``func`` to every item of ``iter_object`` on a thread pool.

    Parameters
    ----------
    func : callable
        Called with one item at a time.
    iter_object : iterable
        Items to process. It is materialised into a list once, so generators
        and other iterables without ``len()`` are accepted.
    max_workers : int
        Upper bound on the number of threads; fewer are used when there are
        fewer items.

    Returns
    -------
    list
        Results in the same order as the input items; ``[]`` for empty input.
    """
    items = list(iter_object)
    if not items:
        # ThreadPoolExecutor rejects a worker count of 0, so short-circuit.
        return []
    workers = min(max_workers, len(items))
    with futures.ThreadPoolExecutor(workers) as executor:
        # Collect inside the context so an exception raised by a worker
        # surfaces here, next to the call that caused it.
        return list(executor.map(func, items))

def met_to_fragment_batch(key):
    """Compute a methylation level for every fragment in one batch.

    Looks up ``batch_dict[key]`` (rows of chrom, start, end, strand, read_name)
    and, for each row, selects the calls in ``sample_met_table`` that belong to
    the same read and chromosome and satisfy ``start >= fragment start`` and
    ``end < fragment end``. The level is the sum of ``num_motifs`` over calls
    with ``log_lik_ratio > 0`` divided by the sum of ``num_motifs`` over all
    selected calls; it is NaN when fewer than 2 motifs are covered.

    Parameters
    ----------
    key : hashable
        Key into the module-level ``batch_dict``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the batch rows plus a ``met_level`` column. The frame
        stored in ``batch_dict`` is left unmodified.

    Notes
    -----
    ``chrom`` is passed through ``int()``, so non-numeric chromosome names
    raise ``ValueError``.
    """
    df = batch_dict[key]
    # Narrow the call table to this batch's reads once, instead of scanning the
    # whole table again for every fragment.
    batch_met = sample_met_table[sample_met_table['read_name'].isin(df['read_name'])]
    met_level_list = []
    for chrom, start, end, strand, read_name in df.itertuples(index=False):
        chrom_int = int(chrom)
        sub_df = batch_met.query('(read_name == @read_name) & (chromosome == @chrom_int) & (start >= @start) & (end < @end)')
        #according to Bing Ren: Only reads containing 2 or more CpGs on each end were included
        CG_count = sub_df['num_motifs'].sum()
        if CG_count >= 2:
            methylated_count = sub_df.loc[sub_df['log_lik_ratio'] > 0, 'num_motifs'].sum()
            met_sub_value = methylated_count / CG_count
        else:
            met_sub_value = float('nan')

        met_level_list.append(met_sub_value)

    # assign() returns a copy: writing the column into df would mutate the
    # groupby slice held in batch_dict and can trigger SettingWithCopyWarning.
    return df.assign(met_level=met_level_list)

# Score every batch on the thread pool, then stack the per-batch frames back
# into a single table.
batch_keys = list(batch_dict.keys())
scored_batches = multi_thread(met_to_fragment_batch, batch_keys, cpu_count)
met_and_read_df = pd.concat(scored_batches, axis=0)
# The index is written as well (default to_csv behaviour).
# NOTE(review): the pairwise step of the notebook reads
# sample_fragment_met_levels.csv from .../nanopolish/rep2/, not from
# .../met_structure/ - confirm which location is intended.
met_and_read_df.to_csv('/public/home/lizw/task/pore_c/methylation/met_structure/sample_fragment_met_levels.csv')

Overwriting /public/home/lizw/task/pore_c/methylation/nanopolish/rep2/sample_fragment_met_level.py


# Split into pairwise contacts

In [None]:
from tqdm import tqdm
import pandas as pd
import numpy as np
from itertools import combinations

met_level_read = pd.read_csv('/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/sample_fragment_met_levels.csv')
# Fragments without a methylation level (NaN) cannot take part in a contact.
read_table_met_value_no_na = met_level_read.dropna()

# Expand every read into all unordered pairs of its fragments. Each output row
# is the columns of the first fragment followed by the columns of the second.
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and growing a frame row by row is quadratic.
pair_records = []
for read_name, sub_df in tqdm(read_table_met_value_no_na.groupby(by='read_name')):
    if len(sub_df) < 2:
        continue
    fragment_rows = list(sub_df.itertuples(index=False, name=None))
    pair_records.extend(first + second for first, second in combinations(fragment_rows, 2))

# Integer column labels 0..2k-1, as produced by the previous concat-based
# version. Passing the labels explicitly keeps the column count correct even
# when no read has two or more fragments. The row index is a plain RangeIndex.
twoD_df = pd.DataFrame(pair_records, columns=list(range(2 * read_table_met_value_no_na.shape[1])))

# genomic distance filter

In [None]:
# Name the two halves of every contact row: the fields of fragment 1 followed
# by the same fields of fragment 2.
twoD_df.columns = [
    'drop1','chrom1','start1','end1','strand1','read_name1','met_level1',
    'drop2','chrom2','start2','end2','strand2','read_name2','met_level2',
]
twoD_df = twoD_df.reset_index()
contact_csv = '/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/2D_filtered.csv'
twoD_df.to_csv(contact_csv)

# The table is written and immediately read back from the same file.
twoD_df = pd.read_csv(contact_csv)

# Fragment coordinates as integers.
twoD_df = twoD_df.astype({'start1': int, 'start2': int, 'end1': int, 'end2': int})

# Midpoint of each fragment (floor division).
twoD_df['pos1'] = (twoD_df['start1'] + twoD_df['end1']) // 2
twoD_df['pos2'] = (twoD_df['start2'] + twoD_df['end2']) // 2

# Split contacts by whether both fragments lie on the same chromosome.
twoD_df_trans = twoD_df[twoD_df['chrom1'] != twoD_df['chrom2']]
twoD_df_cis = twoD_df[twoD_df['chrom1'] == twoD_df['chrom2']]

# Absolute midpoint-to-midpoint distance for same-chromosome contacts.
midpoint_gap = (twoD_df_cis['pos2'] - twoD_df_cis['pos1']).abs()
twoD_df_cis_dis = twoD_df_cis.assign(distance=midpoint_gap)

# Minimum-distance subsets; the 1 kb and 5 kb cut-offs are currently disabled.
twoD_df_cis_filter_25k = twoD_df_cis_dis[twoD_df_cis_dis['distance'] >= 25000]
twoD_df_cis_filter_100k = twoD_df_cis_dis[twoD_df_cis_dis['distance'] >= 100000]
twoD_df_cis_filter_1M = twoD_df_cis_dis[twoD_df_cis_dis['distance'] >= 1000000]

for filtered_contacts, out_csv in (
    (twoD_df_cis_filter_25k, '/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/twoD_df_cis_filter_25k.csv'),
    (twoD_df_cis_filter_100k, '/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/twoD_df_cis_filter_100k.csv'),
    (twoD_df_cis_filter_1M, '/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/twoD_df_cis_filter_1M.csv'),
):
    filtered_contacts.to_csv(out_csv)

# Generate the shuffled control

In [None]:
def met_contact_corr(twoD_df_cis_filter, seed=None):
    """Correlate the methylation levels of the two fragments of each contact.

    Parameters
    ----------
    twoD_df_cis_filter : pandas.DataFrame
        One row per contact with ``met_level1`` and ``met_level2`` columns.
    seed : int, optional
        Seed for the shuffled control. With the default ``None`` the global
        ``random`` module state is used, exactly as before.

    Returns
    -------
    tuple
        ``(read_r, shuffle_r, size)``: the Pearson correlation of the observed
        pairs, the Pearson correlation after independently shuffling both
        columns, and the number of contacts. Both correlations are NaN when
        there are fewer than 2 contacts.
    """
    import random
    from scipy import stats

    size = len(twoD_df_cis_filter)
    if size < 2:
        # scipy.stats.pearsonr raises on fewer than 2 observations; a strict
        # distance cut-off can legitimately leave an empty table.
        return float('nan'), float('nan'), size

    inread_met_level1 = twoD_df_cis_filter['met_level1'].astype('float')
    inread_met_level2 = twoD_df_cis_filter['met_level2'].astype('float')
    read_r = inread_met_level1.corr(inread_met_level2)

    # Shuffled control: break the pairing between the two columns.
    rng = random if seed is None else random.Random(seed)
    shuffle_met_level1 = list(inread_met_level1)
    shuffle_met_level2 = list(inread_met_level2)
    rng.shuffle(shuffle_met_level1)
    rng.shuffle(shuffle_met_level2)
    shuffle_r, _ = stats.pearsonr(shuffle_met_level1, shuffle_met_level2)
    return read_r,shuffle_r,size

In [None]:
def _corr_summary(contacts):
    """Label the three values returned by met_contact_corr for one contact table."""
    read_r, shuffle_r, size = met_contact_corr(contacts)
    return {'Long-C contact': read_r, 'shuffle': shuffle_r, 'size': size}

# One summary per minimum-distance cut-off.
dict_25k = _corr_summary(twoD_df_cis_filter_25k)
dict_100k = _corr_summary(twoD_df_cis_filter_100k)
dict_1M = _corr_summary(twoD_df_cis_filter_1M)

# plot

In [None]:
%config InlineBackend.figure_format = 'retina'
corr_list = [dict_25k,dict_100k,dict_1M]

corr_df_all = pd.DataFrame([])
for i in corr_list:
    read_r = i['Long-C contact']
    shuffle_r = i['shuffle']
    corr_table = pd.DataFrame({'Long-C Contact':[read_r],'Shuffled':[shuffle_r]})
    corr_df_all = corr_df_all.append(corr_table)

corr_df_all.index = ['25kb','100kb','1Mb']
import matplotlib.pyplot as plt
from cycler import cycler
import matplotlib as mpl
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.font_manager as font_manager
import matplotlib as mpl
import matplotlib.patches as mp
import seaborn as sns
import matplotlib as mpl
from cycler import cycler
#for more
mpl.rcParams['axes.prop_cycle'] = cycler('color',['#B2957A','#A0A0A0'])
#for three

# 设置全局字体
font_dirs = ['/public/home/mowp/test/fonts/']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

for font_file in font_files:
    font_manager.fontManager.addfont(font_file)

plt.rcParams['font.family'] = 'Arial'
plt.rcParams['font.size'] = 15
plt.rcParams['svg.fonttype'] = 'none'

fig,ax = plt.subplots(figsize=(3,4))
corr_df_all.loc[('25kb','100kb','1Mb'),:].plot(kind = 'bar',ax=ax,width = 0.7)
sns.despine(ax=ax)
ax.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad=0)
plt.savefig(f'/public/home/lizw/task/pore_c/methylation/nanopolish/rep2/all_filter.png',bbox_inches='tight',dpi=300)