<!-- cluster size ~ data count (cluster num, reads num, contacts num)

chr count pct < 5%: discard the least-represented chr ~ data count

cis pct > 10% ~ data count

close-range interaction pair > 1000 ~ data count

discard trans? ~ data count

remove background spots -->


In [None]:
#!/usr/bin/env python
# encoding: utf-8
import os
import assembly
from itertools import combinations
import pandas as pd
from concurrent import futures
from functools import partial
from tqdm import tqdm
import yaml
import subprocess

def spot_sprite_to_higashi_v2(spot_id:str,in_spot_sprite_dir:str,out_spot_higashi_dir:str,chroms:list,
                            min_cluster_size:int,max_cluster_size:int,close_contact_distance:int,
                            count_normalize:bool,out_start_pos:bool):
    """Convert one spot's cluster file into a sorted, gzipped cis-contact table.

    The input file ``<in_spot_sprite_dir>/<spot_id>`` is read line by line.
    Each line is split on whitespace; the first field is ignored (presumably
    the cluster barcode -- confirm against the upstream writer) and the
    remaining fields are de-duplicated reads.  The text after the first
    ``]_`` of every read must look like ``<chrom>:<start>-<end>``.

    For every cluster whose number of distinct reads lies in
    ``[min_cluster_size, max_cluster_size]``, all pairs of reads on the same
    chromosome are enumerated.  After ordering a pair by start coordinate, it
    is written out when ``end2 - start1`` or ``start2 - start1`` is at least
    ``close_contact_distance``; otherwise it is only counted as a close
    contact.

    Output rows are ``chrom<TAB>pos1<TAB>chrom<TAB>pos2<TAB>count``.  They are
    sorted with ``sort -k1,1V -k2,2n -k3,3V -k4,4n``, passed through ``uniq``
    (so identical rows coming from different clusters are kept once) and
    gzipped to ``<out_spot_higashi_dir>/<spot_id>.contact.tsv.gz``.

    Args:
        spot_id: Name of the input file and prefix of the output file.
        in_spot_sprite_dir: Directory holding the per-spot cluster files.
        out_spot_higashi_dir: Output directory (created if missing).
        chroms: Chromosome names that reads are allowed to carry.
        min_cluster_size: Smallest cluster size (inclusive) to process.
        max_cluster_size: Largest cluster size (inclusive) to process.
        close_contact_distance: Distance threshold described above.
        count_normalize: If true each contact is weighted
            ``1 / (cluster_size - 1)``, otherwise ``1``.
        out_start_pos: If true write read start positions, else end positions.

    Returns:
        The number of cis pairs that were skipped as close contacts.

    Raises:
        KeyError: A read lies on a chromosome that is not in ``chroms``.
        AssertionError: A paired read has ``end < start``.
        subprocess.CalledProcessError: ``sort``, ``uniq`` or ``gzip`` failed.
    """
    os.makedirs(out_spot_higashi_dir,exist_ok=True)
    in_spot_sprite_f=os.path.join(in_spot_sprite_dir,spot_id)
    out_spot_higashi_f=os.path.join(out_spot_higashi_dir,f'{spot_id}.contact.tsv')
    out_spot_higashi_tmp_f=out_spot_higashi_f+'.tmp'

    close_contact_num=0
    try:
        with open(in_spot_sprite_f,'r') as f,open(out_spot_higashi_tmp_f,'w') as out:
            for line in f:
                cluster_reads=set(line.rstrip().split()[1:])
                cluster_size=len(cluster_reads)
                if not (min_cluster_size <= cluster_size <= max_cluster_size):
                    continue

                # Per-cluster weight kept in its own variable: the original
                # code overwrote the `count_normalize` flag itself, so with
                # count_normalize=False every pair after the first one was
                # normalised anyway.  Pairs only exist for cluster_size >= 2,
                # the extra guard just avoids a division by zero.
                if count_normalize and cluster_size > 1:
                    contact_count=1/(cluster_size-1)
                else:
                    contact_count=1

                # Group the raw (start, end) strings by chromosome; an
                # unknown chromosome raises KeyError on purpose.
                reads_by_chrom={chrom:[] for chrom in chroms}
                for read in cluster_reads:
                    locus_fields=read.split(']_')[1].split(':')
                    start,end=locus_fields[1].split('-')
                    reads_by_chrom[locus_fields[0]].append((start,end))

                for chrom,cis_reads in reads_by_chrom.items():
                    if len(cis_reads) < 2:
                        continue
                    # Convert once per read rather than once per pair.
                    intervals=[(int(start),int(end)) for start,end in cis_reads]
                    for start,end in intervals:
                        assert end>=start

                    for (start1,end1),(start2,end2) in combinations(intervals,2):
                        # Make read 1 the one that starts first.
                        if start1 > start2:
                            start1,end1,start2,end2=start2,end2,start1,end1

                        if (end2-start1 >= close_contact_distance) or (start2-start1 >= close_contact_distance):
                            # chrom1 pos1 chrom2 pos2 count
                            if out_start_pos:
                                pos1,pos2=start1,start2
                            else:
                                pos1,pos2=end1,end2
                            out.write('\t'.join([chrom,str(pos1),chrom,str(pos2),str(contact_count)])+'\n')
                        else:
                            close_contact_num+=1

        _sort_unique_gzip(out_spot_higashi_tmp_f,out_spot_higashi_f)
    finally:
        # Never leave the unsorted scratch file behind, even on failure.
        if os.path.exists(out_spot_higashi_tmp_f):
            os.remove(out_spot_higashi_tmp_f)

    return close_contact_num


def _sort_unique_gzip(unsorted_path,sorted_path):
    """Sort `unsorted_path`, write its unique lines to `sorted_path`, then gzip that file."""
    # Argument lists instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through untouched.
    sort_cmd=['sort','-k1,1V','-k2,2n','-k3,3V','-k4,4n',unsorted_path]
    with open(sorted_path,'w') as sorted_out:
        sort_proc=subprocess.Popen(sort_cmd,stdout=subprocess.PIPE)
        try:
            subprocess.run(['uniq'],stdin=sort_proc.stdout,stdout=sorted_out,check=True)
        finally:
            sort_proc.stdout.close()
            sort_returncode=sort_proc.wait()
    if sort_returncode != 0:
        raise subprocess.CalledProcessError(sort_returncode,sort_cmd)
    subprocess.run(['gzip','-f',sorted_path],check=True)

In [None]:
def sample_sprite_to_higashi(cpus:int,spot_infor_f:str,in_spot_sprite_dir:str,
                            out_spot_higashi_dir:str,chroms:list,
                            min_cluster_size:int,max_cluster_size:int,close_contact_distance:int,
                            out_spot_infor_f:str,count_normalize:bool,out_start_pos:bool):
    """Convert every spot of one sample in parallel and record close-contact statistics.

    The spot table ``spot_infor_f`` (CSV with at least the columns ``spot_id``
    and ``spot_fcsizefchrrnpfCsofcis_cis_ct_num``) is loaded, each distinct
    ``spot_id`` is converted with ``spot_sprite_to_higashi_v2`` in a process
    pool, and the table is written to ``out_spot_infor_f`` with two added
    columns:

    * ``cis_close_contact_num`` -- value returned by the per-spot conversion;
    * ``cis_close_contact_ratio`` -- that value divided by the
      ``spot_fcsizefchrrnpfCsofcis_cis_ct_num`` column.

    Args:
        cpus: Number of worker processes.
        spot_infor_f: Path of the input spot table (CSV).
        out_spot_infor_f: Path of the augmented spot table (CSV, no index).
        in_spot_sprite_dir, out_spot_higashi_dir, chroms, min_cluster_size,
        max_cluster_size, close_contact_distance, count_normalize,
        out_start_pos: Forwarded unchanged to ``spot_sprite_to_higashi_v2``.
    """
    spot_infor=pd.read_csv(spot_infor_f)
    spotid_list=spot_infor.spot_id.unique().tolist()

    convert_spot=partial(spot_sprite_to_higashi_v2,in_spot_sprite_dir=in_spot_sprite_dir,
                         out_spot_higashi_dir=out_spot_higashi_dir,chroms=chroms,
                         min_cluster_size=min_cluster_size,max_cluster_size=max_cluster_size,
                         close_contact_distance=close_contact_distance,
                         count_normalize=count_normalize,out_start_pos=out_start_pos)

    # Only the parallel conversion needs the pool; release the workers before
    # doing the table bookkeeping.
    with futures.ProcessPoolExecutor(max_workers=cpus) as pool:
        close_contact_nums=list(tqdm(pool.map(convert_spot,spotid_list),total=len(spotid_list)))

    # Attach results by spot_id rather than by position: the results follow the
    # de-duplicated id list, so a positional assignment raises a length
    # mismatch as soon as the table holds a repeated spot_id.
    close_num_by_spot=dict(zip(spotid_list,close_contact_nums))
    spot_infor['cis_close_contact_num']=spot_infor['spot_id'].map(close_num_by_spot)
    spot_infor['cis_close_contact_ratio']=spot_infor['cis_close_contact_num']/spot_infor['spot_fcsizefchrrnpfCsofcis_cis_ct_num']
    spot_infor.to_csv(out_spot_infor_f,index=False)


### main function

In [None]:

# ---- run configuration (read as module-level globals by main) ----

# Locations and genome build.
species = 'mm10'
work_dir = '/home/spaceA'

# Chromosome names, taken from the key set of the assembly's chromosome-size
# table for `species`.
chr_lst = list(assembly.build(species, 1)._chromsizes.keys())

# Cluster-size window (both bounds inclusive) for clusters that are converted.
min_cluster_size = 2
max_cluster_size = 1000

# Distance threshold used to separate close cis contacts (ref microC).
close_contact_distance = 1000

# Output options forwarded to the per-spot conversion.
count_normalize = True
out_start_pos = True

# Number of worker processes.
cpus = 40


def main(sample_name):
    """Run the SPRITE-to-Higashi conversion for one sample and write its file list.

    Uses the module-level settings ``work_dir``, ``cpus``, ``chr_lst``,
    ``min_cluster_size``, ``max_cluster_size``, ``close_contact_distance``,
    ``count_normalize`` and ``out_start_pos``.

    Steps:
      1. Derive the sample's input and output paths.  Note that the spot
         table paths are absolute literals, not derived from ``work_dir``.
      2. Create the output directories before any heavy work starts.
      3. Unless the augmented spot table already exists, convert all spots
         with ``sample_sprite_to_higashi``.
      4. Write ``filelist.txt`` listing the expected ``.contact.tsv.gz`` path
         of every distinct spot id, one per line.

    Args:
        sample_name: Sample identifier used to build all paths.
    """
    print(f'\nNow process:{sample_name}')
    in_spot_sprite_dir=os.path.join(work_dir,f'SpatialSPRITE_res/Filter_Spot_v4_tmp/clusters_{sample_name}_single_filtered_new_v2')
    out_spot_higashi_dir=os.path.join(work_dir,f'higashi_pre_v2/{sample_name}')
    spot_infor_f=os.path.join('/home/spaceA/SpatialSPRITE_res/Filter_Spot_v4_tmp',
                              f'{sample_name}_spot_infor_final_v2.csv')
    out_spot_infor_f =os.path.join('/home/spaceA/higashi_v2',sample_name+'_spot_infor_final_filtercloseCT.csv')
    out_spot_higashi_filelst_f=os.path.join(work_dir,f'higashi_v2/{sample_name}/filelist.txt')

    # Create every output directory up front.  Previously the directory of
    # out_spot_infor_f was never created and the only makedirs call ran after
    # the conversion, so a missing directory surfaced only once all spots had
    # already been processed.
    os.makedirs(os.path.dirname(out_spot_infor_f),exist_ok=True)
    os.makedirs(os.path.join(work_dir,f'higashi_v2/{sample_name}'),exist_ok=True)

    spot_infor=pd.read_csv(spot_infor_f)
    spotid_list=spot_infor.spot_id.unique().tolist()

    # The augmented spot table doubles as the "already done" marker.
    if not os.path.exists(out_spot_infor_f):
        sample_sprite_to_higashi(cpus=cpus,spot_infor_f=spot_infor_f,
                                in_spot_sprite_dir=in_spot_sprite_dir,
                                out_spot_higashi_dir=out_spot_higashi_dir,chroms=chr_lst,
                                min_cluster_size=min_cluster_size,max_cluster_size=max_cluster_size,
                                close_contact_distance=close_contact_distance,
                                out_spot_infor_f=out_spot_infor_f,
                                count_normalize=count_normalize,
                                out_start_pos=out_start_pos)

    with open(out_spot_higashi_filelst_f,'w') as f:
        for spot in spotid_list:
            spot_f_path=os.path.join(out_spot_higashi_dir,f'{spot}.contact.tsv.gz')
            f.write(f'{spot_f_path}\n')
    print(f'{sample_name} done!')

In [None]:
# Load the run configuration; the keys of its 'spatial_infor' section are the
# sample ids to process.
with open("/home/spaceA/config_v2.yaml") as f:
    config = yaml.full_load(f)

sampleid_list = config['spatial_infor'].keys()

# NOTE(review): this loop runs at module level.  multiprocessing's "spawn" and
# "forkserver" start methods re-import the main module in each worker, so an
# `if __name__ == "__main__":` guard may be needed when run as a script.
for sample in sampleid_list:
    main(sample)