In [5]:
import os
import glob
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Allow up to 500 rows to be shown when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)

import pybedtools as pbt 
# Point pybedtools at the directory holding the bedtools executables.
# NOTE(review): absolute, user-specific path -- must be adjusted on any other machine.
pbt.set_bedtools_path('/mnt/BioHome/jreyna/software/anaconda3/envs/hic_tls/bin/')

# Switch to the project root so that the relative 'results/...' paths used in
# this notebook resolve. NOTE(review): absolute, machine-specific path.
os.chdir('/mnt/bioadhoc-temp/Groups/vd-ay/jreyna/projects/t1d-loop-catalog/')
# Output directory for the summary table (relative to the project root);
# created here if it does not exist yet.
outdir = 'results/hg38/finemapping/sgls/summary/'
os.makedirs(outdir, exist_ok=True)

In [11]:
# Collect the per-sample SGL tables, one file per sample directory
# (35 different sample names when this was last run).
# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting
# makes the load order -- and with it every order-sensitive step downstream
# ('first' aggregations, the order of joined sample names) -- reproducible.
fns = sorted(glob.glob('results/hg38/finemapping/sgls/*/*.5000.finemap_sgls.tsv'))

## Processing all samples

In [13]:
# Read every per-sample SGL table, tag its rows with the sample name and a
# composite SGL identifier, then stack all samples into a single frame.
sgl_id_cols = ["chr_snp", "start_snp", "geneid", "startA_loop", "startB_loop"]

dfs = []
for fn in fns:

    # The sample name is the parent directory name minus its last ".suffix".
    parent_dir = fn.split('/')[-2]
    sample_name = parent_dir.rsplit('.', 1)[0]

    df = pd.read_table(fn)
    df["sample_name"] = sample_name

    # sgl_id = the sgl_id_cols values, stringified and joined with "_".
    id_parts = [df[col].astype(str) for col in sgl_id_cols]
    df["sgl_id"] = id_parts[0].str.cat(id_parts[1:], sep="_")

    dfs.append(df)

all_df = pd.concat(dfs, ignore_index=True)

In [14]:
# Preview the first rows of the combined per-sample table.
all_df.head()

Unnamed: 0,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,-log10_qval_loop,chr_snp,start_snp,end_snp,snp_anchor,chr_gene,start_gene,end_gene,genename,geneid,strand,sample_name,sgl_id
0,1,113830000,113835000,1,113925000,113930000,6.149793,1,113834945,113834946,AnchorA,1,113929323,113929324,HIPK1,ENSG00000163349,+,AT452,1_113834945_ENSG00000163349_113830000_113925000
1,1,113830000,113835000,1,113925000,113930000,6.149793,1,113834945,113834946,AnchorA,1,113929491,113929492,HIPK1-AS1,ENSG00000235527,-,AT452,1_113834945_ENSG00000235527_113830000_113925000
2,19,10270000,10275000,19,10350000,10355000,8.904838,19,10352441,10352442,AnchorB,19,10271092,10271093,ICAM1,ENSG00000090339,+,AT452,19_10352441_ENSG00000090339_10270000_10350000
3,6,25990000,25995000,6,27835000,27840000,8.190038,6,27839048,27839049,AnchorB,6,25992661,25992662,U91328.1,ENSG00000272462,+,AT452,6_27839048_ENSG00000272462_25990000_27835000
4,6,26020000,26025000,6,27835000,27840000,40.049244,6,27839048,27839049,AnchorB,6,26020489,26020490,HIST1H3A,ENSG00000275714,+,AT452,6_27839048_ENSG00000275714_26020000_27835000


#### Group rows that share the same SGL ID

In [17]:
# Collapse all rows sharing an sgl_id into one record. Coordinate and gene
# columns take the value from the first row of each group; sample_name becomes
# the array of distinct sample names in which the SGL was seen.
# NOTE(review): 'min' keeps the smallest -log10(q) observed for the SGL --
# confirm that this, rather than 'max', is the intended summary.
# (Aggregations for 'ppa_snp' (max) and 'rsid' (first) are currently disabled.)
sgl_agg_spec = {
    'chr_snp': 'first',
    'end_snp': 'first',
    '-log10_qval_loop': 'min',
    'genename': 'first',
    'geneid': 'first',
    'startA_loop': 'first',
    'endA_loop': 'first',
    'startB_loop': 'first',
    'endB_loop': 'first',
    'sample_name': pd.Series.unique,
}
uniq_sgl_df = all_df.groupby('sgl_id').agg(sgl_agg_spec).reset_index()

In [18]:
# Preview the de-duplicated table (one row per sgl_id).
uniq_sgl_df.head()

Unnamed: 0,sgl_id,chr_snp,end_snp,-log10_qval_loop,genename,geneid,startA_loop,endA_loop,startB_loop,endB_loop,sample_name
0,10_11565935_ENSG00000148429_11565000_11610000,10,11565936,5.935856,USP6NL,ENSG00000148429,11565000,11570000,11610000,11615000,[GD09519]
1,10_11565935_ENSG00000271360_11565000_11610000,10,11565936,5.935856,AL512631.2,ENSG00000271360,11565000,11570000,11610000,11615000,[GD09519]
2,10_124533934_ENSG00000189319_124530000_124740000,10,124533935,5.039431,FAM53B,ENSG00000189319,124530000,124535000,124740000,124745000,[GD09519]
3,10_32438031_ENSG00000229327_32345000_32435000,10,32438032,5.005641,AL391839.1,ENSG00000229327,32345000,32350000,32435000,32440000,[GD08996]
4,10_32438031_ENSG00000233825_32345000_32435000,10,32438032,5.005641,AL391839.2,ENSG00000233825,32345000,32350000,32435000,32440000,[GD08996]


In [19]:
# Distinct (chr_snp, end_snp) pairs across all unique SGLs.
uniq_snps = uniq_sgl_df.loc[:, ['chr_snp', 'end_snp']].drop_duplicates()

In [20]:
# (number of distinct SNP positions, number of columns)
uniq_snps.shape

(2065, 2)

## Final reformatting for the SGL page DataTable

In [21]:
# Work on an independent copy so the reformatting below never alters
# uniq_sgl_df. The former argument-less `reindex()` call was removed: with no
# labels given it conforms the frame to its own index and changes nothing.
output_df = uniq_sgl_df.copy(deep=True)

# join sample names with ", "
def join_names(names):
    """Join sample names into a single ", "-separated string.

    Parameters
    ----------
    names : str or iterable of str
        The sample names to join. A bare string is treated as one name
        rather than being split into its characters.

    Returns
    -------
    str
        The names in their given order separated by ", "; an empty
        iterable yields "".
    """
    # Guard: iterating a plain string would yield single characters.
    if isinstance(names, str):
        return names
    # str.join consumes any iterable of strings directly.
    return ', '.join(names)
# Flatten each row's collection of sample names into one ", "-separated string.
output_df['sample_name'] = output_df['sample_name'].apply(join_names)

# Order rows by chr_snp, then startA_loop, then startB_loop.
output_df = output_df.sort_values(by=['chr_snp', 'startA_loop', 'startB_loop'])

In [22]:
# Publish the SNP columns under shorter names in the exported table.
output_df = output_df.rename(columns={'chr_snp': 'chr', 'end_snp': 'snp_bp'})

In [23]:
# Write the final table to outdir as a tab-separated file, without the row index.
fn = os.path.join(outdir, "finemap.sgls.t1d.tsv")
output_df.to_csv(fn, sep="\t", index=False)