### Imports

In [None]:
# Package imports
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
# import libraries for biological data analysis
from coolpuppy import coolpup
from coolpuppy.lib import numutils
from coolpuppy.lib.numutils import get_domain_score, get_enrichment
from coolpuppy.lib.puputils import accumulate_values
from coolpuppy.lib.puputils import divide_pups
from coolpuppy import plotpup

from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1 import make_axes_locatable
import cooler
import bioframe
import cooltools
from cooltools import expected_cis, expected_trans
from cooltools.lib import plotting
import cooler as clr
import h5py

import time

### Set directory to mcools

In [None]:
# Directory that holds the .mcool files.
file_path = ""
# File names of the .mcool files to analyse; each one is joined onto `file_path`.
mcool_files = [""]
# Resolution used for the analysis.
resolution = 5000

### Compute expected

In [None]:
# Build a chromosome-arm view for mm39.
# Chromosome sizes and centromere positions are fetched with bioframe
# (centromeres from the UCSC provider); the coordinate columns are cast to int.
mm39_chromsizes = bioframe.fetch_chromsizes("mm39")
mm39_cens = bioframe.fetch_centromeres("mm39", provider="ucsc").astype(
    {"start": int, "end": int, "mid": int}
)

# make_viewframe automatically assigns names to the chromosomal arms.
mm39_arms = bioframe.make_viewframe(
    bioframe.make_chromarms(mm39_chromsizes, mm39_cens)
)
mm39_arms

In [None]:
# Compute the cis expected for every mcool file ... takes some time.
# The resulting tables are kept in `expected_df_list` (same order as
# `mcool_files`) and also written to disk for use from the CLI.
expected_df_list = []

# One output table per mcool file, written under `expected/`.
# Note: the output name keeps only the part of the file name before the first ".".
output_names = ['expected/' + name.split(".")[0] + '_expected_cis.tsv' for name in mcool_files]

# Create the output directory up front so that a long computation is not lost
# to a missing directory when the table is saved.
os.makedirs('expected', exist_ok=True)

for mcool_file, output_name in zip(mcool_files, output_names):
    # NOTE(review): `read_cooler` is not defined in the visible part of this
    # notebook; it must already be in scope when this cell runs.
    curr_cooler = read_cooler(os.path.join(file_path, mcool_file), resolution)
    expected = expected_cis(
        curr_cooler,
        ignore_diags=0,
        view_df=mm39_arms,
        chunksize=1000000)
    expected_df_list.append(expected)
    expected.to_csv(output_name, sep='\t', index=False, header=True) # Let's save to use in CLI

### Compute enrichment scores

#### cohesin loops

In [None]:
# Directory that holds the loop-call files.
loops_path = ""

# One cohesin loop file name per mcool file; each is joined onto `loops_path`.
tsv_files = [""] * len(mcool_files)
cohesin_loops_scores = get_all_domain_scores(
    mcool_files,
    tsv_files,
    file_path,
    loops_path,
    resolution,
    expected_df_list,
    mm39_arms,
)

#### ep loops

In [None]:
# One ep loop file name per mcool file; each is joined onto `loops_path`.
tsv_files = [""] * len(mcool_files)
ep_loops_scores = get_all_domain_scores(
    mcool_files,
    tsv_files,
    file_path,
    loops_path,
    resolution,
    expected_df_list,
    mm39_arms,
)

#### pp loops

In [None]:
# One pp loop file name per mcool file; each is joined onto `loops_path`.
tsv_files = [""] * len(mcool_files)
pp_loops_scores = get_all_domain_scores(
    mcool_files,
    tsv_files,
    file_path,
    loops_path,
    resolution,
    expected_df_list,
    mm39_arms,
)

#### zfp143 loops

In [None]:
# One zfp143 loop file name per mcool file; each is joined onto `loops_path`.
tsv_files = [""] * len(mcool_files)
znf_loops_scores = get_all_domain_scores(
    mcool_files,
    tsv_files,
    file_path,
    loops_path,
    resolution,
    expected_df_list,
    mm39_arms,
)

### Make plots

#### Combine dataframes

In [None]:
# Merge the per-file score tables of the zfp143 loops into one wide table.
df_merged_znf = reduce(pd_merge_all_domains, enumerate(znf_loops_scores))[1]

# Same merge for every loop class, in the order given by `loop_types` below.
all_dfs = [
    reduce(pd_merge_all_domains, enumerate(loop_scores))[1]
    for loop_scores in (
        cohesin_loops_scores,
        ep_loops_scores,
        pp_loops_scores,
        znf_loops_scores,
    )
]
loop_types = ["cohesin", "ep", "pp", "znf143"]
plot_df_list = []
# Condition names; each one is looked up as the column "domain_score_<name>".
condition_list = [""]
for df, loop_type in zip(all_dfs, loop_types):
    # One table per loop class: a column per condition plus the class label.
    plot_df = pd.DataFrame(columns=["loop_type", *condition_list])

    for score in condition_list:
        plot_df[score] = df[f"domain_score_{score}"]
    plot_df["loop_type"] = loop_type
    plot_df_list.append(plot_df)

# Stack all loop classes into a single long table for plotting.
plot_df = pd.concat(plot_df_list, ignore_index=True)
plot_df

#### Make plot

In [None]:
# Scatter plots of the per-loop scores, one panel per pair of conditions,
# coloured by loop class.
plot_pairs = [("", "")] # specify condition pairs as (x, y) tuples
plot_dir = "" # output directory

# Size the grid from the number of pairs instead of a fixed three panels, so
# that extra pairs do not raise an IndexError and missing pairs do not leave
# empty axes. Each panel keeps 6 x 6 inches (three pairs -> 18 x 6 as before).
n_panels = max(len(plot_pairs), 1)
fig, axs = plt.subplots(1, n_panels, figsize=(6 * n_panels, 6), squeeze=False)
axs = axs[0]  # squeeze=False always returns a 2-D array; keep the single row
for i, pair in enumerate(plot_pairs):
    sns.scatterplot(data=plot_df, x=pair[0], y=pair[1], hue="loop_type", alpha=0.05, ax=axs[i])
    # Same fixed range on both axes so that panels are directly comparable.
    axs[i].set_xlim([0, 45])
    axs[i].set_ylim([0, 45])
    axs[i].set_aspect('equal')
sns.despine(fig=fig)

plt.savefig(os.path.join(plot_dir, "name.svg"), format='svg')

# Functions

In [None]:
from functools import reduce
all_suffixes = [] # list of suffixes indicating which mcool file
suff_idx = 0 

# Module-level accumulator filled by `add_domain_score`: one list per column,
# one entry per processed snippet.
ALL_DOMAIN_SCORES = {"domain_score": [], "chrom1": [], "start1": [], "end1": [], 
                     "chrom2": [], "start2": [], "end2": []}

def pd_merge_all_domains(left, right):
    """
    Helper for merging pandas dataframes with `functools.reduce` over
    `enumerate(list_of_dataframes)`.

    Parameters
    ----------
    left, right : tuple of (int, pandas.DataFrame)
        Index/dataframe pairs. The index selects the entry of the module-level
        `all_suffixes` list that labels the dataframe's `domain_score` column.

    Returns
    -------
    tuple of (int, pandas.DataFrame)
        The index of `right` and the inner join of both dataframes on the six
        loop coordinate columns.

    The `domain_score` column of each side is renamed explicitly before the
    merge. `pd.merge` only applies `suffixes` to column names present on both
    sides, so relying on it leaves the last `domain_score` column unsuffixed
    when an odd number of dataframes is reduced.
    """
    key_columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
    left_idx, left_df = left
    right_idx, right_df = right
    left_suffix = all_suffixes[left_idx]
    right_suffix = all_suffixes[right_idx]

    # An already merged left side has no plain `domain_score` column any more,
    # hence the membership checks.
    if "domain_score" in left_df.columns:
        left_df = left_df.rename(columns={"domain_score": "domain_score" + left_suffix})
    if "domain_score" in right_df.columns:
        right_df = right_df.rename(columns={"domain_score": "domain_score" + right_suffix})

    # `suffixes` is still passed for any other column the two sides share.
    pd_merged = pd.merge(left_df, right_df, how='inner', on=key_columns,
                         suffixes=(left_suffix, right_suffix))
    return right_idx, pd_merged
    
def add_domain_score(snippet):
    """
    Post-processing hook for coolpuppy snippets.

    Stores `get_enrichment(snippet['data'], 3)` in the snippet under
    'domain_score', then appends the snippet's value for every key of the
    module-level ALL_DOMAIN_SCORES to the matching list. The (modified)
    snippet is returned.
    """
    # NOTE(review): 3 is presumably the flank around the central pixel used by
    # coolpuppy's get_enrichment - confirm against the installed version.
    snippet['domain_score'] = get_enrichment(snippet['data'], 3)
    for column, collected in ALL_DOMAIN_SCORES.items():
        collected.append(snippet[column])
    return snippet

def reset_all_domain_scores():
    """
    Empty every list in the module-level ALL_DOMAIN_SCORES in place.
    The keys and the list objects themselves are kept.
    """
    for collected in ALL_DOMAIN_SCORES.values():
        collected.clear()

def get_all_domain_scores(mcool_files, tsv_files, file_path, loop_path, resolution, 
                          expected_df_list, view_df):
    """
    Function that computes an enrichment score for all loops in each file of
    `tsv_files` for the matching mcool file at `resolution`.

    Parameters
    ----------
    mcool_files : sequence of str
        mcool file names, joined onto `file_path`.
    tsv_files : sequence of str
        Loop file names (tab-separated, no header row, six coordinate
        columns), joined onto `loop_path`. One per mcool file.
    file_path : str
        Directory of the mcool files.
    loop_path : str
        Directory of the loop files.
    resolution : int
        Resolution passed to `read_cooler` and `coolpup.CoordCreator`.
    expected_df_list : sequence
        Expected tables passed to `coolpup.PileUpper`. One per mcool file.
    view_df
        View passed to `coolpup.PileUpper`.

    Returns
    -------
    list of pandas.DataFrame
        One dataframe per mcool file with the columns of ALL_DOMAIN_SCORES.

    Raises
    ------
    ValueError
        If `mcool_files`, `tsv_files` and `expected_df_list` differ in length.
        `zip` would otherwise silently drop the surplus entries.
    """
    mcool_files = list(mcool_files)
    tsv_files = list(tsv_files)
    expected_df_list = list(expected_df_list)
    if not (len(mcool_files) == len(tsv_files) == len(expected_df_list)):
        raise ValueError(
            "mcool_files, tsv_files and expected_df_list must have the same length, got "
            f"{len(mcool_files)}, {len(tsv_files)} and {len(expected_df_list)}"
        )

    loop_columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
    domain_scores_list = []
    for mcool_file, tsv_file, expected_df in zip(mcool_files, tsv_files, expected_df_list):
        # The scores are collected through the module-level ALL_DOMAIN_SCORES,
        # so it has to be emptied before every file.
        reset_all_domain_scores()
        loop_df = pd.read_csv(os.path.join(loop_path, tsv_file), sep="\t", 
                      names=loop_columns)
        # NOTE(review): `read_cooler` is not defined in the visible part of
        # this notebook; it must already be in scope when this runs.
        current_cooler = read_cooler(os.path.join(file_path, mcool_file), resolution=resolution)
        cc = coolpup.CoordCreator(loop_df, resolution, rescale_flank=1) #, chroms=chrom_list)
        pp = coolpup.PileUpper(current_cooler, cc, expected=expected_df, view_df=view_df,
                              rescale_size=41, rescale=True)
        # Only the side effect of `add_domain_score` is needed; the pileup
        # itself is discarded.
        pp.pileupsWithControl(postprocess_func=add_domain_score)
        # Snapshot the lists so the next reset cannot affect this dataframe.
        domain_scores_list.append(
            pd.DataFrame({key: list(values) for key, values in ALL_DOMAIN_SCORES.items()})
        )
        print(domain_scores_list[-1].shape)
    return domain_scores_list
