### Imports and loading config data

In [None]:
from itertools import combinations_with_replacement
import pandas as pd
import numpy as np
import dcor
import yaml
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf, acf
from matplotlib.collections import PolyCollection, LineCollection
from matplotlib.lines import Line2D

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the YAML configuration that drives the rest of the notebook
with open('params_icgcl.yaml', 'rb') as f:
    conf = yaml.safe_load(f)

# Seaborn theme: plain white background
sns.set_style('white')

# Matplotlib defaults: LaTeX text rendering, sans-serif font,
# thick axes frame and long major ticks
plt.rc('text', usetex=True)
plt.rc('font', family="sans-serif")
plt.rc('text.latex', preamble=r"\usepackage{amsmath}\usepackage{amstext}")
plt.rcParams.update({
    "axes.linewidth": 2.50,
    'xtick.major.size': 20,
    'ytick.major.size': 20,
})

# Base font size used by the plotting functions below
fsz = 28

In [None]:
# General settings: whether figures are written, and which dataset entry is active
save_img, active_dataset = (
    conf["settings"][name] for name in ('save_img', "active_dataset")
)
dataset_config = conf["datasets"][active_dataset]

# Dataset-specific parameters, pulled from the active dataset's entry in one pass
(
    ec_hc_mask_folder,
    input_file,
    input_folder,
    output_folder,
    rep_list,
) = (
    dataset_config[name]
    for name in (
        "ec_hc_mask_folder",
        "input_file",
        "input_folder",
        "output_folder",
        "rep_list",
    )
)

In [None]:
# Plot colour (matplotlib colour code or hex string) for each treatment group
df_list_treat = {
    'WRN ASO': 'c',
    'HGPS ASO': '#66CDAA',
    'WT': 'g',
    'WRN SCR': 'r',
    'WRN NT': '#D70040',
    'HGPS SCR': 'm',
    'HGPS NT': '#8B008B',
}

### Data Preprocessing and Analysis functions

In [None]:
def data_preprocessing(df, expt_list):
    """Return gene coordinates plus column-normalised experiment values.

    Each column named in ``expt_list`` is divided by its own column total.

    Args:
        df: frame with 'Gene_ID', 'StartPos', 'EndPos' and the experiment columns.
        expt_list: names of the experiment columns to normalise.

    Returns:
        A new frame (fresh 0..n-1 index) holding the three gene columns
        followed by the normalised experiment columns; ``df`` is not modified.
    """
    counts = df[expt_list]
    column_totals = counts.sum(axis=0)
    result = df[['Gene_ID', 'StartPos', 'EndPos']].copy()
    result[expt_list] = counts.div(column_totals, axis="columns")
    return result.reset_index(drop=True)

### Correlation Analysis Functions

In [None]:
def get_pos_val(df, expt):
    """Extract gene intervals and the values of one experiment column.

    Args:
        df: frame with 'StartPos', 'EndPos' and the column ``expt``.
        expt: name of the experiment column to read.

    Returns:
        ``(intervals, values)`` where ``intervals`` is a list of
        ``[start, end]`` lists (one per row) and ``values`` is a NumPy array
        of the ``expt`` column, in the same row order.
    """
    intervals = df[['StartPos', 'EndPos']].values.tolist()
    values = np.array(df[expt])
    return intervals, values

In [None]:
def convert_to_sorted_dict(pos_tuple, tp_gene):
    """Map ``(start, end)`` intervals to values, ordered by interval.

    Args:
        pos_tuple: sequence of ``[start, end]`` pairs.
        tp_gene: sequence of values, indexed in parallel with ``pos_tuple``.

    Returns:
        A dict keyed by ``(start, end)`` tuples in ascending key order.
        When an interval occurs more than once, the last value wins.
    """
    by_interval = {
        (interval[0], interval[1]): tp_gene[row]
        for row, interval in enumerate(pos_tuple)
    }
    return {interval: by_interval[interval] for interval in sorted(by_interval)}

In [None]:
def get_distance(pos_tuple, tp_gene):
    """Group pairs of values by how many genes apart they are.

    Genes are first ordered by position via ``convert_to_sorted_dict``.
    The separation ("lag") of a pair is the difference between the two
    genes' ranks in that ordering, not the difference of their coordinates.

    Args:
        pos_tuple: sequence of ``[start, end]`` pairs.
        tp_gene: values indexed in parallel with ``pos_tuple``.

    Returns:
        Dict mapping lag (0, 1, 2, ...) to a list of
        ``(value_at_i, value_at_i_plus_lag)`` tuples. Lag 0 pairs every
        value with itself.
    """
    ordered_values = list(convert_to_sorted_dict(pos_tuple, tp_gene).values())
    pairs_by_lag = {}
    for first_rank, first_value in enumerate(ordered_values):
        # Walk from the gene itself (lag 0) to the last gene in the ordering
        for lag, second_value in enumerate(ordered_values[first_rank:]):
            pairs_by_lag.setdefault(lag, []).append((first_value, second_value))
    return pairs_by_lag

In [None]:
def calculate_binwise_correlation(bin):
    """Compute the distance correlation of the value pairs in every bin.

    Args:
        bin: dict mapping a bin index to a list of ``(x, y)`` tuples.

    Returns:
        Dict with the same keys (same order) whose values are
        ``dcor.distance_correlation`` of the x-values against the y-values.
    """
    def split_columns(pairs):
        # Transpose [(x0, y0), (x1, y1), ...] into an x array and a y array
        columns = list(zip(*pairs))
        return np.array(columns[0]), np.array(columns[1])

    return {
        bin_idx: dcor.distance_correlation(*split_columns(pairs))
        for bin_idx, pairs in bin.items()
    }

In [None]:
def get_acf_pacf_intercept(dict_values):
    """Return the last lag before the autocorrelation stops being significant.

    The autocorrelation of the dict's values (in dict order) is computed for
    every lag with a confidence interval at ``alpha=0.05``. A lag counts as
    significant while ``|acf|`` is at least the upper interval bound
    re-centred on zero.

    Args:
        dict_values: dict whose values form the series to analyse.

    Returns:
        Index of the first non-significant lag, minus one.

    Raises:
        IndexError: if every lag is significant.
    """
    series = np.array(list(dict_values.values()))
    autocorr, conf_int = acf(x=series, nlags=len(dict_values) - 1, alpha=0.05)
    # Upper confidence bound measured from the estimate itself
    upper_band = (conf_int - autocorr[:, None])[:, 1]
    significant = np.abs(autocorr) >= upper_band
    first_insignificant = np.flatnonzero(~significant)[0]
    return first_insignificant - 1

In [None]:
def get_and_plot_acf_pacf(dict_values_ec, dict_values_hc, save_img, save_plot_path):
    """Compute both autocorrelation intercepts and optionally save the figure.

    Thin wrapper around ``save_acf_pacf_plot`` that takes the last two
    arguments in the opposite order.

    Returns:
        ``(intercept_ec, intercept_hc)`` as returned by ``save_acf_pacf_plot``.
    """
    intercepts = save_acf_pacf_plot(dict_values_ec, dict_values_hc, save_plot_path, save_img)
    ec_intercept, hc_intercept = intercepts
    return ec_intercept, hc_intercept

### Visualization Functions

In [None]:
def save_cluster(df, save_plot_path):
    """Scatter the first three columns of ``df`` on log-log axes and save the figure.

    Points are coloured by the 'labels' column and plotted against the
    1-based row position.

    Side effect: the columns 'index' and 'log_index' are added to ``df``
    in place.

    Args:
        df: frame with a 'labels' column; its first three columns are plotted.
        save_plot_path: destination path passed to ``savefig``.
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(12, 10)
    fig.set_dpi(150)
    # Helper columns are written onto the caller's frame (kept for compatibility)
    df['index'] = range(1, len(df) + 1)
    df['log_index'] = np.log10(df['index'])
    palette = sns.color_palette(["#008000", "#FF0000"], 3)
    for column in df.columns[0:3]:
        sns.scatterplot(ax=ax, x="index", y=column, data=df, hue="labels",
                        s=fsz + 50, palette=palette, linewidth=0, alpha=0.7)

    ax.set_ylabel(r'$\alpha_t$', fontsize=fsz, color='k')
    # Raw string: '\m' is not a valid Python escape sequence
    ax.set_xlabel(r'gene expression rank ($\mathrm{r}_t$)', fontsize=fsz, color='k')
    ax.tick_params('both', which='major', length=7, labelsize=fsz)
    ax.tick_params('both', which='minor', length=7, labelsize=fsz)
    ax.set(xscale='log')
    ax.set(yscale='log')

    # Hand-built legend replacing the one seaborn generates from `hue`
    labels = ['High abundance transcript (HAT)', 'Low abundance transcript (LAT)']
    legend_elements = [
        Line2D([0], [0], marker='o', color='r', lw=0, markerfacecolor='r',
               label=labels[0], markersize=8),
        Line2D([0], [0], marker='o', color='g', lw=0, markerfacecolor='g',
               label=labels[1], markersize=8),
    ]
    ax.legend(handles=legend_elements, markerscale=2, loc='upper right',
              borderaxespad=0.05, fontsize=fsz - 4)
    fig.savefig(save_plot_path)
    plt.close(fig)

In [None]:
def plot_acf_colors(ax, markercolor="red", linecolor="black", facecolor="red", barcolor="red", linewidth=1):
    """Recolour the artists that ``plot_acf()`` drew on ``ax``.

    Args:
        ax: axes already populated by ``plot_acf``.
        markercolor: colour applied to every line artist (the markers).
        linecolor: colour applied afterwards to the first line artist.
        facecolor: fill colour for polygon collections (confidence band).
        barcolor: colour for line collections (the vertical bars).
        linewidth: width applied to the first line artist.

    Returns:
        The same ``ax``, modified in place.
    """
    for item in ax.collections:
        # isinstance (not type ==) so PolyCollection subclasses, such as the
        # one fill_between returns in recent Matplotlib, are matched as well
        if isinstance(item, PolyCollection):
            item.set_facecolor(facecolor)
        elif isinstance(item, LineCollection):
            item.set_color(barcolor)
    for line in ax.lines:
        line.set_color(markercolor)
    # The first line artist is restyled separately from the markers
    if ax.lines:
        ax.lines[0].set_color(linecolor)
        ax.lines[0].set_linewidth(linewidth)
    return ax

In [None]:
# This is used to plot Fig 1
def save_acf_pacf_plot(dict_values_ec, dict_values_hc, save_plot_path, save_img):
    """Compute the two autocorrelation intercepts and optionally plot them.

    Args:
        dict_values_ec: dict whose values form the first series (left panel, red).
        dict_values_hc: dict whose values form the second series (right panel, green).
        save_plot_path: destination of the PDF figure.
        save_img: when falsy, no figure is produced; intercepts are still returned.

    Returns:
        ``(intercept_ec, intercept_hc)`` from ``get_acf_pacf_intercept``.
    """
    intercept_ec = get_acf_pacf_intercept(dict_values_ec)
    intercept_hc = get_acf_pacf_intercept(dict_values_hc)
    if save_img:
        fig, ax = plt.subplots(1, 2)
        fig.set_size_inches(14, 7)
        fig.set_dpi(150)

        plot_acf(x=np.array(list(dict_values_ec.values())), lags=len(dict_values_ec) - 1, ax=ax[0])
        plot_acf(x=np.array(list(dict_values_hc.values())), lags=len(dict_values_hc) - 1, ax=ax[1])

        # Raw strings: '\m' and '\e' are not valid Python escape sequences
        for panel in ax:
            panel.tick_params('both', which='major', length=7, labelsize=fsz)
            panel.tick_params('both', which='minor', length=7, labelsize=fsz)
            panel.set_xlabel(r'$\ell$', fontsize=fsz, color='k')
        ax[0].set_ylabel(r'$\mathcal{R}(\ell)$', fontsize=fsz, color='k')
        ax[0].set_ylim(bottom=-0.5, top=1)

        ax[0] = plot_acf_colors(ax[0], markercolor="red", linecolor="black", facecolor="red", barcolor="red", linewidth=1)
        ax[1] = plot_acf_colors(ax[1], markercolor="green", linecolor="black", facecolor="green", barcolor="green", linewidth=1)

        for i in range(1, len(ax)):
            ax[i].set_ylim(ax[0].get_ylim())  # align axes
            ax[i].set_yticks([])  # set ticks to be empty (no ticks, no tick-labels)

        # Intercept markers are added after plot_acf_colors so that the
        # recolouring of line collections does not touch them
        ax[0].vlines(x=intercept_ec, ymin=-0.5, ymax=1, colors="black", ls='--', lw=2, alpha=1)
        ax[1].vlines(x=intercept_hc, ymin=-0.5, ymax=1, colors="black", ls='--', lw=2, alpha=1)
        fig.savefig(save_plot_path, format='pdf', bbox_inches='tight')
        plt.close(fig)
    return intercept_ec, intercept_hc

In [None]:
# This is used to plot Fig 2
def get_and_plot_dcor(bin_ec_idx_cor_dict, bin_hc_idx_cor_dict, cm_ec, cm_hc, save_img, save_plot_path):
    """Plot two binwise correlation curves on one axes and save as PDF.

    Args:
        bin_ec_idx_cor_dict: dict of bin index -> correlation for the first curve.
        bin_hc_idx_cor_dict: dict of bin index -> correlation for the second curve.
        cm_ec: line colour of the first curve.
        cm_hc: line colour of the second curve.
        save_img: when falsy, nothing is drawn or written.
        save_plot_path: destination of the PDF figure.

    Returns:
        None.
    """
    if not save_img:
        return
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(12, 8)
    fig.set_dpi(520)
    ax.plot(list(bin_ec_idx_cor_dict.keys()), list(bin_ec_idx_cor_dict.values()), color=cm_ec, linewidth=3)
    ax.plot(list(bin_hc_idx_cor_dict.keys()), list(bin_hc_idx_cor_dict.values()), color=cm_hc, linewidth=3)
    ax.tick_params('both', which='major', length=7, labelsize=fsz)
    ax.tick_params('both', which='minor', length=7, labelsize=fsz)
    # Raw strings: '\m', '\D' and '\e' are not valid Python escape sequences
    ax.set_ylabel(r'$\mathcal{C}(\Delta \ell)$', fontsize=fsz, color='k')
    ax.set_xlabel(r'$\Delta \ell$', fontsize=fsz, color='k')
    fig.savefig(save_plot_path, format='pdf', bbox_inches='tight')
    plt.close(fig)
    return

In [None]:
# This is used to plot Fig 7
def plot_for_rank(df, save_plot_path):
    """Plot rank-ordered expression curves with an inset tracking two reference genes.

    Main axes: for every column after the first, the values sorted in
    descending order are plotted against their rank on log-log axes.
    Inset: the rank position of two reference genes in each experiment,
    the first on the left (blue) axis and the second on the right (red) axis.

    Args:
        df: frame with a 'Gene_ID' column; every column after the first is
            treated as an experiment (presumably 'Gene_ID' is the first
            column; verify against caller).
        save_plot_path: destination path passed to ``savefig``.

    Raises:
        ValueError: if an experiment name contains none of the known
            treatment tags.
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(12, 10)
    fig.set_dpi(150)
    # Colour per treatment tag. Tags are matched as substrings of the
    # experiment name and the first matching tag wins, so the order matters.
    treatment_colors = {'ASO': 'c', 'WT': 'g', 'WRN': 'r', 'HGPS': 'm'}

    def color_for(expt_name):
        # Return the colour of the first treatment tag contained in the name
        for tag, tag_color in treatment_colors.items():
            if tag in expt_name:
                return tag_color
        raise ValueError(f"No treatment colour defined for experiment '{expt_name}'")

    gene_ref = 'ENSG00000117595'
    gene_ref_1 = 'ENSG00000162433'
    ind, ind_1 = [], []
    gene_val, gene_val_1 = [], []
    expt_val = []
    for col in df.columns[1:]:
        df_new = df[['Gene_ID', col]].sort_values(by=col, ascending=False, ignore_index=True)
        x = np.linspace(1, len(df_new), len(df_new))

        # Row position (0-based) and value of each reference gene after sorting
        is_ref = df_new['Gene_ID'] == gene_ref
        is_ref_1 = df_new['Gene_ID'] == gene_ref_1
        ind.extend(df_new.index[is_ref].tolist())
        ind_1.extend(df_new.index[is_ref_1].tolist())
        gene_val.extend(df_new.loc[is_ref, col].tolist())
        gene_val_1.extend(df_new.loc[is_ref_1, col].tolist())
        expt_val.append(col)
        ax.plot(x, df_new[col].to_numpy(), '*', label=col, color=color_for(col), alpha=1.0)

    ax.set_ylabel(r'$\alpha_t$', fontsize=fsz + 6, color='k')
    # Raw string: '\m' is not a valid Python escape sequence
    ax.set_xlabel(r'gene expression rank ($\mathrm{r}_t$)', fontsize=fsz + 6, color='k')
    ax.tick_params('both', which='major', length=7, labelsize=fsz)
    ax.tick_params('both', which='minor', length=7, labelsize=fsz)
    ax.set(xscale='log')
    ax.set(yscale='log')

    df_sort = pd.DataFrame(list(zip(ind, gene_val, expt_val)), columns=['ind', 'val', 'expt'])
    df_sort_1 = pd.DataFrame(list(zip(ind_1, gene_val_1, expt_val)), columns=['ind_1', 'val_1', 'expt'])

    col_order = ['WT_1_1', 'WT_2_1', 'WT_3_1',
                 'HGPS_917_L1_ASO_1', 'HGPS_478_L1_ASO_1', 'HGPS_297_L1_ASO_1',
                 'WRN_1_L1_ASO_1', 'WRN_2_L1_ASO_1', 'WRN_3_L1_ASO_1',
                 'HGPS_917_SCR_1', 'HGPS_478_SCR_1', 'HGPS_297_SCR_1',
                 'WRN_1_SCR_1', 'WRN_2_SCR_1', 'WRN_3_SCR_1',
                 'HGPS_917_NT_1', 'HGPS_478_NT_1', 'HGPS_297_NT_1',
                 'WRN_1_NT_1', 'WRN_2_NT_1', 'WRN_3_NT_1']

    # Order the experiments by the custom list via a temporary categorical column
    df_sort['expt_cat'] = pd.Categorical(df_sort['expt'], categories=col_order, ordered=True)
    df_sort = df_sort.sort_values('expt_cat').drop('expt_cat', axis=1).reset_index(drop=True)
    df_sort = pd.merge(df_sort, df_sort_1, on="expt")

    handles, labels = ax.get_legend_handles_labels()
    # Preferred legend order; indices beyond the number of plotted curves are
    # dropped and any additional curves are appended, so other column counts work
    preferred_order = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 20, 12, 13, 14, 15, 16, 17]
    order = [idx for idx in preferred_order if idx < len(handles)]
    order.extend(range(len(preferred_order), len(handles)))
    ax.legend([handles[idx] for idx in order], [labels[idx] for idx in order],
              markerscale=2, bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.05, fontsize=fsz - 10)

    x = np.linspace(1, len(df_sort), len(df_sort))
    left, bottom, width, height = [0.205, 0.33, 0.33, 0.33]
    ax1 = fig.add_axes([left, bottom, width, height])

    ax1.plot(x, df_sort['ind'], 'ob')
    ax1.set_xticks(x)
    ax1.set_xticklabels(list(df_sort['expt']))
    ax1.grid(False)
    ax1.set_ylabel(r'$\mathrm{r}_t$', fontsize=fsz, color='k')
    ax1.tick_params('both', which='major', length=7, labelsize=fsz - 12)
    ax1.tick_params('both', which='minor', length=7, labelsize=fsz - 12)
    # pyplot call acts on the current axes; kept before twinx() so it targets the inset
    plt.xticks(rotation=90)

    ax2 = ax1.twinx()
    # Same x positions as the blue series so both points line up with their tick label
    ax2.plot(x, df_sort['ind_1'], 'or')

    # Colour each experiment tick label by its treatment
    for tick_label, expt_name in zip(ax1.get_xticklabels(), df_sort['expt']):
        tick_label.set_color(color_for(expt_name))

    ax2.tick_params('y', which='major', length=7, labelsize=fsz - 12)
    ax2.tick_params('y', which='minor', length=7, labelsize=fsz - 12)
    # Left axis in blue (first reference gene), right axis in red (second one)
    ax1.yaxis.label.set_color('b')
    ax2.spines["left"].set_edgecolor('b')
    ax1.tick_params(axis='y', colors='b')
    ax2.yaxis.label.set_color('r')
    ax2.spines["right"].set_edgecolor('r')
    ax2.tick_params(axis='y', colors='r')
    fig.savefig(save_plot_path, bbox_inches='tight')
    plt.close(fig)

### Function to generate output Summary

In [None]:
def generate_summary_file(output_folder):
    """Merge the per-folder result CSVs into two summary files.

    For every immediate sub-folder ``<name>`` of ``output_folder`` that
    contains both ``<name>_auto_cor_ratio.csv`` and ``<name>_ec_genes_df.csv``,
    the rows are collected and written to ``auto_cor_ratio.csv`` and
    ``total_genes_df.csv`` in ``output_folder``. Sub-folders are processed in
    sorted name order; nested folders and folders missing either file are
    skipped.

    Args:
        output_folder: directory holding the per-chromosome result folders.
    """
    # Only the top level of the tree holds result folders, so take the first
    # os.walk entry instead of recursing
    _, subfolders, _ = next(os.walk(output_folder), (output_folder, [], []))

    ratio_frames = []
    gene_count_frames = []
    for name in sorted(subfolders):
        ratio_file = os.path.join(output_folder, name, name + "_auto_cor_ratio.csv")
        genes_file = os.path.join(output_folder, name, name + "_ec_genes_df.csv")
        if not (os.path.isfile(ratio_file) and os.path.isfile(genes_file)):
            print(f"Result files not found in folder {name}. Skipping...")
            continue
        ratio_frames.append(pd.read_csv(ratio_file))
        gene_count_frames.append(pd.read_csv(genes_file))

    # pd.concat rejects an empty list, so fall back to an empty frame
    auto_cor_ratio_df = pd.concat(ratio_frames, ignore_index=True) if ratio_frames else pd.DataFrame()
    total_genes_df = pd.concat(gene_count_frames, ignore_index=True) if gene_count_frames else pd.DataFrame()
    auto_cor_ratio_df.to_csv(os.path.join(output_folder, "auto_cor_ratio.csv"), index=False)
    total_genes_df.to_csv(os.path.join(output_folder, "total_genes_df.csv"), index=False)

### Main Processing Pipeline

In [None]:
# Per-chromosome pipeline: for every experiment, bin the precomputed
# high/low expression tables by gene separation, compute binwise distance
# correlations, locate the autocorrelation intercepts, and write one-row
# result CSVs per chromosome before building the overall summary.
df = pd.read_csv(input_file)
df['Chromosome'] = df['Chromosome'].astype(str)
df.sort_values("Gene_ID", inplace=True)
count = 0  # not used in this cell; kept in case later cells rely on it
expt_list = [item for sublist in rep_list for item in sublist]
summary_columns = ['chromosome_id'] + expt_list

for chromosome, data in df.groupby('Chromosome'):
    # Skip MT, Y and any chromosome name containing non-alphanumeric characters
    if not chromosome.isalnum() or chromosome in ('MT', 'Y'):
        continue

    # Check the precomputed mask first, so that no empty output folder is
    # left behind for a skipped chromosome
    precomp_file = os.path.join(ec_hc_mask_folder, f"chromosome_{chromosome}", f"chromosome_{chromosome}_ec_hc_mask_xy.csv")
    if not os.path.exists(precomp_file):
        print(f"Precomputed HAT/LAT data not found for chromosome {chromosome}. Skipping...")
        continue

    chromosome_output_folder = os.path.join(output_folder, "chromosome_" + str(chromosome))
    Path(chromosome_output_folder).mkdir(parents=True, exist_ok=True)
    # Neither frame is read below; kept in case later cells rely on them
    normalized_df = data_preprocessing(data, expt_list)
    new_df = pd.read_csv(precomp_file)

    # Results keyed by experiment, so experiments that were not processed
    # can be written as NaN and the row always matches the column list
    auto_cor_ratio_by_expt = {}
    total_genes_by_expt = {}

    data_hi_folder = os.path.join(ec_hc_mask_folder, f"chromosome_{chromosome}", "data_hi")
    data_lo_folder = os.path.join(ec_hc_mask_folder, f"chromosome_{chromosome}", "data_lo")
    for rep in rep_list:
        data_hi_file = os.path.join(data_hi_folder, f"chromosome_{chromosome}_rep_{rep[0]}_data_hi.csv")
        data_lo_file = os.path.join(data_lo_folder, f"chromosome_{chromosome}_rep_{rep[0]}_data_lo.csv")

        if not (os.path.exists(data_hi_file) and os.path.exists(data_lo_file)):
            # Remaining replicates of this chromosome are left as NaN
            print(f"Data files not found for replicate {rep[0]} in chromosome {chromosome}. Exiting...")
            break
        data_hi = pd.read_csv(data_hi_file, index_col=0)
        data_lo = pd.read_csv(data_lo_file, index_col=0)

        for expt in rep:
            print("Processing Chromosome :" + str(chromosome) + " and experiment " + str(expt))
            positions_ec, tp_gene_ec = get_pos_val(data_hi, expt)
            positions_hc, tp_gene_hc = get_pos_val(data_lo, expt)
            bin_ec_idx_dis_dict = get_distance(positions_ec, tp_gene_ec)
            bin_hc_idx_dis_dict = get_distance(positions_hc, tp_gene_hc)
            bin_ec_idx_cor_dict = calculate_binwise_correlation(dict(sorted(bin_ec_idx_dis_dict.items())))
            bin_hc_idx_cor_dict = calculate_binwise_correlation(dict(sorted(bin_hc_idx_dis_dict.items())))
            # Let's calculate the intercept for both ec and hc
            intercept_corr_ec, intercept_corr_hc = get_and_plot_acf_pacf(
                bin_ec_idx_cor_dict, bin_hc_idx_cor_dict, save_img,
                os.path.join(chromosome_output_folder, (expt + "_autocorr_HAT_LAT.pdf")))
            get_and_plot_dcor(
                bin_ec_idx_cor_dict, bin_hc_idx_cor_dict, "red", "green", save_img,
                os.path.join(chromosome_output_folder, (expt + "_dcor_HAT_LAT.pdf")))

            # Intercept (as a 1-based count) per 1000 genes of this chromosome
            scaling_factor = (len(positions_ec) + len(positions_hc))
            intercept_corr_ec = (intercept_corr_ec + 1) * 1000 / scaling_factor
            auto_cor_ratio_by_expt[expt] = intercept_corr_ec
            total_genes_by_expt[expt] = scaling_factor
            print("intercept_corr_ec", intercept_corr_ec, scaling_factor)

    # One-row frames built directly (DataFrame.append no longer exists in pandas 2.x)
    auto_cor_ratio_df = pd.DataFrame(
        [[chromosome] + [auto_cor_ratio_by_expt.get(expt, np.nan) for expt in expt_list]],
        columns=summary_columns)
    auto_cor_ratio_df.to_csv(os.path.join(chromosome_output_folder, "chromosome_" + str(chromosome) + "_auto_cor_ratio.csv"), index=False)
    total_genes_df = pd.DataFrame(
        [[chromosome] + [total_genes_by_expt.get(expt, np.nan) for expt in expt_list]],
        columns=summary_columns)
    total_genes_df.to_csv(os.path.join(chromosome_output_folder, "chromosome_" + str(chromosome) + "_ec_genes_df.csv"), index=False)
generate_summary_file(output_folder)