### Imports and loading config data

In [None]:
# Main script to generate energy landscape for each chromosome
import pandas as pd
import numpy as np
import yaml
import os
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from scipy.ndimage import gaussian_filter

import warnings
warnings.filterwarnings('ignore')

In [None]:
## Load configuration file
with open('params_cel.yaml', 'rb') as f:
    conf = yaml.safe_load(f.read())   

## plot settings
plt.rc('text', usetex = True)
plt.rc('font', **{'family' : "sans-serif"})
plt.rc('text.latex', preamble=r"\usepackage{amsmath}"
           r"\usepackage{amstext}")
plt.rcParams["axes.linewidth"] = 2.50
plt.rcParams['xtick.major.size'] = 20
plt.rcParams['ytick.major.size'] = 20
sns.set_style(style='white')
%matplotlib inline
fsz = 28

In [None]:
# General settings: whether to save figures, and which dataset entry is active
general_settings = conf['settings']
save_img = general_settings['save_img']
active_dataset = general_settings["active_dataset"]
dataset_config = conf["datasets"][active_dataset]

# Dataset-specific parameters, read from the active dataset's entry
(
    input_file,
    output_folder,
    ec_hc_mask_folder,
    experiment_groups,
    cross_group_pairs,
) = (
    dataset_config[key]
    for key in (
        'input_file',
        'output_folder',
        'ec_hc_mask_folder',
        'experiment_groups',
        'cross_group_pairs',
    )
)

### Data preprocessing

In [None]:
def data_preprocessing(df, expt_list):
    """Scale each experiment column by its own maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Gene_ID`` column and every column named in
        ``expt_list``. It is not modified.
    expt_list : iterable of str
        Names of the experiment columns to normalise. A name listed more
        than once is simply computed again and yields a single column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Gene_ID`` with one column per experiment, each holding
        ``df[expt] / df[expt].max()``.
    """
    # Work on an explicit copy: adding columns to a bare ``df[['Gene_ID']]``
    # selection triggers pandas' SettingWithCopyWarning.
    normalized_df = df[['Gene_ID']].copy()
    for expt in expt_list:
        normalized_df[expt] = df[expt] / df[expt].max()
    return normalized_df.set_index('Gene_ID')

In [None]:
class StopExecution(Exception):
    """Exception whose traceback-rendering hook produces nothing."""

    def _render_traceback_(self):
        # Intentionally empty: the hook returns None.
        return None

### Read and aggregate HAT and LAT gene data

In [None]:
# Running element-wise total of every mask table found under the mask folder.
df_ec_hc = pd.DataFrame()
counter = 0  # number of mask files merged so far

for chromosome_dir in os.listdir(ec_hc_mask_folder):
    f = os.path.join(ec_hc_mask_folder, chromosome_dir)
    if not os.path.isdir(f):
        continue  # only sub-directories are scanned
    for filename in os.listdir(f):
        if not filename.endswith("_ec_hc_mask_xy.csv"):
            continue  # not a mask file
        file_path = os.path.join(f, filename)
        # Drop the positional columns and key the remaining ones by gene.
        df_ec_hc_temp = (
            pd.read_csv(file_path)
            .drop(columns=['StartPos', 'EndPos'])
            .set_index('Gene_ID')
        )
        # The first file seeds the total; later files are added to it, with
        # entries missing on either side counted as zero.
        df_ec_hc = (
            df_ec_hc_temp
            if counter == 0
            else df_ec_hc.add(df_ec_hc_temp, fill_value=0)
        )
        counter += 1

# Keep only genes whose summed mask values are strictly positive.
df_ec_hc['sum'] = df_ec_hc.sum(axis=1)
df_ec_hc = df_ec_hc[df_ec_hc['sum'] > 0]
index_ec_genes = list(df_ec_hc.index)

### Analysis and helper functions

In [None]:
def calc_statistics(df, expt, is_from=True):
    """Rank the genes of one experiment by descending value.

    ``df`` must be indexed by ``Gene_ID`` and contain the column ``expt``.
    Rows are ordered by ``expt`` and then ``Gene_ID``, both descending, and
    three columns are added. Their names end in ``frm``/``from`` when
    ``is_from`` is truthy and in ``to`` otherwise:

    * ``rank_*``      -- 1 for the first row up to N for the last;
    * ``norm_rank_*`` -- the reversed rank rescaled so the first row gets 1
      and the last row gets 0;
    * ``weight_*``    -- a copy of the ``expt`` values.

    Returns the ranked frame indexed by ``Gene_ID``.
    """
    if is_from:
        rank_col, norm_col, weight_col = 'rank_frm', 'norm_rank_frm', 'weight_from'
    else:
        rank_col, norm_col, weight_col = 'rank_to', 'norm_rank_to', 'weight_to'

    ranked = df[[expt]].reset_index()
    ranked = ranked.sort_values([expt, 'Gene_ID'], ascending=False)

    positions = np.arange(1, len(ranked) + 1)
    ranked[rank_col] = positions
    ranked[norm_col] = positions[::-1]
    # Rescale the reversed rank from [1, N] onto [0, 1].
    ranked[norm_col] = (ranked[norm_col] - 1) / (ranked[norm_col].max() - 1)
    ranked[weight_col] = ranked[expt]  # Using just the transcript abundance

    return ranked.set_index('Gene_ID')

In [None]:
# Aggregate all the weights across different replicate combinations for a cross grp
def create_transition_matrix(frm_expt_lst, to_expt_lst, expression_df=None):
    """Average the rank-to-rank weight-difference matrix over replicate pairs.

    For every (from, to) pair drawn from the two replicate lists, each gene
    contributes ``weight_to - weight_from`` to the cell located at row
    ``rank_to`` and column ``rank_frm``; all other cells are 0. The
    per-pair matrices are summed and divided by the number of pairs.

    Parameters
    ----------
    frm_expt_lst, to_expt_lst : list of str
        Column names of the replicates of the source and target groups.
    expression_df : pandas.DataFrame, optional
        Table with a ``Gene_ID`` column and the replicate columns. When
        omitted, the notebook-level variable ``data`` is used, which is what
        the two-argument call has always relied on.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by ``rank_to``, columns by ``rank_frm``. An empty frame
        is returned when either replicate list is empty.
    """
    # It takes all possible combinations between the two samples
    replicate_pairs = list(itertools.product(frm_expt_lst, to_expt_lst))
    if not replicate_pairs:
        return pd.DataFrame()

    if expression_df is None:
        expression_df = data  # notebook global set by the main workflow loop

    # The normalisation does not depend on the pair, so do it once.
    normalized_df = data_preprocessing(expression_df, frm_expt_lst + to_expt_lst)

    transition_mtrx = None
    for frm_expt, to_expt in replicate_pairs:
        frm_stats = calc_statistics(normalized_df, frm_expt, True)
        to_stats = calc_statistics(normalized_df, to_expt, False)
        paired = pd.concat([frm_stats, to_stats], axis=1)
        paired['diff_weight'] = paired['weight_to'] - paired['weight_from']

        # pandas >= 2.0 only accepts keyword arguments for pivot().
        pair_matrix = paired.pivot(
            index='rank_to', columns='rank_frm', values='diff_weight'
        ).fillna(0)

        if transition_mtrx is None:
            transition_mtrx = pair_matrix
        else:
            transition_mtrx = transition_mtrx.add(pair_matrix, fill_value=0)

    return transition_mtrx / len(replicate_pairs)

In [None]:
def smooth_transition_mtrx(filter_size, transition_mtrx):
    """Return ``transition_mtrx`` after a Gaussian filter of sigma ``filter_size``.

    The input is left untouched; the smoothed values come back as a new array.
    """
    return gaussian_filter(transition_mtrx, sigma=filter_size)

### Plotting functions

In [None]:
def _format_group_label(label):
    """Turn a group name such as ``A_B`` into the hyphenated axis label."""
    if label == 'WT':
        return label
    parts = label.split('_')
    if len(parts) < 2:
        # Nothing to re-join; previously this case raised IndexError.
        return label
    # Only the first two underscore-separated pieces are used.
    joiner = '-L1-' if parts[1] == 'ASO' else '-'
    return parts[0] + joiner + parts[1]


def plot_energy_landscape(energy, chromosome_output_folder, cross_grp,
                          vmin=-0.00007, vmax=0.00004):
    """Save the energy matrix as a diverging heat map centred on zero.

    Parameters
    ----------
    energy : 2-D array-like
        Matrix to draw.
    chromosome_output_folder : str
        Directory that receives ``<cross_grp>_hm.png``.
    cross_grp : str
        Name of the form ``"<from>=>"<to>"``; the part before ``=>`` labels
        the x axis and the part after it labels the y axis. Underscores in
        the group names are replaced by hyphens.
    vmin, vmax : float, optional
        Colour-scale limits; the defaults are the values the notebook has
        always used.

    The figure is written as a 512-dpi PNG and then closed. Font sizes come
    from the notebook-level ``fsz``.
    """
    plt.figure(figsize=(12, 10))
    try:
        heatmap = sns.heatmap(energy, cmap=plt.cm.seismic, robust=True, center=0,
                              xticklabels=100, yticklabels=100, vmin=vmin, vmax=vmax)
        # Removing the underscore from the labels and add hyphen instead
        lab = [_format_group_label(part) for part in cross_grp.split('=>')]

        heatmap.set_xlabel(lab[0], fontsize=fsz, color='k')
        heatmap.set_ylabel(lab[1], fontsize=fsz, color='k')
        heatmap.tick_params(axis='y', length=1)
        heatmap.tick_params(axis='x', length=1)
        heatmap.set_xticklabels(heatmap.get_xmajorticklabels(), fontsize=fsz)
        # Use the y axis' own labels (the x labels were reused here before).
        heatmap.set_yticklabels(heatmap.get_ymajorticklabels(), fontsize=fsz)

        # The colour bar is the last axes added to the figure.
        cax = heatmap.figure.axes[-1]
        cax.set_ylabel(r'$\mathrm{d} U/k_{\mathrm{B}}T$', size=fsz+4)
        cax.tick_params(labelsize=fsz, length=1)
        # This will also keep the size of 10^(-4) that will go on top of colorbar
        cax.yaxis.offsetText.set(size=fsz-8)

        for spine in heatmap.spines.values():
            spine.set_visible(True)
            spine.set_linewidth(2)
            spine.set_color("black")

        filename = cross_grp + "_hm.png"
        plt.savefig(os.path.join(chromosome_output_folder, filename),
                    format='png', bbox_inches='tight', dpi=512)
    finally:
        # Release the figure even if drawing or saving failed.
        plt.close()

In [None]:
# Let's try to get the area of the above triangle and area of the lower triangle
def plot_auc_2D(energy):
    """Return the normalised |energy| area of the upper and lower triangles.

    Despite its name this function draws nothing. The matrix is split into
    its upper and lower triangles (both include the diagonal). For each
    triangle the absolute values are integrated with the trapezoidal rule,
    first along every row and then across the row results. The same double
    integral of a 0/1 mask marking the triangle's non-zero cells gives the
    normalisation.

    Parameters
    ----------
    energy : 2-D square array-like

    Returns
    -------
    (area_ut, area_lt) : tuple of float
        Normalised areas of the upper and lower triangle. A triangle with no
        non-zero cell has a zero normalisation and yields ``nan``.

    Raises
    ------
    ValueError
        If ``energy`` is not a square 2-D matrix.
    """
    energy = np.asarray(energy)
    if energy.ndim != 2 or energy.shape[0] != energy.shape[1]:
        raise ValueError("energy must be a square 2-D matrix")

    # NumPy 2.0 renamed trapz to trapezoid; fall back for older releases.
    integrate = getattr(np, 'trapezoid', None) or np.trapz

    def _double_integral(matrix):
        # Integrate along each row, then across the per-row results.
        return integrate(integrate(matrix, axis=1), axis=0)

    def _normalized_area(triangle):
        # Scaling factor: the same integral over ones wherever the triangle
        # is non-zero.
        support = np.where(triangle != 0, 1, 0)
        return _double_integral(np.abs(triangle)) / _double_integral(support)

    area_ut = _normalized_area(np.triu(energy))
    area_lt = _normalized_area(np.tril(energy))
    return area_ut, area_lt

### Main workflow pipeline for Energy Landscape

In [None]:
# Per-chromosome pipeline: build the transition matrix of every cross-group
# pair, smooth it at each Gaussian filter size, convert it to an energy
# matrix, and write the upper/lower-triangle areas to one CSV per chromosome.
df = pd.read_csv(input_file)
box_df = pd.DataFrame(columns=['experiment','chromosome','energy'])
filters = np.linspace(0,5,11)
df['Chromosome']=df['Chromosome'].astype(str)

for chromosome,data in df.groupby('Chromosome'):
    # Only alphanumeric chromosome names other than MT and Y are processed.
    if not chromosome.isalnum() or chromosome in ('MT', 'Y'):
        continue
    print("Processing chromosome = "+str(chromosome))

    # Filter to retain only EC genes. Done once per chromosome; `data` is
    # also the notebook global that create_transition_matrix reads.
    data = data[data['Gene_ID'].isin(index_ec_genes)]
    if data.empty:
        # No gene left to rank; the matrix steps below cannot handle that.
        print("Skipping chromosome = "+str(chromosome)+" (no genes left after filtering)")
        continue

    chromosome_output_folder = os.path.join(output_folder,"chromosome_"+str(chromosome))
    Path(chromosome_output_folder).mkdir(parents=True, exist_ok=True)

    # The unsmoothed matrix does not depend on the filter size, so build it
    # once per cross group instead of once per (filter, cross group). This
    # keeps one matrix per cross group in memory for the chromosome.
    col_label = list(cross_group_pairs)
    raw_transition = {}
    for cross_grp in col_label:
        cross_grp_vals = cross_group_pairs[cross_grp]
        frm_expt_lst = experiment_groups[cross_grp_vals[0]]
        to_expt_lst = experiment_groups[cross_grp_vals[1]]
        raw_transition[cross_grp] = create_transition_matrix(frm_expt_lst,to_expt_lst)

    appended_area = []
    for filter_size in filters:
        print("Analyzing for filter size = " + str(filter_size))
        area_data_ut = [filter_size]  # first cell of the row is the filter size
        area_data_lt = []
        area_data_diff = []
        for cross_grp in col_label:
            print(cross_grp)

            transition_mtrx = smooth_transition_mtrx(filter_size, raw_transition[cross_grp])
            # Shift every row by |row minimum| + 1 so all entries are positive.
            shifted_org = abs(transition_mtrx.min(axis=1, keepdims=True))+1
            transition_mtrx = transition_mtrx+shifted_org

            # Convert matrix to probability (row-wise normalisation)
            row_totals = transition_mtrx.sum(axis=1, keepdims=True)
            shifted_org_prob = shifted_org/row_totals
            prob_mtrx = transition_mtrx/row_totals

            # Calculate energy and reshift the origin
            energy = -np.log(prob_mtrx)
            energy_org = -np.log(shifted_org_prob)
            energy = energy-energy_org # subtraction as the energies are always positive due to negative log

            # Heat maps are only drawn for the filter size of 3; compare with
            # a tolerance because the sizes are floats.
            if save_img and np.isclose(filter_size, 3):
                plot_energy_landscape(energy,chromosome_output_folder,cross_grp)

            area_ut,area_lt = plot_auc_2D(energy)

            area_data_ut.append(area_ut)
            area_data_lt.append(area_lt)
            area_data_diff.append(area_ut-area_lt)
        # One row per filter size: size, all _ut, all _lt, all _diff values
        appended_area.append(area_data_ut+area_data_lt+area_data_diff)

    area_data_df = pd.DataFrame(appended_area)
    area_data_df.columns = ["filter_size"]+[s + '_ut' for s in col_label]+[s + '_lt' for s in col_label]+[s + '_diff' for s in col_label]
    area_data_df.to_csv(os.path.join(chromosome_output_folder,"AUC_energy_df_chromosome_"+str(chromosome)+".csv"),index=False)