### Imports and loading config data

In [None]:
import pandas as pd
import math
from sklearn import mixture
import pymannkendall as mk
import numpy as np
import yaml
import os
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from collections import defaultdict
import statistics as stats
from scipy.spatial import distance
from scipy.cluster import hierarchy
import itertools
from sklearn import linear_model
import statsmodels.api as sm
from matplotlib.lines import Line2D
from matplotlib.collections import PolyCollection
from scipy.cluster.hierarchy import dendrogram

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the run configuration from the YAML file next to the notebook.
with open('params_icgcl.yaml', 'rb') as f:
    conf = yaml.safe_load(f)    # safe_load consumes the open stream directly

# Global plot appearance: white seaborn style, LaTeX-rendered text with the
# amsmath/amstext packages, thick axes frame and long major ticks.
sns.set_style(style='white')

plt.rc('text', usetex=True)
plt.rc('font', family="sans-serif")
plt.rc('text.latex', preamble=r"\usepackage{amsmath}\usepackage{amstext}")
plt.rcParams.update({
    "axes.linewidth": 2.50,
    'xtick.major.size': 20,
    'ytick.major.size': 20,
    'text.usetex': True,
})
fsz = 28    # base font size; figures below use offsets from this value

In [None]:
# General switches and the configuration section of the "Fleischer" dataset.
save_img = conf["settings"]["save_img"]
dataset_config = conf["datasets"]["Fleischer"]

# Dataset-specific parameters, looked up in the same order as they are named.
(input_file_metadata,
 ec_hc_mask_folder,
 input_file,
 input_folder,
 output_folder,
 rep_list) = (
    dataset_config[key]
    for key in (
        "input_file_metadata",
        "ec_hc_mask_folder",
        "input_file",
        "input_folder",
        "output_folder",
        "rep_list",
    )
)

### Data Preprocessing

In [None]:
def data_preprocessing(df,expt_list):
    """Return gene coordinates plus each experiment column scaled to sum to one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Gene_ID``, ``StartPos``, ``EndPos`` and every column
        named in ``expt_list``.
    expt_list : iterable of str
        Experiment columns to normalise. A name listed twice produces a single
        output column.

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` (the row labels of ``df``), ``Gene_ID``,
        ``StartPos``, ``EndPos`` and one normalised column per experiment,
        on a fresh 0..n-1 index. ``df`` itself is not modified.
    """
    # Copy explicitly: assigning new columns onto a bare column selection is
    # chained assignment and triggers pandas' SettingWithCopy machinery.
    normalized_df = df[['Gene_ID', 'StartPos', 'EndPos']].copy()
    for expt in expt_list:
        # Each value divided by the column total.
        normalized_df[expt] = df[expt] / df[expt].sum()
    # reset_index keeps the former row labels in a column named 'index'.
    return normalized_df.reset_index()

### Plotting Function

In [None]:
def setAlpha(ax,a):
    """Apply transparency ``a`` to every PolyCollection child artist of ``ax``."""
    poly_children = (
        child for child in ax.get_children()
        if isinstance(child, PolyCollection)
    )
    for poly in poly_children:
        poly.set_alpha(a)

In [None]:
def fancy_dendrogram(*args, **kwargs):
    """Draw a SciPy dendrogram on a new 9x6 inch figure with styled axes.

    All positional and keyword arguments are forwarded to
    ``scipy.cluster.hierarchy.dendrogram`` except ``max_d``.

    Parameters
    ----------
    max_d : float, optional (keyword only)
        Cut height. When truthy it is drawn as a horizontal black line and,
        unless ``color_threshold`` was passed explicitly, also used as the
        colour threshold.
    no_plot : bool, optional (keyword only, forwarded)
        When true nothing is drawn and no figure is created.

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram``.
    """
    max_d = kwargs.pop('max_d', None)
    no_plot = kwargs.get('no_plot', False)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d

    if not no_plot:
        # Only open a figure when something is going to be drawn on it.
        fig, ax = plt.subplots()
        fig.set_size_inches(9, 6)
        fig.set_dpi(150)

    ddata = dendrogram(*args, **kwargs)

    if not no_plot:
        # One call per axis label: final text together with its font settings.
        plt.xlabel('Age (color coded by discretization)', fontsize=fsz+6, color='k')
        plt.ylabel('Distance', fontsize=fsz+6, color='k')
        plt.tick_params("y", labelsize=fsz-6)
        plt.tick_params("x", labelsize=3.5)   # leaf labels are kept very small
        plt.tick_params(axis=u'both', which=u'both', length=0)
        if max_d:
            plt.axhline(y=max_d, c='k')
    return ddata

In [None]:
# Hierarchical clustering of the samples on their per-chromosome values.
# Alternative input: pd.read_csv(os.path.join(output_folder,'fpkm_data_SVD.csv'))
df_sort = pd.read_csv(os.path.join(output_folder, "auto_cor_ratio.csv"))
# Order by chromosome, index by it, then transpose so that each row is one sample.
df = df_sort.sort_values('chromosome_id').set_index('chromosome_id').T
# Average linkage on city-block distances between the rows (used for the autocorrelation data).
Z = hierarchy.linkage(
    df,
    method='average',
    metric='cityblock',
    optimal_ordering=True,
)
# Variant used to cluster PCA and SVD data:
# Z = hierarchy.linkage(sp.distance.squareform(df), method='average',metric='cityblock',optimal_ordering=True)

In [None]:
# Colour assigned to every dendrogram leaf, keyed by sample name.
# In this table ages 1-20 map to "c", 21-40 to "g", 41-60 to "r",
# 61-80 to "m" and 82-96 to "C0".
# NOTE(review): keys look like "<sample id>_<age>YR_<sex>" - confirm against the metadata file.
D_leaf_colors = {
"101_19YR_F": "c",
"102_19YR_M": "c",
"103_19YR_M": "c",
"104_19YR_M": "c",
"105_20YR_M": "c",
"106_20YR_F": "c",
"14_1YR_M": "c",
"15_12YR_M": "c",
"155_22YR_M": "g",
"156_25YR_F": "g",
"158_24YR_M": "g",
"159_23YR_M": "g",
"162_21YR_M": "g",
"16_24YR_F": "g",
"17_25YR_M": "g",
"18_25YR_F": "g",
"79_1YR_M": "c",
"80_2YR_F": "c",
"81_3YR_M": "c",
"82_3YR_M": "c",
"83_5YR_M": "c",
"84_6YR_M": "c",
"85_7YR_M": "c",
"86_7YR_M": "c",
"87_8YR_M": "c",
"88_8YR_M": "c",
"89_9YR_F": "c",
"90_10YR_M": "c",
"91_10YR_M": "c",
"92_11YR_F": "c",
"93_11YR_F": "c",
"94_12YR_M": "c",
"95_13YR_M": "c",
"97_16YR_F": "c",
"98_17YR_M": "c",
"99_17YR_M": "c",
"107_31YR_F": "g",
"108_31YR_F": "g",
"109_32YR_M": "g",
"110_32YR_F": "g",
"111_33YR_M": "g",
"112_33YR_M": "g",
"114_37YR_F": "g",
"115_37YR_F": "g",
"117_37YR_M": "g",
"118_39YR_M": "g",
"157_29YR_M": "g",
"160_26YR_F": "g",
"19_26YR_M": "g",
"20_26YR_F": "g",
"21_28YR_M": "g",
"22_29YR_M": "g",
"23_29YR_M": "g",
"24_30YR_M": "g",
"25_30YR_F": "g",
"26_30YR_M": "g",
"27_30YR_M": "g",
"28_43YR_F": "r",
"29_44YR_M": "r",
"30_41YR_M": "r",
"31_47YR_M": "r",
"32_50YR_F": "r",
"33_46YR_M": "r",
"34_50YR_M": "r",
"35_42YR_F": "r",
"36_47YR_M": "r",
"37_41YR_F": "r",
"38_43YR_M": "r",
"39_44YR_M": "r",
"40_45YR_M": "r",
"41_46YR_M": "r",
"42_46YR_M": "r",
"43_47YR_M": "r",
"120_51YR_M": "r",
"121_52YR_M": "r",
"122_55YR_M": "r",
"123_57YR_M": "r",
"124_60YR_M": "r",
"125_71YR_F": "m",
"126_75YR_M": "m",
"163_67YR_F": "m",
"164_67YR_F": "m",
"165_68YR_M": "m",
"167_66YR_M": "m",
"168_68YR_M": "m",
"169_70YR_M": "m",
"170_69YR_M": "m",
"44_61YR_M": "m",
"45_62YR_F": "m",
"46_62YR_F": "m",
"47_63YR_M": "m",
"48_64YR_M": "m",
"49_66YR_M": "m",
"50_67YR_M": "m",
"51_67YR_M": "m",
"52_68YR_M": "m",
"53_68YR_M": "m",
"54_69YR_M": "m",
"55_69YR_F": "m",
"127_78YR_M": "m",
"128_80YR_F": "m",
"129_94YR_M": "C0",
"130_89YR_M": "C0",
"131_87YR_M": "C0",
"132_90YR_M": "C0",
"133_89YR_M": "C0",
"134_86YR_M": "C0",
"135_92YR_M": "C0",
"136_87YR_M": "C0",
"166_82YR_F": "C0",
"56_83YR_M": "C0",
"57_83YR_M": "C0",
"58_83YR_M": "C0",
"59_84YR_F": "C0",
"60_84YR_M": "C0",
"61_84YR_F": "C0",
"62_84YR_M": "C0",
"63_84YR_M": "C0",
"64_84YR_M": "C0",
"65_84YR_M": "C0",
"66_86YR_M": "C0",
"67_86YR_M": "C0",
"68_86YR_M": "C0",
"69_87YR_F": "C0",
"70_87YR_M": "C0",
"71_87YR_M": "C0",
"72_87YR_M": "C0",
"73_88YR_M": "C0",
"74_89YR_M": "C0",
"75_90YR_M": "C0",
"76_91YR_M": "C0",
"77_92YR_F": "C0",
"78_96YR_M": "C0",}

In [None]:
# Colour every dendrogram link from the colours of the two nodes it joins.
# * each row of Z is one "inverted U" link that connects two clusters
# * rows are ordered by increasing distance
# * a link takes the colour shared by both of its children, otherwise grey
labels = df.index
dflt_col = "#808080"
n_links = len(Z)
link_cols = {}
for row, children in enumerate(Z[:, :2].astype(int)):
    # Ids above n_links are links recorded earlier in link_cols; the rest are leaves.
    child_cols = [
        link_cols[node] if node > n_links else D_leaf_colors[labels[node]]
        for node in children
    ]
    link_cols[row + 1 + n_links] = (
        child_cols[0] if child_cols[0] == child_cols[1] else dflt_col
    )
# Drop the '_M' / '_F' part from the names used as tick labels.
labels = [sub.replace('_M', '').replace('_F', '') for sub in labels]

In [None]:
# Draw the dendrogram; every link colour comes from the precomputed link_cols table.
dn = fancy_dendrogram(Z, labels=labels, leaf_rotation=90,color_threshold=.00, link_color_func=lambda x: link_cols[x])

ax = plt.gca()
ax.grid(False)
# Inclusive age bounds (stored as text) -> colour of that age group.
df_list_treat = {('0','20'): 'c', ('21','40'): 'g', ('41','60'): 'r', ('61','80'): 'm', ('81','100'): 'C0'}
labels = ['Age group: ' + low + '-' + high for (low, high) in df_list_treat]

# The last group is shown as an open-ended ">= 81" entry in the legend.
legend_text = labels[:-1] + ['Age group: $>=$81']
legend_elements = [
    Line2D([0], [0], markerfacecolor='None', markeredgecolor='None',
           color=colour, label=text, markersize=15)
    for colour, text in zip(df_list_treat.values(), legend_text)
]
# A single legend call: an earlier ax.legend(...) would simply be replaced by this one.
leg = ax.legend(handles=legend_elements,markerscale=2, loc='upper right', borderaxespad=0.05, fontsize = fsz-12)
# Newer matplotlib exposes the handles as `legend_handles`; older releases
# only have `legendHandles`. Support both.
legend_lines = leg.legend_handles if hasattr(leg, 'legend_handles') else leg.legendHandles
for handle in legend_lines:
    handle.set_linewidth(5)

# Colour each leaf label by the age group that contains the sample's age.
for tick_label in ax.get_xmajorticklabels():
    # The second '_'-separated field of the label carries the age; keep its digits only.
    age_field = tick_label.get_text().split('_')[1]
    age = int(''.join(ch for ch in age_field if ch.isdigit()))
    for (low, high), colour in df_list_treat.items():
        if int(low) <= age <= int(high):
            tick_label.set_color(colour)
            break
# For every axis, set the x and y major locator
# ax.xaxis.set_major_locator(plt.MaxNLocator(1))
ax.yaxis.tick_right()
ax.yaxis.set_label_position("right")
plt.tight_layout()
# plt.savefig(os.path.join(output_folder,'dendrogram_PCA_sqtranform_withX_1.pdf'),format='pdf', bbox_inches='tight')

### Postprocessing and Analysis

In [None]:
# Post-processing starts here: reload the l* table saved in output_folder and
# begin a frame that so far only holds the chromosome ids.
df = pd.read_csv(os.path.join(output_folder, "auto_cor_ratio.csv"))
df_chromosome_dist = df['chromosome_id'].to_frame()

In [None]:
# Average the samples of every replicate group into one column per age group.
# A group is recognised by a marker age string that one of its sample names contains.
age_group_markers = (
    ('19YR', '0-20'),
    ('31YR', '21-40'),
    ('51YR', '41-60'),
    ('71YR', '61-80'),
    ('91YR', '>=81'),
)
for rep_entry in rep_list:
    col_name = ''   # stays empty when no marker is found in the group
    for marker, group_name in age_group_markers:
        # No break on purpose: if several markers match, the last one wins.
        if any(marker in rep_name for rep_name in rep_entry):
            col_name = group_name
    df_chromosome_dist[col_name] = df[rep_entry].mean(axis=1)
expt_order = ['0-20', '21-40', '41-60', '61-80', '>=81']

In [None]:
## Fig 4a
# Long format of all group columns; chromosome_id is the identifier, so it is
# not repeated among the value columns.
df_chromosome_dist_melted_df = pd.melt(df_chromosome_dist, id_vars='chromosome_id')
df_chromosome_dist_melted_df = df_chromosome_dist_melted_df.rename(columns={'value':'lstar'})
df_chromosome_dist = df_chromosome_dist.set_index('chromosome_id')
# Float copy of the group means, melted in the fixed age-group order.
df_chromosome_dist_viol = df_chromosome_dist.astype(float).reset_index()
df_chromosome_dist_viol_melted_df = pd.melt(df_chromosome_dist_viol,id_vars='chromosome_id',value_vars=expt_order)
df_chromosome_dist_viol_melted_df = df_chromosome_dist_viol_melted_df.rename(columns={'value':'lstar'})
fig, ax = plt.subplots()
fig.set_size_inches(12, 10)
fig.set_dpi(300)
# Box plot of l* per age group; the median line is hidden and the mean is drawn as a black line.
ax = sns.boxplot(data=df_chromosome_dist_viol_melted_df, x="variable", y="lstar",palette=['c','g','r','m','C0'], dodge = True, linewidth=2, medianprops=dict(linestyle='None'),meanline=True, showmeans=True, meanprops=dict(linestyle='-',linewidth = 2, color='black'), boxprops=dict(alpha=.6))
ax.set_xticks(range(len(df_chromosome_dist.columns))) # <--- set the ticks first
ax.set_xticklabels(['0-20','21-40','41-60','61-80','$>=$81'])
setAlpha(ax,0.6)
ax.grid(False)
# Raw string: '\e' is not a valid escape sequence in a normal string literal.
ax.set_ylabel(r'$\ell^{*}$', fontsize=fsz+10, color='k')
ax.set(xlabel=None)
ax.tick_params('both',which='major', length=7,labelsize=fsz)
ax.tick_params('both',which='minor', length=7,labelsize=fsz)
plt.xticks(rotation=45)
plt.show()
# plt.tight_layout()
# plt.savefig(os.path.join(output_folder,("box_plot_ell_age_Fleischer_dbscan"+".pdf")), bbox_inches='tight')

In [None]:
# Mann-Kendall trend test on the mean l* of each age group, taken in the
# order in which the groups first appear in the melted table.
# (Mean first; the median could be tested the same way.)
y_data = [
    df_chromosome_dist_viol_melted_df.loc[
        df_chromosome_dist_viol_melted_df['variable'] == value, 'lstar'
    ].mean()
    for value in df_chromosome_dist_viol_melted_df['variable'].unique()
]
result = mk.original_test(y_data)
print(result)

In [None]:
## Fig 4b
# Per-sample l* values in long format: one row per (chromosome, sample).
df = df.set_index('chromosome_id')
df_chromosome_dist_viol = df.astype(float).reset_index()
df_chromosome_dist_viol_melted_df = (
    pd.melt(df_chromosome_dist_viol, id_vars='chromosome_id')
    .rename(columns={'value': 'lstar'})
)
# The age is the run of digits directly in front of "YR" in the sample name (kept as text).
df_chromosome_dist_viol_melted_df['Age'] = df_chromosome_dist_viol_melted_df['variable'].str.extract(r'(\d+)(?=YR)')

In [None]:
# Linear regression of l* on donor age, fitted twice: with scikit-learn (the
# model is reused for the regression line in later cells) and with statsmodels
# (for the printed summary table).
features = ['Age']
target = 'lstar'
df_chromosome_dist_viol_melted_df['chromosome_id'] = df_chromosome_dist_viol_melted_df['chromosome_id'].replace('X', 23)
# 'Age' was extracted with a regular expression and is therefore text.
# Convert explicitly instead of relying on the estimator to coerce strings.
X = df_chromosome_dist_viol_melted_df[features].astype(float).values.reshape(-1, len(features))
y = df_chromosome_dist_viol_melted_df[target].values
ols = linear_model.LinearRegression()
model = ols.fit(X, y)

# statsmodels needs the intercept column added by hand.
X = [int(item[0]) for item in X]
X2 = sm.add_constant(X)
model1 = sm.OLS(y, X2).fit()
print(model1.summary())

In [None]:
# Mean and standard deviation of l* for every distinct age (ages are text keys here).
y_data = {}
y_data_std = {}
age_column = df_chromosome_dist_viol_melted_df['Age']
for value in age_column.unique():
    lstar_at_age = df_chromosome_dist_viol_melted_df.loc[age_column == value, 'lstar']
    y_data[value] = lstar_at_age.mean()
    y_data_std[value] = lstar_at_age.std()

# Order the ages numerically rather than as text.
age_order = sorted(y_data, key=int)
sorted_data = {k: y_data[k] for k in age_order}
sorted_std = {k: y_data_std[k] for k in age_order}
sorted_keys = list(sorted_data)
values = list(sorted_data.values())
values_std = list(sorted_std.values())
result = values
# Mann-Kendall trend test on the age-ordered means.
print(mk.original_test(result))

In [None]:
# Two 20-point input grids for the fitted model, shaped (n_points, n_features):
# x_pred spans 0 .. number of distinct ages + 1, x_pred_1 spans 0 .. 98.
# NOTE(review): the model was fitted on ages, so the x_pred values are also
# interpreted as ages by predict() - confirm this is intended for the first plot.
n_features = len(features)
x_pred = np.linspace(0, len(sorted_keys)+1, 20).reshape(-1, n_features)
x_pred_1 = np.linspace(0, 98, 20).reshape(-1, n_features)
y_pred, y_pred_1 = (model.predict(grid) for grid in (x_pred, x_pred_1))

In [None]:
# Look up the colour of the group whose bounds contain a label
def get_color(label, group_colors):
    """Return the colour of the first group whose inclusive bounds contain ``label``.

    ``group_colors`` maps ``(low, high)`` pairs, convertible with ``int``, to
    colours. ``'black'`` is returned when no group matches.
    """
    matching = (
        color
        for group, color in group_colors.items()
        if int(group[0]) <= label <= int(group[1])
    )
    return next(matching, 'black')

In [None]:
# Plot the sorted keys against their corresponding values
fig, axs = plt.subplots()
fig.set_size_inches(14, 7)
fig.set_dpi(300)

# Means per age with their standard deviation as error bars, plus the regression line.
axs.plot(sorted_keys,result,'ok',linewidth = 3)
axs.errorbar(sorted_keys, values, yerr=values_std, fmt='o', markersize=8,markeredgecolor='k', capsize=10, color="k", elinewidth=2, markeredgewidth=2)
axs.plot(x_pred, y_pred, color='k',linewidth = 2, label='Regression model')
plt.xlabel('Age of the donor', fontsize=fsz, color='k')
plt.ylabel(r'$\ell^{*}$', size=fsz+4)
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
axs.tick_params('both',which='major', length=7,labelsize=fsz)
axs.tick_params('both',which='minor', length=7,labelsize=fsz)
axs.tick_params(axis='x', rotation=90, labelsize=fsz-10)

# Inclusive age bounds (as text) and the colour used for each group
group_colors = {
    ('0', '20'): 'c',
    ('21', '40'): 'g',
    ('41', '60'): 'r',
    ('61', '80'): 'm',
    ('81', '100'): "C0"
}
# From here on the keys are integers (get_color compares them numerically).
sorted_keys = list(map(int, sorted_keys))
# Set the color for each tick label based on the group; fetch the label list once.
tick_labels = axs.get_xticklabels()
for key, i in enumerate(sorted_keys):
    tick_labels[key].set_color(get_color(i, group_colors))
# Adjust the layout before showing: after plt.show() the adjustment no
# longer applies to the figure that was displayed.
plt.tight_layout()
plt.show()
# plt.savefig(os.path.join(output_folder,'l_star_vs_age_Fleischer_ec_gamma3.pdf'), format='pdf', bbox_inches='tight')

In [None]:
# # Plot
# fig, axs = plt.subplots(figsize=(14, 7), dpi=300)
# 
# # Plot only points that have values (ignore zeros)
# axs.plot(sorted_keys,result, 'ok', linewidth=3)
# axs.errorbar(sorted_keys, values, yerr=values_std, fmt='o', 
#              markersize=8, markeredgecolor='k', capsize=10, color="k", 
#              elinewidth=2, markeredgewidth=2)
# 
# # X-axis labels (show only every 10th tick)
# axs.set_xticks(np.arange(0, 101, 10))
# axs.set_xticklabels(np.arange(0, 101, 10), fontsize=12)
# 
# axs.plot(x_pred, y_pred, color='k',linewidth = 2, label='Regression model')
# plt.xlabel('Age of the donor', fontsize=fsz, color='k')
# plt.ylabel(r'$\ell^{*}$', size=fsz+4)
# plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
# axs.tick_params('both',which='major', length=7,labelsize=fsz)
# axs.tick_params('both',which='minor', length=7,labelsize=fsz)
# axs.tick_params(axis='x', rotation=45, labelsize=fsz-6)
# 
# # Example groupings and corresponding colors
# plt.show()
# plt.tight_layout()

In [None]:
# Age groups as inclusive (low, high) bounds stored as text, paired with the
# colour used for each group.
_age_bounds = [('0', '20'), ('21', '40'), ('41', '60'), ('61', '80'), ('81', '100')]
_group_palette = ['c', 'g', 'r', 'm', "C0"]
group_colors = dict(zip(_age_bounds, _group_palette))

# Colour lookup for an age
def get_color(age, group_colors):
    """Return the colour of the first ``(low, high)`` group that contains ``age``.

    Bounds are converted with ``int`` and are inclusive. ``'k'`` is returned
    when no group matches.
    """
    for bounds, color in group_colors.items():
        low, high = bounds
        if not (int(low) <= age <= int(high)):
            continue
        return color
    return 'k'  # Default color if none match

# Plot mean l* against donor age on a numeric axis, coloured by age group
fig, axs = plt.subplots()
fig.set_size_inches(14, 7)
fig.set_dpi(300)

# sorted_keys holds the ages as text unless an earlier cell already converted
# them. Convert here so that this cell works either way: the x axis must be
# numeric and get_color compares ages with integer bounds.
donor_ages = [int(key) for key in sorted_keys]

# Plot the points
axs.plot(donor_ages, values, 'ok', linewidth=3)

# Add error bars with colors based on group
for age, mean_lstar, std_lstar in zip(donor_ages, values, values_std):
    axs.errorbar(age, mean_lstar, yerr=std_lstar, fmt='o', markersize=8,
                 markeredgecolor='k', capsize=10,
                 color=get_color(age, group_colors),
                 elinewidth=2, markeredgewidth=2)

# X-axis labels (one tick every 10 years)
axs.set_xticks(np.arange(0, 101, 10))
axs.set_xticklabels(np.arange(0, 101, 10), fontsize=12)

# Regression line evaluated on the 0..98 grid
axs.plot(x_pred_1, y_pred_1, color='k',linewidth = 2, label='Regression model')
plt.xlabel('Age of the donor', fontsize=fsz, color='k')
plt.ylabel(r'$\ell^{*}$', size=fsz+4)
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
axs.tick_params('both',which='major', length=7,labelsize=fsz)
axs.tick_params('both',which='minor', length=7,labelsize=fsz)
axs.tick_params(axis='x', rotation=45, labelsize=fsz-6)

# plt.show()
plt.tight_layout()
plt.savefig(os.path.join(output_folder,'l_star_vs_age_Fleischer_ec_gamma3.pdf'), format='pdf', bbox_inches='tight')

In [None]:
# Read the HAT/LAT mask files (genes labeled 1 are HATs, genes labeled 0 are
# LATs), one per chromosome sub-folder, and add them up gene by gene.
mask_frames = []
for chromosome_dir in os.listdir(ec_hc_mask_folder):
    f = os.path.join(ec_hc_mask_folder, chromosome_dir)
    if not os.path.isdir(f):
        continue
    for filename in os.listdir(f):
        if not filename.endswith("_ec_hc_mask_xy.csv"):  # only the mask files
            continue
        chromosome_mask = pd.read_csv(os.path.join(f, filename))
        mask_frames.append(
            chromosome_mask.drop(['StartPos', 'EndPos'], axis=1).set_index('Gene_ID')
        )
counter = len(mask_frames)
# The first file seeds the total; the others are added with missing cells counted as 0.
df_ec_hc = mask_frames[0] if mask_frames else pd.DataFrame()
for other_mask in mask_frames[1:]:
    df_ec_hc = df_ec_hc.add(other_mask, fill_value=0)
# Keep only the genes whose row total is positive.
df_ec_hc['sum'] = df_ec_hc.sum(axis=1)
df_ec_hc = df_ec_hc[df_ec_hc['sum'] > 0]
index_ec_genes = list(df_ec_hc.index)

In [None]:
## Fig 3c
# City-block distance between every sample of the first replicate group (the
# reference) and every sample of each group, summarised per group.
expt_list = [item for sublist in rep_list for item in sublist]
df.sort_values(by=['chromosome_id'],inplace=True)
df_vec = df.reset_index(drop=True)
df_list = df_vec.loc[:, df.columns != 'chromosome_id'].to_numpy()
dist_df = pd.DataFrame(columns= ['reference'] + expt_list)
ref_list = rep_list[0]
dist_df['reference'] = ref_list
for rep_group in rep_list:
    for rep1, rep2 in itertools.product(ref_list, rep_group):
        dist = distance.cityblock(df_vec[rep1],df_vec[rep2])
        dist_df.loc[dist_df['reference'] == rep1,rep2] = dist
label_list = ['Grp_1_20','Grp_21_40','Grp_41_60','Grp_61_80','Grp_81_100']
# Setting up the mean and standard deviation plot
dist_list = []
dist_list_std = []
for rep in rep_list:
    # dist_df starts as an empty frame and is filled cell by cell, so make
    # the values numeric before computing statistics on them.
    group_dist = dist_df[rep].astype(float)
    dist_list.append(np.average(group_dist.mean().to_numpy()))
    dist_list_std.append(np.std(group_dist.to_numpy()))

# DataFrame.append no longer exists in pandas 2.x; build the one-row frames directly.
dist_list_df = pd.DataFrame([dist_list], columns= label_list)
dist_list_df_std = pd.DataFrame([dist_list_std], columns= label_list)
# Setting up the plotting tool
df_sim_auto_corr = pd.concat([dist_list_df,dist_list_df_std])
df_sim_auto_corr = df_sim_auto_corr.T.reset_index()
df_sim_auto_corr.columns = ['Expt','mean','std']
x_col = ['Grp_1_20','Grp_21_40','Grp_41_60','Grp_61_80','Grp_81_100']
df_sim_auto_corr['Expt'] = pd.Categorical(df_sim_auto_corr.Expt, categories=x_col, ordered=True)
df_sim_auto_corr =df_sim_auto_corr.sort_values('Expt')
## Let's save these values as well that is required to do the plotting
# df_sim_auto_corr.to_csv(os.path.join(output_folder,"pair_wise_similarity_autocorr.csv"),index=False)

In [None]:
# Second measure: city-block distance between normalised expression profiles,
# averaged over chromosomes, for every (reference sample, group sample) pair.
df = pd.read_csv(input_file)
dist_list = []
dist_list_std = []

# Normalise and filter every chromosome ONCE for all samples that take part
# in a comparison. The result does not depend on which pair is compared, so
# redoing it per pair only repeats the same work.
needed_expts = list(dict.fromkeys(itertools.chain(ref_list, *rep_list)))
per_chromosome = []
for chromosome, data in df.groupby('Chromosome'):
    chromosome = str(chromosome)   # group keys are not strings when the column is numeric
    if chromosome.isalnum() and chromosome != 'MT':
        normalized_df = data_preprocessing(data, needed_expts)
        normalized_df = normalized_df[normalized_df['Gene_ID'].isin(index_ec_genes)].reset_index(drop=True) # Filter out EC genes
        per_chromosome.append(normalized_df)
counter = len(per_chromosome)   # number of chromosomes entering each average

for rep_group in rep_list:
    result = []
    for rep1, rep2 in itertools.product(ref_list, rep_group):
        # Sum of the per-chromosome distances between the two distributions
        dist = sum(
            distance.cityblock(chrom_df[rep1], chrom_df[rep2])
            for chrom_df in per_chromosome
        )
        avg_dist = dist/counter
        result.append(avg_dist)
    dist_list.append(stats.mean(result))
    dist_list_std.append(stats.stdev(result))
    

In [None]:
# Table of the expression-based distances: one row per group with mean and std.
df_sim_exp = pd.DataFrame({
    'Expt': label_list,
    'mean': dist_list,
    'std': dist_list_std,
})
x_col = ['Grp_1_20','Grp_21_40','Grp_41_60','Grp_61_80','Grp_81_100']
# Ordered categorical so that sorting follows the age groups.
df_sim_exp['Expt'] = pd.Categorical(df_sim_exp['Expt'], categories=x_col, ordered=True)
df_df_sim_exp = df_sim_exp.sort_values('Expt')
## Let's save these values as well that is required to do the plotting
# df_df_sim_exp.to_csv(os.path.join(output_folder,"pair_wise_similarity_expression.csv"),index=False)

In [None]:
# Twin-axis figure: expression-based distance (blue, left axis) and
# autocorrelation-based distance (red, right axis) per age group.
y_exp = df_sim_exp['mean'].to_list()
yerr_exp = df_sim_exp['std'].to_list()
y_autocorr = df_sim_auto_corr['mean'].to_list()
yerr_autocorr = df_sim_auto_corr['std'].to_list()
x_label = df_sim_auto_corr['Expt'].to_list()

# One x position per age group
x1 = [1,2,3,4,5]
# Dictionary mapping original labels to new labels
label_mapping = {
    'Grp_1_20': '0-20',
    'Grp_21_40': '21-40',
    'Grp_41_60': '41-60',
    'Grp_61_80': '61-80',
    'Grp_81_100': '81-100'
}
# Updating the list using the mapping
x_label = [label_mapping[group] for group in x_label]
# Setting up the plotting tool
fsz = 28
fig, ax = plt.subplots()
fig.set_size_inches(12, 8)
fig.set_dpi(300)
# These pyplot calls come before twinx() is created below.
plt.plot(x1,y_exp,'-b',linewidth = 3)
plt.errorbar(x1, y_exp, yerr=yerr_exp, fmt='o', markersize=8, capsize=10,markeredgecolor='b', color="b", elinewidth=2, markeredgewidth=2)
plt.xticks(rotation=45)
# Second y axis sharing the same x axis, used for the autocorrelation distance
ax1 = ax.twinx()
ax1.plot(x1,y_autocorr,'-r',linewidth = 3)
ax1.errorbar(x1, y_autocorr, yerr=yerr_autocorr, fmt='o', markersize=8,markeredgecolor='r', capsize=10, color="r", elinewidth=2, markeredgewidth=2)

ax.set_ylabel(r'$\mathcal{D}(\alpha_t)$', fontsize=fsz+10, color='k')
ax1.set_ylabel(r'$\mathcal{D}(\ell^{*})$', fontsize=fsz+10, color='k')
# Fixed axis limits
ax.set_xlim(left=0.9,right=5.1)
ax1.set_ylim(bottom=0,top=70)
ax.set_ylim(bottom=0,top=0.6)

# At most 5 bins for the y tick locators of both axes
ax1.locator_params(axis = 'y', nbins=5)
ax.locator_params(axis = 'y', nbins=5)

# Replace the numeric x positions with the age-group names
ax.set_xticks(x1)
ax.set_xticklabels(x_label)

# Setting the color for the axis
ax.yaxis.label.set_color('b')
ax1.spines["left"].set_edgecolor('b')
ax.tick_params(axis='y', colors='b')
ax1.yaxis.label.set_color('r')
ax1.spines["right"].set_edgecolor('r')
ax1.tick_params(axis='y', colors='r')
ax.tick_params('both',which='major', length=7,labelsize=fsz)
ax.tick_params('both',which='minor', length=7,labelsize=fsz)
ax1.tick_params('both',which='major', length=7,labelsize=fsz)
ax1.tick_params('both',which='minor', length=7,labelsize=fsz)
plt.tight_layout()
# plt.savefig(os.path.join(output_folder,("pair_wise_similarity_Fleischer"+".pdf")), bbox_inches='tight')

In [None]:
## Fig S11
# Bar plots of l* per chromosome, split by age group, in two stacked panels.
df =  pd.read_csv(os.path.join(output_folder,"auto_cor_ratio.csv"))
df_md = pd.read_csv(input_file_metadata)
## Sort chromosomes as 1..22, X, Y instead of alphabetically
sorter = [str(x) for x in range(1,23)] + ['X', 'Y']
sorterIndex = dict(zip(sorter, range(len(sorter))))
df.sort_values(by=['chromosome_id'], key=lambda x: x.map(sorterIndex), inplace = True)
## Let's combine the information of sample using metadata and plot it
chromosomes=df['chromosome_id']
df1 = df.iloc[:,1:]   # every column after the first one is treated as a sample
# Build one long frame per sample and concatenate them in a single call
# (growing a frame inside the loop copies all previous rows each time).
sample_frames = [
    pd.DataFrame(data={'lstar' : df1[samplename],'sampleNames':samplename,'chromosome':chromosomes})
    for samplename in df1.columns.tolist()
]
df_final = pd.concat(sample_frames, ignore_index=True) if sample_frames else pd.DataFrame()
# Join with the metadata on the columns both frames have in common.
df_final = df_md.merge(df_final)
# First 12 chromosomes in the top panel, the remaining ones in the bottom panel
df_top = df_final[df_final['chromosome'].isin(sorter[0:12])]
df_down = df_final[df_final['chromosome'].isin(sorter[12:])]
fig, axes = plt.subplots(2, 1, figsize=(22, 8))
custom_palette = sns.color_palette(["c", "g", "r", "m", "C0"])
# NOTE(review): `ci`, `errcolor` and `errwidth` are reported as deprecated by
# recent seaborn releases - confirm against the installed version.
for panel_ax, panel_df in zip(axes, (df_top, df_down)):
    sns.barplot(ax = panel_ax,data = panel_df,x='chromosome',y='lstar',hue='age_group',
                palette=custom_palette,
                capsize = 0.05,
                saturation = 8,
                alpha = 0.8,
                errcolor = 'gray', errwidth = 2,
                ci = 'sd'
                )
    panel_ax.set_ylim(0, 22)
    # Raw string: '\e' is not a valid escape sequence in a normal string literal.
    panel_ax.set_ylabel(r'$\ell^{*}$', fontsize=fsz+10, color='k')
    panel_ax.set_xlabel('chromosome', fontsize=fsz+10, color='k')
    panel_ax.tick_params('both',which='major', length=7,labelsize=fsz)
    panel_ax.tick_params('both',which='minor', length=7,labelsize=fsz)
    panel_ax.tick_params(axis=u'both', which=u'both',length=0)
    # Legend outside the axes, to the right
    panel_ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize = fsz-10)
# plt.savefig(os.path.join(output_folder,'Fig1_chromosome_lstar_Fleischer.pdf'), format='pdf', bbox_inches='tight')
plt.show()

In [None]:
## Fig S2 c and d
def fit_distribution_2d_comp(normalized_df, expt_name):
    """Fit Gaussian mixtures to (position index, expression) and plot them side by side.

    For every experiment the genes flagged with 1 in the module-level
    ``mask_6`` table are selected. A Bayesian Gaussian mixture (at most 10
    components) decides how many clusters are used, and a plain Gaussian
    mixture with that many components is then fitted. The component
    densities and their sum are plotted along the position index.

    Parameters
    ----------
    normalized_df : pandas.DataFrame
        Must contain ``index``, ``Gene_ID`` and one column per experiment.
        It is not modified.
    expt_name : sequence of str
        Experiment columns to plot, one panel each. The panel title is the
        second ``_``-separated field of the name (e.g. ``"51YR"``), or the
        full name when there is no such field.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # squeeze=False keeps a 2-D array of axes, so one experiment also works.
    fig, axs = plt.subplots(1, len(expt_name), figsize=(14, 6),
                            sharex=True, sharey=True, squeeze=False)

    # The x grid is the same for every panel.
    x_min = normalized_df['index'].min()
    x_max = normalized_df['index'].max()
    x_range = np.linspace(x_min, x_max, len(normalized_df))

    for ax, expt in zip(axs[0], expt_name):
        print(expt)
        m12_gid = mask_6.loc[mask_6[expt] == 1]['Gene_ID'].tolist()
        # Work on a copy: a column is added below.
        temp = normalized_df.loc[normalized_df.Gene_ID.isin(m12_gid)].copy()
        dpgmm = mixture.BayesianGaussianMixture(n_components=10, covariance_type='full', random_state=10)
        temp['cluster_' + expt] = dpgmm.fit_predict(temp[['index', expt]])

        # Refit with exactly as many components as clusters were actually used.
        n_clusters = len(set(temp['cluster_' + expt]))
        gmm = mixture.GaussianMixture(n_components=n_clusters, covariance_type='full').fit(temp[['index', expt]])

        # Only the densities along x at the minimum expression value are
        # plotted. That is the first row of the full (x, y) mesh grid, so
        # evaluate just that row instead of len(df) x len(df) points.
        y_floor = normalized_df[expt].min()
        grid_row = np.column_stack([x_range, np.full(len(x_range), y_floor)])
        pdf = np.exp(gmm._estimate_log_prob(grid_row))
        overall_pdf = np.sum(pdf, axis=1)

        # Dashed line: sum over components; solid lines: individual components
        ax.plot(x_range, overall_pdf, linestyle='--', linewidth=2)
        ax.plot(x_range, pdf)
        name_fields = expt.split('_')
        ax.set_title(name_fields[1] if len(name_fields) > 1 else expt, fontsize=fsz - 8)
        ax.tick_params('both',which='major', length=7,labelsize=fsz-8)
        ax.tick_params('both',which='minor', length=7,labelsize=fsz-8)

    # Shared axis captions for all panels
    fig.text(0.5, 0.04, 'Genomic coordinates sorted by their position', ha='center', fontsize=fsz-10, color='k')
    fig.text(0.04, 0.5, 'Probability distribution of gene expression', va='center', rotation='vertical', fontsize=fsz-10, color='k')

    plt.tight_layout(rect=[0.05, 0.05, 1, 0.95])
    # plt.savefig(os.path.join(output_folder,("P_dist_genomic_coord_chrom6_gamma3"+".pdf")), bbox_inches='tight')
    plt.show()

In [None]:
def create_ranges(diff, upper=100):
    """Split ``1..upper`` into consecutive inclusive ranges of width ``diff``.

    Parameters
    ----------
    diff : int
        Width of every range; must be at least 1.
    upper : int, optional
        Largest value covered (default 100). A final, shorter range is added
        when ``upper`` is not a multiple of ``diff``.

    Returns
    -------
    list of tuple
        ``(start, end)`` pairs, e.g. ``[(1, 20), (21, 40), ...]``.

    Raises
    ------
    ValueError
        If ``diff`` is smaller than 1 (the ranges could not advance).
    """
    if diff < 1:
        raise ValueError("diff must be at least 1, got {!r}".format(diff))
    ranges = []
    start = 1
    while start + diff - 1 <= upper:
        ranges.append((start, start + diff - 1))
        start += diff
    # Remainder that does not fill a whole range
    if start <= upper:
        ranges.append((start, upper))
    return ranges

In [None]:
def create_rep_list(expt_list,dx):
    """Group sample names into age bins of width ``dx``.

    The age is read from the second ``_``-separated field of each name with
    ``YR`` removed. Bins come from ``create_ranges(dx)``; the result holds one
    list per bin, in bin order. A name whose age lies in no bin is left out.
    """
    bounds = create_ranges(dx)
    buckets = [[] for _ in range(math.ceil(100/dx))]  # one empty bucket per age range
    for sample in expt_list:
        age = int(sample.split('_')[1].replace('YR', ''))
        for position, (low, high) in enumerate(bounds):
            if low <= age <= high:
                buckets[position].append(sample)
    return buckets

In [None]:
# Prepare the normalised expression table of chromosome 6 for the mixture fits.
df = pd.read_csv(input_file)
mask_6 = pd.read_csv(os.path.join(ec_hc_mask_folder, "chromosome_6", 'chromosome_6_ec_hc_mask_xy.csv'))
dx = 20
# Regroup the samples into age bins of width dx and flatten them again, once.
rep_list = create_rep_list(expt_list, dx)
expt_list = [item for sublist in rep_list for item in sublist]

for chromosome, data in df.groupby('Chromosome'):
    if str(chromosome).isdigit() and int(chromosome) == 6:
        # Sort into a new frame; the group is a slice of df and is not edited in place.
        data = data.sort_values(by=['StartPos', 'EndPos'])
        chromosome_output_folder = os.path.join(output_folder, "chromosome_" + str(chromosome))
        Path(chromosome_output_folder).mkdir(parents=True, exist_ok=True)
        normalized_df = data_preprocessing(data, expt_list)
        # Replace the original row labels by the rank along the chromosome:
        # after the second reset_index, 'index' runs 0..n-1 in position order.
        normalized_df = normalized_df.drop(['index'], axis=1)
        normalized_df = normalized_df.reset_index()
        break
else:
    # Without this, later cells would use a stale or undefined normalized_df.
    raise ValueError("chromosome 6 was not found in the 'Chromosome' column of input_file")

In [None]:
# Four-sample variant, kept for reference:
# expt_name_comp = ["99_17YR_M","120_51YR_M","126_75YR_M","129_94YR_M"]
# Compare the fitted distributions of two samples; one panel is drawn per entry.
expt_name_comp = ["120_51YR_M","129_94YR_M"]
fit_distribution_2d_comp(normalized_df,expt_name_comp)