In [None]:
import os
import pandas as pd
import numpy as np
import yaml
import copy
from sklearn.cluster import SpectralClustering, DBSCAN
from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.sparse.linalg import svds
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import warnings 
warnings.filterwarnings('ignore')

### Load configuration file

In [None]:
# Parse the YAML preprocessing configuration straight from the open stream.
with open('params_preprocess.yaml', 'rb') as f:
    conf = yaml.safe_load(f)

# Base size used by the plotting helper for fonts and marker sizes.
fsz = 28

# Notebook-wide settings.
save_img = conf['settings']['save_img']
active_dataset = conf['settings']['active_dataset']

# Parameters of the dataset selected by `active_dataset`.
dataset_config = conf['datasets'][active_dataset]
input_file = dataset_config['input_file']
output_folder = dataset_config['output_folder']
gamma_value = dataset_config['gamma_value']
rep_list = dataset_config['rep_list']

### Data preprocessing function

In [None]:
def data_preprocessing(df, expt_list):
    """Normalize each experiment column by its own column total.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Gene_ID', 'StartPos', 'EndPos' and every column named
        in ``expt_list``. It is not modified.
    expt_list : iterable of str
        Names of the experiment columns to normalize.

    Returns
    -------
    pandas.DataFrame
        The three annotation columns plus one normalized column per
        experiment (``value / column sum``). The previous index is moved into
        a regular column by ``reset_index()`` (named 'index' when the input
        index is unnamed).
    """
    # Explicit copy: assigning new columns onto a column selection of `df`
    # would otherwise trigger pandas' chained-assignment warning.
    normalized_df = df[['Gene_ID', 'StartPos', 'EndPos']].copy()
    for expt in expt_list:
        normalized_df[expt] = df[expt] / df[expt].sum()
    return normalized_df.reset_index()

In [None]:
def save_cluster(df, save_plot_path, value_columns=None):
    """Draw a log-log scatter of value columns against row rank and save it.

    Each selected column is plotted against a 1-based row counter, coloured
    by the 'labels' column, and the figure is written to ``save_plot_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'labels' column. The frame is modified in place:
        'index' (1..len(df)) and 'log_index' (log10 of 'index') columns are
        added or overwritten.
    save_plot_path : str or path-like
        Destination passed to ``Figure.savefig``.
    value_columns : sequence of str, optional
        Columns to plot on the y axis. Defaults to the first three columns
        of ``df`` (the original hard-coded behaviour).

    Notes
    -----
    Reads the module-level ``fsz`` for font and marker sizes.
    NOTE(review): the legend text is fixed (red = HAT, green = LAT) while the
    point colours are assigned by seaborn from the values in 'labels' --
    confirm that the label values always match this legend.
    """
    fig, ax = plt.subplots()
    try:
        fig.set_size_inches(12, 10)
        fig.set_dpi(150)
        df['index'] = range(1, len(df)+1)
        # Not used by the plot itself; kept so the mutated frame has the same
        # columns as before.
        df['log_index'] = np.log10(df['index'])
        if value_columns is None:
            value_columns = df.columns[0:3]
        palette = [ "#008000","#FF0000"]
        for i in value_columns:
            sns.scatterplot(ax=ax, x="index", y=i, data=df, hue="labels",s=fsz+50,palette=sns.color_palette(palette, 3),linewidth=0, alpha = 0.7)

        ax.set_ylabel(r'$\alpha_t$', fontsize = fsz, color='k')
        # Raw string: '\m' is an invalid escape sequence in a normal literal.
        ax.set_xlabel(r'gene expression rank ($\mathrm{r}_t$)', fontsize=fsz, color='k')
        ax.tick_params('both',which='major', length=7,labelsize=fsz)
        ax.tick_params('both',which='minor', length=7,labelsize=fsz)
        ax.set(xscale= 'log')
        ax.set(yscale= 'log')
        labels = ['High abundance transcript (HAT)','Low abundance transcript (LAT)']
        legend_elements = [Line2D([0], [0], marker='o',color = 'r', lw=0, markerfacecolor='r', label=labels[0],
                                      markersize=8),
                            Line2D([0], [0], marker='o',color = 'g', lw=0, markerfacecolor='g', label=labels[1],
                                      markersize=8)]

        ax.legend(handles=legend_elements,markerscale=2, loc='upper right', borderaxespad=0.05, fontsize = fsz-4)
        fig.savefig(save_plot_path)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

In [None]:
def cluster(df, expt_list):
    """Split genes into two groups with spectral clustering.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'StartPos', 'EndPos' and 'Gene_ID'; the remaining
        columns (except an existing 'labels' column) are the clustering
        features. A 'labels' column holding the cluster id of every row is
        written onto ``df`` in place.
    expt_list : list of str
        Only the first entry is used, to name the optional plot file.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the 'labels' column.

    Notes
    -----
    Reads the module-level ``gamma_value``, ``save_img`` and
    ``chromosome_output_folder``; the latter must be set by the caller before
    this function runs when ``save_img`` is true.
    NOTE(review): scikit-learn documents ``n_neighbors`` as applying to the
    nearest-neighbours affinity, so it is presumably unused with
    ``affinity="laplacian"`` -- confirm.
    """
    # Features = everything except gene annotation. A pre-existing 'labels'
    # column (e.g. the DBSCAN labels written by high_lo_preprocessing) is
    # excluded too, so an earlier cluster id never leaks into the features.
    X = df.drop(['StartPos','EndPos','Gene_ID'], axis=1).drop(columns=['labels'], errors='ignore')
    df['labels'] = SpectralClustering(n_neighbors=100, assign_labels='discretize',
                                      random_state=123, gamma=gamma_value, n_clusters=2,
                                      affinity="laplacian", eigen_solver="arpack").fit_predict(X)
    if save_img:
        # Every column is sorted independently in descending order; apply()
        # already returns a new frame, so no extra copy is needed.
        ranked_df = df.apply(lambda x: x.sort_values(ascending=False).values)
        save_cluster(ranked_df, os.path.join(chromosome_output_folder, expt_list[0]+"_LAT_HAT_cluster.png"))

    return df

In [None]:
def high_lo_preprocessing(df):
    """Separate DBSCAN noise points from the remaining rows.

    DBSCAN (Manhattan metric, scikit-learn defaults otherwise) is fitted on
    every column except 'StartPos', 'EndPos', 'Gene_ID' and any existing
    'labels' column. The resulting labels are written to ``df['labels']`` in
    place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'StartPos', 'EndPos' and 'Gene_ID' plus the feature
        columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``reg_df`` with the rows whose label is not -1 and ``abnormal_df``
        with the rows labelled -1 (scikit-learn's marker for noise samples).
        Both are independent copies, so callers can add columns to them
        without chained-assignment warnings.
    """
    # Dropping a previous 'labels' column keeps a re-run on the same frame
    # from feeding old labels back in as a feature.
    X = df.drop(['StartPos','EndPos','Gene_ID'], axis=1).drop(columns=['labels'], errors='ignore')
    df['labels'] = DBSCAN(metric="manhattan").fit(X).labels_
    is_noise = df['labels'] == -1
    reg_df = df[~is_noise].copy()
    abnormal_df = df[is_noise].copy()
    return reg_df, abnormal_df

In [None]:
def get_high_lo(df, expt_list):
    """Cluster the rows and split them into the high- and low-mean cluster.

    ``cluster`` assigns a 'labels' column; the per-row mean over
    ``expt_list`` is stored in 'mean_value'. The cluster whose average
    'mean_value' is largest is returned as the high group, the one whose
    average is smallest as the low group.

    Returns
    -------
    tuple
        ``(df_hi, df_lo, hi_cluster_label, lo_cluster_label)``.
    """
    df = cluster(df, expt_list)
    df['mean_value'] = df[expt_list].mean(axis=1)

    # One row per cluster label; the averaged value is stored under the name
    # of the first experiment.
    value_col = expt_list[0]
    cluster_means = df.groupby('labels')['mean_value'].mean().rename(value_col).reset_index()

    def _first_label_with(target):
        """Return the first cluster label whose average equals ``target``."""
        matching = cluster_means.loc[cluster_means[value_col] == target, 'labels']
        return matching.reset_index(drop=True)[0]

    hi_cluster_label = _first_label_with(cluster_means[value_col].max())
    lo_cluster_label = _first_label_with(cluster_means[value_col].min())

    df_hi = df.loc[df['labels'] == hi_cluster_label]
    df_lo = df.loc[df['labels'] == lo_cluster_label]
    print("Bucket sizes (HAT, LAT):", len(df_hi), len(df_lo))
    return df_hi, df_lo, hi_cluster_label, lo_cluster_label

### PCA and SVD computation functions

In [None]:
def extract_features_SVD(df, k):
    """Return the rank-``k`` truncated-SVD scores of ``df`` as a DataFrame.

    The result has one row per row of ``df`` and ``k`` columns: each left
    singular vector returned by ``scipy.sparse.linalg.svds`` scaled by its
    singular value.
    """
    matrix = df.to_numpy()
    left_vectors, singular_values, _right_vectors = svds(matrix, k=k)
    scores = left_vectors * singular_values  # shape: N x k
    return pd.DataFrame(scores)

In [None]:
def extract_features_PCA(df, n_components=10):
    """Project ``df`` onto its leading principal components.

    Parameters
    ----------
    df : array-like
        Data passed to ``PCA.fit_transform``.
    n_components : optional
        Forwarded to ``sklearn.decomposition.PCA``. Defaults to 10, the
        value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One column per retained component, named 'principal component 1',
        'principal component 2', ... with a default integer index.
    """
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df)
    # Build the column names from the actual output width instead of a
    # hand-typed list of ten names.
    component_names = [
        f'principal component {i}'
        for i in range(1, principal_components.shape[1] + 1)
    ]
    return pd.DataFrame(data=principal_components, columns=component_names)

In [None]:
def Compute_PCA_SVD(normalized_df, df_PCA, df_SVD, indices_names, count, k):
    """Add one frame's experiment-to-experiment distances to the running sums.

    ``normalized_df`` is reshaped so that rows are its remaining columns
    (after dropping 'StartPos', 'EndPos' and 'index') and columns are
    Gene_IDs. Pairwise city-block distances between rows are computed twice:
    on the PCA scores and on the standardized rank-``k`` SVD scores.

    On the first call (``count == 0``) the two distance matrices replace
    ``df_PCA`` / ``df_SVD`` and the row order is recorded in
    ``indices_names``; on later calls they are added to the accumulators.

    Returns
    -------
    tuple
        ``(df_PCA, df_SVD, indices_names, count + 1)``.
    """
    gene_indexed = normalized_df.drop(columns=['StartPos','EndPos','index']).set_index('Gene_ID')
    df_t = gene_indexed.T.sort_index(ascending=True)
    df_t.columns = df_t.columns.astype(str)

    # PCA scores, re-indexed by the row labels of the transposed frame.
    pca_scores = extract_features_PCA(pd.DataFrame(df_t)).set_index(df_t.index)
    pca_scores.columns = pca_scores.columns.astype(str)

    # SVD scores, standardized column-wise before measuring distances.
    svd_scores = extract_features_SVD(df_t, k).set_index(df_t.index)
    svd_scaled = pd.DataFrame(
        StandardScaler().fit_transform(svd_scores.values),
        index=svd_scores.index,
        columns=svd_scores.columns,
    )

    pca_dist = pd.DataFrame(cdist(pca_scores, pca_scores, 'cityblock'))
    svd_dist = pd.DataFrame(cdist(svd_scaled, svd_scaled, 'cityblock'))

    if count == 0:
        df_PCA, df_SVD = pca_dist, svd_dist
        indices_names = list(pca_scores.index)
    else:
        df_PCA = df_PCA + pca_dist
        df_SVD = df_SVD + svd_dist

    return df_PCA, df_SVD, indices_names, count + 1

### Main processing pipeline to compute and store HAT/LAT data per chromosome

In [None]:

# Read the input table, force chromosome names to strings and order by gene.
df = (
    pd.read_csv(input_file)
    .astype({'Chromosome': str})
    .sort_values("Gene_ID")
)

# Running sums of the distance matrices, filled by Compute_PCA_SVD.
df_PCA, df_SVD = pd.DataFrame(), pd.DataFrame()
indices_names = []
k = 20  # number of singular vectors requested from the SVD
count = 0  # number of Compute_PCA_SVD calls accumulated so far

In [None]:
# Concatenate the replicate groups in rep_list into one flat list of names.
expt_list_flat = []
for replicate_group in rep_list:
    expt_list_flat.extend(replicate_group)

In [None]:

# Columns that identify a gene row; used for column selection and as merge key.
gene_key_cols = ['StartPos', 'EndPos', 'Gene_ID']

for chromosome, data in df.groupby('Chromosome'):
    # Only alphanumeric chromosome names are processed; MT and Y are skipped.
    if not chromosome.isalnum() or chromosome in ['MT', 'Y']:
        continue
    print("Processing chromosome:", chromosome)

    # `chromosome_output_folder` is also read as a global by cluster() when
    # it saves the cluster plot, so it must be set before the calls below.
    chromosome_output_folder = os.path.join(output_folder, "chromosome_" + str(chromosome))
    os.makedirs(chromosome_output_folder, exist_ok=True)
    data_hi_folder = os.path.join(chromosome_output_folder, "data_hi")
    data_lo_folder = os.path.join(chromosome_output_folder, "data_lo")
    os.makedirs(data_hi_folder, exist_ok=True)
    os.makedirs(data_lo_folder, exist_ok=True)

    normalized_df = data_preprocessing(data, expt_list_flat)
    # Accumulate the PCA / SVD distance matrices for this chromosome.
    df_PCA, df_SVD, indices_names, count = Compute_PCA_SVD(normalized_df, df_PCA, df_SVD, indices_names, count, k)

    new_df = None

    for rep in rep_list:
        print("Processing replicate group:", rep)
        # `list(rep) + ...` already builds a new list; no deepcopy needed.
        subset_df = normalized_df[list(rep) + gene_key_cols]
        reg_df, abnormal_df = high_lo_preprocessing(subset_df)
        # NOTE(review): hi_cluster_label / lo_cluster_label are not used
        # below; the mask stores the raw cluster ids, so which id means HAT
        # is not recorded in the output -- confirm consumers expect that.
        data_hi, data_lo, hi_cluster_label, lo_cluster_label = get_high_lo(reg_df, rep)
        data_hi_file = os.path.join(data_hi_folder, f"chromosome_{chromosome}_rep_{rep[0]}_data_hi.csv")
        data_lo_file = os.path.join(data_lo_folder, f"chromosome_{chromosome}_rep_{rep[0]}_data_lo.csv")
        data_hi.to_csv(data_hi_file)
        data_lo.to_csv(data_lo_file)

        rep_df = pd.concat([data_hi, data_lo], axis=0)[gene_key_cols + ['labels'] + list(rep)].reset_index(drop=True)
        if new_df is None:
            new_df = rep_df
        else:
            # Merge on the positions as well as Gene_ID: a gene missing from
            # the earlier replicate groups (e.g. flagged as noise there) then
            # keeps its StartPos / EndPos instead of getting NaN.
            new_df = pd.merge(new_df, rep_df, how='outer', on=gene_key_cols)
        new_df = new_df.sort_values(by=['StartPos', 'EndPos'])

        # Replace this group's expression columns by its cluster label, then
        # drop the temporary 'labels' column before the next group is merged.
        for exp in rep:
            new_df[exp] = new_df['labels']
        new_df = new_df.drop(['labels'], axis=1)

    if new_df is None:
        # rep_list was empty: there is no mask to write for this chromosome.
        print(f"No replicate groups configured; no mask written for chromosome {chromosome}")
        continue

    out_file = os.path.join(chromosome_output_folder, f"chromosome_{chromosome}_ec_hc_mask_xy.csv")
    new_df.to_csv(out_file, index=False)
    print(f"Saved HAT/LAT mask data for chromosome {chromosome} in {out_file}")
print("HAT and LAT gene computations completed and stored per chromosome.")

### Saving PCA and SVD data

In [None]:
# Average the accumulated PCA-based distance matrix over the number of
# Compute_PCA_SVD calls, label its rows with the recorded experiment names,
# transpose, and write it to CSV.
df_PCA = (
    (df_PCA / count)
    .assign(experiments=indices_names)
    .set_index('experiments')
    .T
    .rename_axis("chromosome_id")
    .reset_index()
)
df_PCA.to_csv(os.path.join(output_folder, "fpkm_data_PCA.csv"), index=False)

In [None]:
# Average the accumulated SVD-based distance matrix over the number of
# Compute_PCA_SVD calls, label its rows with the recorded experiment names,
# transpose, and write it to CSV.
df_SVD = (
    (df_SVD / count)
    .assign(experiments=indices_names)
    .set_index('experiments')
    .T
    .rename_axis("chromosome_id")
    .reset_index()
)
df_SVD.to_csv(os.path.join(output_folder, "fpkm_data_SVD20.csv"), index=False)