In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Do not truncate the column list when displaying wide DataFrames.
pd.set_option('display.max_columns', None)
# IPython magic: render inline matplotlib figures in the 'retina' format.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Load the latest metadata table (the ZL-updated version, IHC already excluded).
# The file is tab-separated.
df = pd.read_csv(
    "../input-data/SA/data_updated230524_new_excludedIHC.tsv",
    sep="\t",
)
print(df.shape)

# Spot-check the rows at positions 107 and 226, showing columns 1 through 49.
df.iloc[[107, 226], 1:50]


In [None]:
# Drop 22 score columns: 19 CIBERSORT (C_*) cell-type scores plus
# Bindea_full, Expanded_IFNg and S_PAM100HRD.
# NOTE(review): the original note said 3 CIBERSORT scores remain afterwards —
# confirm against the printed shape / head below.
dfd = df.drop(
    columns=[
        'Bindea_full',
        'Expanded_IFNg',
        'C_Bcellsmemory',
        'C_Plasmacells',
        'C_TcellsCD8',
        'C_TcellsCD4naive',
        'C_TcellsCD4memoryactivated',
        'C_Tcellsfollicularhelper',
        'C_Tcellsregulatory(Tregs)',
        'C_Tcellsgammadelta',
        'C_NKcellsresting',
        'C_NKcellsactivated',
        'C_Monocytes',
        'C_MacrophagesM0',
        'C_MacrophagesM1',
        'C_Dendriticcellsresting',
        'C_Dendriticcellsactivated',
        'C_Mastcellsresting',
        'C_Mastcellsactivated',
        'C_Eosinophils',
        'C_Neutrophils',
        'S_PAM100HRD',
    ]
)

print(dfd.shape)
dfd.head()

In [None]:
# Remove the clinical annotations and the per-class neoantigen metrics so that
# TotalNeo_Count is the only X variable left alongside the immune scores
# (the Y variables).
dfd_x = dfd.drop(
    columns=[
        'PAM50',
        'Subtype',
        'HR_status',
        'HER_status',
        'Age',
        'AgeGroup',
        'Stage',
        'TumorGrade',
        'TumourSize',
        'FusionNeo_Count',
        'FusionNeo_bestScore',
        'FusionTransscript_Count',
        'Fusion_T2NeoRate',
        'SNVindelNeo_Count',
        'SNVindelNeo_IC50',
        'SNVindelNeo_IC50Percentile',
    ]
)

print(dfd_x.shape)
dfd_x.describe()

In [None]:
# The per-column counts from describe() differ, so some values must be missing:
# tally the NaNs in every column.
dfd_x.isna().sum()

In [None]:
# For now, discard every row that has at least one missing value.
dfd_xc = dfd_x.dropna(axis=0, how='any')

print(dfd_xc.shape)
# Sanity check: every per-column NaN count should now be zero.
print(dfd_xc.isna().sum())
dfd_xc.head()

In [None]:
# Use the ID column as the row index; all remaining columns are kept.
dfd_ss = dfd_xc.set_index('ID')

# Record the remaining column names and how many there are.
ss_cols = dfd_ss.columns.tolist()
print(ss_cols)
print(len(ss_cols))
dfd_ss.head()

Let's programmatically split the large dataframe into subsets of columns so that each scatter-plot matrix stays manageable. First, define the function.

In [None]:
import typing
def subset_df_by_columns(df: pd.DataFrame, num_subsets: int, x_variable: str) -> typing.Dict[int, pd.DataFrame]:
    """Subset a DataFrame into approximately equal groups of columns.

    The first two columns (``'Batch'`` and ``x_variable``) are shared by every
    subset; the remaining columns are split, in their original order, into
    ``num_subsets`` contiguous groups. When the columns do not divide evenly,
    the earliest subsets receive one extra column each. If ``num_subsets``
    exceeds the number of remaining columns, the trailing subsets contain only
    the two shared columns.

    Args:
        df (pd.DataFrame): The input DataFrame. Its first column must be
            'Batch' and its second column must be ``x_variable``.
        num_subsets (int): The desired number of subsets (must be >= 1).
        x_variable (str): The name of the column to use as the x-axis variable.

    Returns:
        typing.Dict[int, pd.DataFrame]: A dictionary keyed by subset index
        (0 .. num_subsets - 1); each value holds the two shared columns
        followed by that subset's share of the remaining columns.

    Raises:
        ValueError: If ``num_subsets`` is smaller than 1, or if the first two
            columns are not exactly 'Batch' followed by ``x_variable``.
    """
    if num_subsets < 1:
        raise ValueError("num_subsets must be a positive integer.")

    # Both leading columns must match; a mismatch in either one is an error.
    if list(df.columns[:2]) != ['Batch', x_variable]:
        raise ValueError(f"The first two columns of the DataFrame must be 'Batch' and the specified X variable ({x_variable}).")

    # Shared leading columns vs. the columns to be distributed
    df_x = df.iloc[:, :2]
    df_y = df.iloc[:, 2:]

    # Base number of Y columns per subset, plus how many subsets get one extra
    cols_per_subset, remainder = divmod(df_y.shape[1], num_subsets)

    subsets = {}
    start = 0
    for i in range(num_subsets):
        # The first `remainder` subsets absorb the leftover columns
        end = start + cols_per_subset + (1 if i < remainder else 0)
        # Prepend the shared columns to this contiguous slice of Y columns
        subsets[i] = pd.concat([df_x, df_y.iloc[:, start:end]], axis=1)
        start = end

    return subsets

In [None]:
# Split the immune-score columns into 12 subsets; every subset also carries
# the shared Batch and TotalNeo_Count columns.
num_ss = 12
ss_dict = subset_df_by_columns(dfd_ss, num_ss, 'TotalNeo_Count')
print(len(ss_dict))

# Inspect one of the subsets.
ss_dict[10]

In [None]:
# X variable TotalNeo_Count should be transformed due to massive outliers
# Apply log transformation as a map to the dictionary of dfs

##### DEPRECATED ######
# ss_logtrans_dict = {
#     # apply log transform just on the TotalNeo_Count
#     # key: df.assign(TotalNeo_Count=lambda x: np.log1p(x['TotalNeo_Count']))
#     # log transform all instead of just Total Neo Count
#     key: df[['Batch']].join(df.drop('Batch', axis=1).apply(np.log1p))
#     for key, df in ss_dict.items()
# }

# IMPRES is a discrete score, so log-transforming it is not meaningful: it is
# passed through unchanged while every other non-Batch column gets log1p.

ss_logtrans_dict = {}

# The loop variable is deliberately NOT named `df`: at notebook (global) scope
# that would overwrite the raw DataFrame loaded earlier and break re-running
# the cells that derive from it.
for key, subset in ss_dict.items():
    if 'IMPRES' in subset.columns:
        # Keep Batch and IMPRES as-is, log1p everything else, then re-join on the index
        ss_logtrans_dict[key] = subset[['Batch', 'IMPRES']].join(subset.drop(['Batch', 'IMPRES'], axis=1).apply(np.log1p))
        # The join puts IMPRES ahead of TotalNeo_Count; restore the order
        # Batch, TotalNeo_Count, IMPRES, <remaining columns>
        col_tokeep = [col for col in ss_logtrans_dict[key].columns if col not in ['Batch', 'TotalNeo_Count', 'IMPRES']]
        new_order = ['Batch', 'TotalNeo_Count', 'IMPRES'] + col_tokeep
        ss_logtrans_dict[key] = ss_logtrans_dict[key][new_order]
    else:
        # No discrete score in this subset: log1p every column except Batch
        ss_logtrans_dict[key] = subset[['Batch']].join(subset.drop('Batch', axis=1).apply(np.log1p))

In [None]:
# Display the untransformed subset stored under key 0.
ss_dict[0]

In [None]:
# Display the log-transformed subset stored under key 0.
ss_logtrans_dict[0]

In [None]:
# Pairplot of one untransformed subset (trial run on a single subset).
set_num = 0
pp = sns.pairplot(
    ss_dict[set_num],
    hue='Batch',
    diag_kind="kde",
    kind='reg',
    corner=True,
    plot_kws={'scatter_kws': {'alpha': 0.5, 's': 10}},
    palette='Set1',
)
# Figure-level title
plt.suptitle(f'Raw Total Neoantigen Count vs Immune Features (Set No. {set_num})', fontsize=28, fontweight='medium')

# Make every axis label bold, and colour the TotalNeo_Count label red.
label_style = {'fontweight': 'bold', 'fontsize': 12}
grid_width = pp.axes.shape[1]
for idx, ax in enumerate(pp.axes.flat):
    # Some grid slots hold no axis (None) and are skipped
    if ax is None:
        continue

    x_text = ax.get_xlabel()
    x_extra = {'color': 'red'} if x_text == "TotalNeo_Count" else {}
    ax.set_xlabel(x_text, **label_style, **x_extra)

    # y-axis labels are only restyled on the leftmost grid column
    if idx % grid_width == 0:
        y_text = ax.get_ylabel()
        y_extra = {'color': 'red'} if y_text == "TotalNeo_Count" else {}
        ax.set_ylabel(y_text, **label_style, **y_extra)


plt.show()

In [None]:
# Re-plot the same subset using the log-transformed data (all columns except IMPRES).
# test a subset df
set_num = 0
pp = sns.pairplot(
    ss_logtrans_dict[set_num],  # indexed by set_num so the data always matches the title below
    hue='Batch',
    diag_kind="kde",
    kind='reg',
    corner=True,
    plot_kws={'scatter_kws': {'alpha': 0.5, 's': 10}},
    palette='Set1',
)
# add plot title
plt.suptitle(f'Log-Transformed Total Neoantigen Count vs Immune Features (Set No. {set_num})', fontsize=28, fontweight='medium')

# Iterate through the axes, bolding every label and highlighting TotalNeo_Count in red
for i, ax in enumerate(pp.axes.flat):
    # Some grid slots hold no axis (None) and are skipped
    if ax is None:
        continue

    if ax.get_xlabel() == "TotalNeo_Count":
        ax.set_xlabel(ax.get_xlabel(), fontweight='bold', fontsize=12, color='red')
    else:
        ax.set_xlabel(ax.get_xlabel(), fontweight='bold', fontsize=12)

    # Handle y-axis labels (only for the leftmost column of the grid)
    if i % pp.axes.shape[1] == 0:
        if ax.get_ylabel() == "TotalNeo_Count":
            ax.set_ylabel(ax.get_ylabel(), fontweight='bold', fontsize=12, color='red')
        else:
            ax.set_ylabel(ax.get_ylabel(), fontweight='bold', fontsize=12)


plt.show()

In [None]:
from contextlib import contextmanager
import gc

@contextmanager
def plot_and_save(output_path, naming_var):
    """Context manager that saves the current matplotlib figure as a PDF and cleans up.

    The plotting code runs inside the ``with`` body. If it completes without
    raising, the current figure is written to ``{output_path}/{naming_var}.pdf``.
    Whether or not plotting (or saving) succeeds, the current figure is closed
    and a garbage-collection pass is triggered, so figures do not accumulate
    when many plots are generated in a loop.

    Args:
        output_path: Directory in which to write the PDF.
        naming_var: File name (without extension) of the PDF.
    """
    try:
        yield
        # Save only after the body succeeded, so a half-drawn figure is never
        # written to disk when plotting raises.
        plt.savefig(f'{output_path}/{naming_var}.pdf', dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close()
        gc.collect()

def process_pairplots(df: pd.DataFrame, output_path: str, naming_var: str, hue_col: str = None,
                      title_prefix: str = 'Log-Transformed Total Neoantigen Count vs Immune Features'):
    """
    Generates a Seaborn pairplot for the given DataFrame and saves it as a PDF.

    The plot is drawn inside ``plot_and_save``, which writes the figure to
    ``{output_path}/{naming_var}.pdf`` and closes it afterwards.

    Args:
        df (pd.DataFrame): The input DataFrame.
        output_path (str): The path to save the plots.
        naming_var (str): The variable name string to use for naming the output file;
            it is also appended (in parentheses) to the figure title.
        hue_col (str, optional): The column name string to use for stratifying the scatter plot points.
        title_prefix (str, optional): Leading text of the figure title. The default keeps the
            original log-transformed wording; pass a different prefix when plotting data
            that has not been log-transformed.

    Example:
        process_pairplots(df, 'output_path', 'naming_var', 'hue_col')

    Returns:
        None
    """
    with plot_and_save(output_path, naming_var):
        sns.pairplot(
            df,
            hue=hue_col,
            diag_kind="kde",
            kind='reg',
            corner=True,
            plot_kws={'scatter_kws': {'alpha': 0.5, 's': 10}},
            palette='Set1',
        )
        # add plot title
        plt.suptitle(f'{title_prefix} ({naming_var})', fontsize=20, fontweight=600)

## Visualizing Distributions and Transforming Data

In [None]:
# Exchange the positions of the 'IMPRES' and 'ESTIMATE' columns.
cols_arr = list(dfd_ss.columns)

# Locate both columns, then swap the two entries of the name list in place
indx_A, indx_B = cols_arr.index('IMPRES'), cols_arr.index('ESTIMATE')
cols_arr[indx_A], cols_arr[indx_B] = cols_arr[indx_B], cols_arr[indx_A]

# Select the columns in the new order
dfda_full = dfd_ss[cols_arr]
dfda_full


In [None]:
# visualise distributions

# Default number of rows and columns for the subplot grid
nrows = 10
ncols = 12

#define the plotting function
def visualise_distribution(df, n_rows=None, n_cols=None, figsize=(30, 22),
                           title='Original Distribution of the Dataset'):
    """Plot one histogram per DataFrame column on a single grid of subplots.

    Args:
        df (pd.DataFrame): Frame whose columns are plotted, one per subplot,
            in column order.
        n_rows (int, optional): Number of grid rows. Defaults to the
            notebook-level ``nrows`` value at call time.
        n_cols (int, optional): Number of grid columns. Defaults to the
            notebook-level ``ncols`` value at call time.
        figsize (tuple, optional): Figure size passed to ``plt.subplots``.
        title (str, optional): Figure-level title.

    Raises:
        ValueError: If ``df`` has more columns than the grid has cells.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Fall back to the notebook-level defaults when no explicit size is given
    grid_rows = nrows if n_rows is None else n_rows
    grid_cols = ncols if n_cols is None else n_cols

    columns = list(df.columns)
    if len(columns) > grid_rows * grid_cols:
        raise ValueError(
            f"DataFrame has {len(columns)} columns but the grid only has "
            f"{grid_rows * grid_cols} cells ({grid_rows} x {grid_cols})."
        )

    # squeeze=False always yields a 2-D axes array, so flattening is safe
    # even for a 1 x 1 grid
    fig, axes = plt.subplots(nrows=grid_rows, ncols=grid_cols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    # Plot histograms for each column
    for ax, column in zip(axes, columns):
        sns.histplot(df[column], kde=False, ax=ax, color='green')
        ax.set_title(column)
        ax.set_xlabel('')
        ax.set_ylabel('')

    # Remove the unused subplots (all of them when df has no columns)
    for unused_ax in axes[len(columns):]:
        fig.delaxes(unused_ax)

    # Adjust the layout, leaving room at the top for the figure title
    plt.tight_layout()
    plt.subplots_adjust(top=0.92)
    plt.suptitle(title, fontsize=20, fontweight='bold')
    # plt.savefig(f'Distribution_before.png',dpi=300)
    plt.show()

In [None]:

# Plot the per-column histograms of the reordered, untransformed frame.
visualise_distribution(dfda_full)

### Yeo-Johnson Transformation
The Yeo-Johnson transformation is an extension of the Box-Cox transformation and belongs to the family of power transformations. The Yeo-Johnson (YJ) algorithm is designed to handle both positive and negative values in the dataset. Like Box-Cox, it aims to stabilize variance, make the data more symmetric, and bring it closer to a normal distribution.

In [None]:
#YJ transformation
# from scipy.stats import yeojohnson
# yj = lambda x: yeojohnson(x)[0]
# df=df.apply(yj)

# zscore = lambda x : stats.zscore(x)
# df_input_yjz_b1 = df_input_yj_b1.apply(zscore)
# df_input_yjz_b2 = df_input_yj_b2.apply(zscore)

In [None]:
# from scipy.stats import yeojohnson
# import pandas as pd
# import numpy as np

# # Assuming df is your DataFrame
# # and numeric_columns is a list of column names you want to transform

# def yj_transform(data):
#     transformed, lambda_param = yeojohnson(data)
#     return pd.Series(transformed, index=data.index, name=data.name)

# # Apply the transformation only to specified numeric columns
# df[numeric_columns] = df[numeric_columns].apply(yj_transform)


In [None]:
# # replot heatmap on all_log-transformed data
# corr_df_test = ss_logtrans_dict[0].drop(columns='Batch').corr(method='spearman')
# corr_df_test = corr_df_test.round(2)

# # Create a mask for the upper triangle
# mask = np.triu(np.ones_like(corr_df_test, dtype=bool))

# plt.figure(figsize=(12, 10))
# # Create the correlation matrix and represent it as a heatmap.
# hm = sns.heatmap(corr_df_test, annot = True, cmap = 'coolwarm', square = True, linewidths=0.5, mask=mask, cbar_kws={"shrink": .5})

# # Get current labels
# ylabels = hm.get_yticklabels()
# xlabels = hm.get_xticklabels()

# # Hide the first y-axis label and the last x-axis label
# ylabels[0].set_visible(False)
# xlabels[-1].set_visible(False)

# # Rotate and align the tick labels
# plt.setp(xlabels, rotation=45, ha='right')

# # Change color of specific x-axis label
# for label in xlabels:
#     if label.get_text() == "TotalNeo_Count":
#         label.set_color('red')  # Change color to red
#         label.set_fontweight('bold')

# # Removes all ticks
# hm.tick_params(left=False, bottom=False)

# hm.set_title('Dataframe #', fontsize=14, x=0.4)

# plt.show()

In [None]:
# # define a heatmap plot function
# from contextlib import contextmanager
# import gc

# @contextmanager
# def plot_and_save(output_path, naming_var):
#     try:
#         yield
#     finally:
#         plt.savefig(f'{output_path}/{naming_var}.pdf', dpi=300)
#         plt.close()
#         gc.collect()

# def process_heatmaps(df: pd.DataFrame, output_path: str, naming_var: str):
#     with plot_and_save(output_path, naming_var):
#         corr_df_test = df.drop(columns='Batch').corr(method='spearman')

#         # Create a mask for the upper triangle
#         mask = np.triu(np.ones_like(corr_df_test, dtype=bool))

#         # Create the correlation matrix and represent it as a heatmap.
#         hm = sns.heatmap(corr_df_test, annot = True, cmap = 'coolwarm', square = True, linewidths=0.5, mask=mask, cbar_kws={"shrink": .5})

#         # Get current labels
#         ylabels = hm.get_yticklabels()
#         xlabels = hm.get_xticklabels()

#         # Hide the first y-axis label and the last x-axis label
#         ylabels[0].set_visible(False)
#         xlabels[-1].set_visible(False)

#         # Removes all ticks
#         hm.tick_params(left=False, bottom=False)

#         hm.set_title(f'{naming_var}', fontsize=14, x=0.4)

### PCA for Batch Effect Assessment

Run PCA to test for potential batch effects in the data.


In [None]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA


# # Assuming 'Batch' is your batch column and the rest are features
# features = [col for col in dfd_ss.columns if col != 'Batch']
# X = dfd_ss[features]
# batch = dfd_ss['Batch']

# # Standardize the features
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Perform PCA
# pca = PCA(n_components=2)
# pca_result = pca.fit_transform(X_scaled)

# # Create a DataFrame with PCA results
# pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])
# pca_df['Batch'] = batch.values

# # Plot
# plt.figure(figsize=(10, 8))
# sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Batch', palette='deep')
# plt.title('PCA of Dataset Colored by Batch')
# plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} explained variance)')
# plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} explained variance)')

# # Add a legend title
# plt.legend(title='Batch')

# plt.tight_layout()
# plt.show()