# This is a script for calculating rank product
#### Rank product can be used to get the rank of each LR for each group (e.g. subtype), using multiple samples
#### Using MNL COEFFICIENT

In [31]:
"""Calculate rank product from scRNA-seq fold-change data."""
"""Implemented from Christie's script"""

import yaml
import sys
import pandas as pd
from scipy.stats.mstats import gmean
import os
import re

In [32]:
def rank_product(foldchange: pd.DataFrame, n_permutations: int = 100) -> pd.DataFrame:
    """Calculate rank product from differential expression fold change data.

    Args:
        foldchange (pd.DataFrame): Fold change values with one row per
        feature (e.g. gene) and one column per sample. Values are ranked
        within each column, largest value first. Missing values stay
        unranked and are ignored when ranks are combined across columns.
        n_permutations (int, optional): Number of permutations with which to
        calculate significance levels. Defaults to 100.

    Returns:
        pd.DataFrame: Per-column ranks of the input (not the original fold
        change values), with additional columns for the geometric mean of
        ranks (geo_mean), the position of the row when sorted by that mean
        (rank), and the permutation-based significance estimate (pVal).
        Rows are sorted by ascending geo_mean. The input frame is not
        modified.
    """
    # Rank within each column, then combine the ranks of each row with a
    # geometric mean; the position in ascending geo_mean order is the final
    # rank product rank.
    rank = foldchange.rank(axis=0, ascending=False)
    rank["geo_mean"] = gmean(rank.to_numpy(), axis=1, nan_policy="omit")
    rank.sort_values("geo_mean", inplace=True)
    rank["rank"] = range(1, len(rank) + 1)

    # Estimate significance by shuffling every column independently and
    # counting, per row, how often the permuted geometric mean is at least as
    # good (as small) as the observed one.
    shuffle = rank.drop(["geo_mean", "rank"], axis=1).copy()
    observed = rank["geo_mean"]
    # A running count avoids growing a DataFrame by one column per
    # permutation.
    hits = pd.Series(0, index=shuffle.index)
    for _ in range(n_permutations):
        for col in shuffle.columns:
            shuffle[col] = shuffle[col].sample(frac=1).values
        # Use the same NaN handling as for the observed values. Without it,
        # any row that receives a missing rank yields a NaN mean, never
        # counts as a hit, and its pVal is biased downwards.
        permuted = gmean(shuffle.to_numpy(), axis=1, nan_policy="omit")
        hits += (observed >= permuted).to_numpy()

    # NOTE(review): hits only counts permuted values from the same row;
    # confirm against the reference rank product method whether the counts
    # should be pooled across all rows before dividing by rank.
    erp = hits / n_permutations
    rank["pVal"] = erp / rank["rank"]

    return rank



In [40]:
# Read the MNL coefficient tables, one CSV file per subtype.
directory_path = '/Users/victoriagao/local_docs/NEST/stored_variables/MNL_subtype_coefficients/3.5/'
file_names = [
    'new_aggregated_coefficients_pvalues_for_BasalB.csv',
    'new_aggregated_coefficients_pvalues_for_ClassicA.csv',
]

# The label for a file is the text between "for_" and ".csv" in its name.
pattern = re.compile(r'for_(.*?)\.csv')
data = {}

for file_name in file_names:
    file_path = os.path.join(directory_path, file_name)
    # Only the pair name and its coefficient are needed from each table.
    df = pd.read_csv(file_path)[['ligand-receptor', 'Coefficient']]
    # Map each ligand-receptor pair to its coefficient.
    data_dict = df.set_index('ligand-receptor')['Coefficient'].to_dict()

    label = pattern.search(file_name).group(1)
    # Store the mapping under the label extracted from the file name.
    data[label] = data_dict

# One row per label, one column per ligand-receptor pair.
df_final = pd.DataFrame.from_dict(data, orient='index')


In [41]:
df_final

Unnamed: 0,HLA-DRA,GSTP1-IL7R,PSAP-ERBB2,CALR-ITGA5,C3-LPAR2,APP-TNFRSF14,MIF-CD74,THBS1-LRP5,CELSR1-HLA,ANXA1-F2R,...,GDF15-ERBB2,HSP90B1-ERBB2,TNFSF12-TNFRSF12A,C3-FPR1,LGALS3-NPTN,FN1-IGF2R,PKD1-CFTR,GDF15-TGFBR2,APOE-SDC2,ITGB1-ITGA11
BasalB,0.308893,0.277297,0.268669,0.246829,0.234998,0.212181,0.207838,0.190971,0.189419,0.180496,...,-0.13409,-0.13622,-0.155665,-0.157654,-0.15911,-0.160355,-0.19903,-0.199778,-0.221654,-0.258492
ClassicA,-0.036986,-0.070131,-0.020469,0.010304,-0.201364,-0.117449,-0.202332,-0.045784,-0.142147,-0.026344,...,-0.046564,0.067896,0.191911,0.031672,0.003202,-0.003834,0.081375,0.063486,-0.016156,-0.119712


In [42]:
# Swap rows and columns of df_final so each ligand-receptor pair becomes a row.
df_pivoted = df_final.T
df_pivoted

Unnamed: 0,BasalB,ClassicA
HLA-DRA,0.308893,-0.036986
GSTP1-IL7R,0.277297,-0.070131
PSAP-ERBB2,0.268669,-0.020469
CALR-ITGA5,0.246829,0.010304
C3-LPAR2,0.234998,-0.201364
...,...,...
FN1-IGF2R,-0.160355,-0.003834
PKD1-CFTR,-0.199030,0.081375
GDF15-TGFBR2,-0.199778,0.063486
APOE-SDC2,-0.221654,-0.016156


In [39]:
def main():
    """Compute the rank product of ``df_pivoted`` and write it to CSV.

    Reads the module-level ``df_pivoted`` table, passes it to
    ``rank_product`` and saves the result as ``new_rank_product.csv`` inside
    ``directory_path``.
    """
    coefficient_df = df_pivoted
    rank_df = rank_product(coefficient_df)
    # Join the path components explicitly so the output lands inside
    # directory_path whether or not it ends with a path separator.
    output_path = os.path.join(directory_path, "new_rank_product.csv")
    rank_df.to_csv(output_path)

main()

# if __name__ == '__main__':
#     sys.exit(main())