# This script fits a linear regression for each LR pair and combine to a big table

In [20]:
import numpy as np
import csv
import pickle
import matplotlib
import math
import pandas as pd
import matplotlib
from sklearn.utils import resample
from sklearn import linear_model
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from collections import Counter
from scipy.stats import chi2
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler


In [3]:
def readCsv(x):
  """Parse file."""
  #colNames = ["method", "benchmark", "start", "end", "time", "memory"]
  df = pd.read_csv(x, sep=",")

  return df

def preprocessDf(df):
  """Transform ligand and receptor columns."""
  df["ligand-receptor"] = df["ligand"] + '-' + df["receptor"]
  df["component"] = df["component"] #.astype(str).str.zfill(2)

  return df

In [4]:
# Load subtype label
# subtype_label_file='/Users/victoriagao/local_docs/schwartz_data/PDAC_64630_subtype.csv'
subtype_label_file='/Users/victoriagao/Documents/MSc/Schwartz_lab/experiment_data/Deisha/PDAC_64630/fractional_abundances_by_spot.csv'
subtype_abundance_df = readCsv(subtype_label_file)
# subtype_label=[]
# with open(subtype_label_file) as file:
#     csv_file = csv.reader(file, delimiter=",")
#     for line in csv_file:
#         subtype_label.append(line)

# barcode_subtype=dict()
# for i in range(1,len(subtype_label)):
#     barcode_subtype[subtype_label[i][0]]= subtype_label[i][1]

# Load NEST output 
df = readCsv("/Users/victoriagao/local_docs/NEST/output/From_Fatema/NEST_combined_output_PDAC_64630.csv")
output_processed = preprocessDf(df)

### Build feature matrix

In [5]:
### Merge NEST output with subtype label, and filter out the spots that are not in the subtype label
matched_spots_df = pd.merge(output_processed, subtype_abundance_df, left_on='from_cell', right_on='SpotID') # Change from_cell to to_cell if interested in the receptors

In [6]:
# filter out the LR that only appeared once
matched_spots_df = matched_spots_df[matched_spots_df['ligand-receptor'].duplicated(keep=False)] 
# Take only top 90% LR by frequency
lr_counts = matched_spots_df['ligand-receptor'].value_counts()
threshold = lr_counts.quantile(0.50)  # gives the value at the 50th percentile
top_percent_lrs = lr_counts[lr_counts >= threshold].index
matched_spots_df = matched_spots_df[matched_spots_df['ligand-receptor'].isin(top_percent_lrs)]
# Delete some columns
matched_spots_df = matched_spots_df.drop(columns=['to_cell', 'ligand', 'receptor', 'attention_score', 'component', 'from_id','to_id','SpotID'])

In [7]:
matched_spots_df

Unnamed: 0,from_cell,ligand-receptor,Endothelial,ClassicA,Fibroblast,Duct,Endocrine,Mixed,BasalB,NKCell,Acinar,BasalA,BPlasmaCell,ClassicB,TCell,Macrophage,Mast
0,ACGCGCTACACAGGGT-1,LGALS3-NPTN,0.000000,0.00,0.000000,0.0,0.0,0.200000,0.000000,0.00,0.0,0.0,0.20,0.0,0.0,0.600000,0.0
1,TCTTACTTATGCCTCT-1,FN1-RPSA,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,1.00,0.0,0.0,0.000000,0.0
2,AGTCTCACAAGACTAC-1,PTPRF-RACK1,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,1.00,0.0,0.0,0.000000,0.0
3,GTGGGCTTAGACACAC-1,FN1-RPSA,0.000000,0.00,1.000000,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,0.00,0.0,0.0,0.000000,0.0
4,CCACAGTACCCATCCT-1,FN1-RPSA,0.000000,0.25,0.000000,0.0,0.0,0.000000,0.000000,0.25,0.0,0.0,0.50,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085,AAGAGGATGTACGCGA-1,LAMC2-ITGA6,0.000000,0.00,0.000000,0.0,0.0,0.285714,0.571429,0.00,0.0,0.0,0.00,0.0,0.0,0.142857,0.0
1087,AACGTCAGACTAGTGG-1,TGFB1-ITGB5,0.000000,0.00,0.750000,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,0.25,0.0,0.0,0.000000,0.0
1088,AACGTCAGACTAGTGG-1,TGFB1-EGFR,0.000000,0.00,0.750000,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,0.25,0.0,0.0,0.000000,0.0
1089,AGATTATAGGACGTTT-1,TGFB1-ITGB5,0.000000,0.00,0.200000,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,0.00,0.0,0.6,0.000000,0.2


In [8]:
len(matched_spots_df['ligand-receptor'].unique())


55

### Fit logistic regression and outputs a big coefficient table for all regressions

In [9]:
unique_lr_pairs = matched_spots_df['ligand-receptor'].unique() # Get unique ligand-receptor pairs

##### Run this if want to fit without splitting data

In [10]:
results = []

# Iterate through each unique ligand-receptor pair
for lr_pair in unique_lr_pairs:
    # Prepare the feature matrix X and the target vector y
    # get all the columns from matched_spots_df except for the 'from_cell', 'edge_rank' and 'ligand-receptor'
    X_log_reg = matched_spots_df.drop(columns=["from_cell", "edge_rank", "ligand-receptor"])
    y_binary = ["yes" if lr == lr_pair else "no" for lr in matched_spots_df["ligand-receptor"]]
    # print dimensions of X and y
    # print(X_log_reg.shape, len(y_binary))

    # print(Counter(y_binary))
    
    # Build and fit the logistic model
    model_log_reg = linear_model.LogisticRegression(solver='lbfgs')
    model_log_reg.fit(X_log_reg, y_binary)
    
    # Extract coefficients and score
    coef = model_log_reg.coef_[0]  # Coefficients for the features
    score = model_log_reg.score(X_log_reg, y_binary)  # Accuracy score for how the model is fitted
    
    # Append the results (including LR pair, coefficients, and score) to our results list
    results.append([lr_pair] + list(coef) + [score])

# Define the column names for our results DataFrame
columns = ['Ligand-Receptor'] + X_log_reg.columns.tolist() + ['Accuracy Score']

: 

##### Run this if want to split data before fitting

In [43]:
results = []

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)

# Iterate through each unique ligand-receptor pair
for lr_pair in unique_lr_pairs:
    # Prepare the feature matrix X and the target vector y
    # get all the columns from matched_spots_df except for the 'from_cell', 'edge_rank' and 'ligand-receptor'
    X_log_reg = matched_spots_df.drop(columns=["from_cell", "ligand-receptor"])
    y_binary = ["yes" if lr == lr_pair else "no" for lr in matched_spots_df["ligand-receptor"]]

    # Randomly undersample the majority class
    rus = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = rus.fit_resample(X_log_reg, y_binary)
    # print(f"original x_log_reg: {X_log_reg.iloc[np.array(y_binary) == 'yes', :]}")
    # print(f"resampled x_log_reg: {X_resampled.iloc[np.array(y_resampled) == 'yes', :]}")
    # print(Counter(y_resampled))
    
    # Extract coefficients and score
    coef = model_log_reg.coef_[0]  # Coefficients for the features

    # Calculate accuracy score
    n_scores = cross_val_score(model_log_reg, X_resampled, y_resampled, scoring='accuracy', cv=cv, n_jobs=-1)
    score = np.mean(n_scores)

    # Calculate sensitivity
    y_pred = model_log_reg.predict(X_resampled)
    tn, fp, fn, tp = confusion_matrix(y_resampled, y_pred).ravel()
    sensitivity = tp / (tp + fn)

    # Use f string to print the sensitivity
    # print(f"Sensitivity for {lr_pair} :{sensitivity}")
    # print(classification_report(y_test, y_pred))
    
    # Append the results (including LR pair, coefficients, and score) to our results list
    results.append([lr_pair] + list(coef) + [score] + [sensitivity])

# Define the column names for our results DataFrame
columns = ['Ligand-Receptor'] + X_log_reg.columns.tolist() + ['Accuracy Score'] + ['Sensitivity']

In [44]:
# Create a DataFrame from our results
results_df = pd.DataFrame(results, columns=columns)
results_df

Unnamed: 0,Ligand-Receptor,Endothelial,ClassicA,Fibroblast,Duct,Endocrine,Mixed,BasalB,NKCell,Acinar,BasalA,BPlasmaCell,ClassicB,TCell,Macrophage,Mast,Accuracy Score,Sensitivity
0,LGALS3-NPTN,0.0,0.871881,-0.346872,-0.305838,-0.175122,0.07996,0.41129,0.0,0.0,0.0,-0.490652,0.0,0.0,0.0,-0.043691,0.558929,0.166667
1,FN1-RPSA,0.0,0.871881,-0.346872,-0.305838,-0.175122,0.07996,0.41129,0.0,0.0,0.0,-0.490652,0.0,0.0,0.0,-0.043691,0.743162,0.0
2,PTPRF-RACK1,0.0,0.871881,-0.346872,-0.305838,-0.175122,0.07996,0.41129,0.0,0.0,0.0,-0.490652,0.0,0.0,0.0,-0.043691,0.711429,0.134615
3,INS-HLA,0.0,0.871881,-0.346872,-0.305838,-0.175122,0.07996,0.41129,0.0,0.0,0.0,-0.490652,0.0,0.0,0.0,-0.043691,0.55,0.0
4,TIMP1-LRP1,0.0,0.871881,-0.346872,-0.305838,-0.175122,0.07996,0.41129,0.0,0.0,0.0,-0.490652,0.0,0.0,0.0,-0.043691,0.722727,0.037037
5,THBS1-SDC1,0.0,0.871881,-0.346872,-0.305838,-0.175122,0.07996,0.41129,0.0,0.0,0.0,-0.490652,0.0,0.0,0.0,-0.043691,0.742727,0.0
6,MIF-CD74,0.0,0.871881,-0.346872,-0.305838,-0.175122,0.07996,0.41129,0.0,0.0,0.0,-0.490652,0.0,0.0,0.0,-0.043691,0.8,0.0
7,LGALS3-ITGB4,0.0,0.871881,-0.346872,-0.305838,-0.175122,0.07996,0.41129,0.0,0.0,0.0,-0.490652,0.0,0.0,0.0,-0.043691,0.59305,0.329897
8,RPS19-RPSA,0.0,0.871881,-0.346872,-0.305838,-0.175122,0.07996,0.41129,0.0,0.0,0.0,-0.490652,0.0,0.0,0.0,-0.043691,0.74375,0.05
9,PTPRF-TGFBR2,0.0,0.871881,-0.346872,-0.305838,-0.175122,0.07996,0.41129,0.0,0.0,0.0,-0.490652,0.0,0.0,0.0,-0.043691,0.508333,0.666667


In [45]:
# Save the results to a CSV file
results_df.to_csv("/Users/victoriagao/local_docs/NEST/stored_variables/Celltype_LR_invidual_LogisticRegressions/logistic_regression_results_undersampled.csv", index=False)


#### Under construction: try modularizing my code

In [14]:
# Define functions for bootstrapping and calculating p-values

def fit_model(X, y):
    model = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs')
    model.fit(X, y)
    return model

def bootstrap_coefficients(X, y, n_iterations, model):
    n_classes, n_features = model.coef_.shape
    bootstrap_coefs = np.zeros((n_iterations, n_classes, n_features))
    for i in range(n_iterations):
        try:
            X_sample, y_sample = resample(X, y)
            model.fit(X_sample, y_sample)
            if model.coef_.shape == (n_classes, n_features):
                bootstrap_coefs[i] = model.coef_
        except ValueError as e:
            continue
    return np.std(bootstrap_coefs, axis=0)

def calculate_p_values(weights, bootstrap_standard_errors):
    wald_stats = (weights / bootstrap_standard_errors) ** 2
    p_values = 1 - chi2.cdf(wald_stats, 1)
    return p_values


In [17]:
def analyze_lr_pairs(matched_spots_df, n_iterations=100):
    results = []
    unique_lr_pairs = matched_spots_df['ligand-receptor'].unique()

    for lr_pair in unique_lr_pairs:
        X = matched_spots_df[['Endothelial', 'ClassicA', 'Fibroblast', 'Duct', 'Endocrine', 'Mixed', 'BasalB', 'NKCell', 'Acinar', 'BasalA', 'BPlasmaCell', 'ClassicB', 'TCell', 'Macrophage', 'Mast']]
        y = ["yes" if lr == lr_pair else "no" for lr in matched_spots_df["ligand-receptor"]]

        model = fit_model(X, y)
        bootstrap_se = bootstrap_coefficients(X, y, n_iterations, model)
        for index, coefs in enumerate(model.coef_):
            p_values = calculate_p_values(coefs, bootstrap_se[index])
            score = model.score(X, y)
            results.append([lr_pair] + list(coefs) + [score] + list(p_values))

    columns = ['Ligand-Receptor'] + X.columns.tolist() + ['Accuracy Score'] + [f'P-value {col}' for col in X.columns]
    results_df = pd.DataFrame(results, columns=columns)
    return results_df


In [18]:
results_df_new = analyze_lr_pairs(matched_spots_df)

In [19]:
results_df_new

Unnamed: 0,Ligand-Receptor,Endothelial,ClassicA,Fibroblast,Duct,Endocrine,Mixed,BasalB,NKCell,Acinar,...,P-value Mixed,P-value BasalB,P-value NKCell,P-value Acinar,P-value BasalA,P-value BPlasmaCell,P-value ClassicB,P-value TCell,P-value Macrophage,P-value Mast
0,FN1-RPSA,-0.246528,-0.9187,0.949023,-0.823555,-0.676334,-0.361244,-0.529071,1.159686,-0.076409,...,0.131103,0.01226625,0.001893,0.108065,3.625093e-08,4e-05,0.173974,0.859568,0.742691,0.01971583
1,PTPRF-RACK1,-0.137359,-0.239398,-0.325285,0.994843,-0.6637,-1.105384,0.843249,-0.197224,-0.064456,...,9.193202e-12,0.003836668,0.297803,0.282981,8.375497e-05,0.01177,1.0,0.657815,4e-06,6.466331e-06
2,TIMP1-LRP1,-0.180386,0.094087,0.981884,0.255891,0.575596,-0.34013,-0.327194,-0.063142,-0.034608,...,0.000276397,0.0008411175,0.075084,0.228865,0.007928784,0.506251,1.0,0.001229,0.787127,0.4445038
3,THBS1-SDC1,0.439351,-0.767361,0.820315,-0.35928,-0.633143,-0.408185,-0.452881,0.847279,-0.022491,...,1.051594e-05,0.09457226,0.044472,0.966669,0.005883576,0.152845,1.0,0.435368,0.304081,0.5772902
4,LGALS3-ITGB4,-0.320351,0.624395,-0.047879,-0.983041,0.859368,0.478007,0.47692,-0.001141,-0.112969,...,0.09123632,0.02148718,0.995757,0.700593,0.0001401777,0.290858,0.022763,0.776207,0.550541,0.4548368
5,RPS19-RPSA,-0.197531,-0.270036,-0.001666,-0.726819,1.168674,-0.860237,-0.197246,0.140034,-0.182711,...,6.445898e-09,0.4501153,0.616239,0.588677,0.0008058446,0.001133,1.0,0.165326,0.183259,0.1039829
6,TGFB1-ITGB5,0.675108,-1.270283,0.53246,-0.697627,0.558183,-0.360938,-0.538984,0.047897,-0.069051,...,0.1052321,0.008819415,0.858358,0.46298,5.741115e-05,0.045371,1.0,0.084593,0.749428,0.113163
7,PTPRF-MET,0.19091,0.900476,-1.071002,0.34785,-0.195013,0.445473,-0.935548,-0.173254,-0.144701,...,0.1712065,0.001277639,0.000161,0.004669,0.4127599,0.000943,0.002392,0.121672,0.205158,5.737387e-09
8,PLXNB2-MET,-0.090755,1.296295,-0.495131,1.031294,-0.807282,1.137403,-0.124857,0.008527,-0.03018,...,2.016828e-05,0.5989457,0.950064,0.520837,0.3443264,0.0,0.019516,0.0,0.008194,0.7071686
9,LAMC2-ITGB4,0.724975,-0.78355,-0.698463,-0.199247,0.233563,0.38891,1.305833,-0.194121,-0.224106,...,0.1600025,2.063507e-07,0.002455,0.001147,1.495776e-06,0.147222,0.219109,0.000968,0.258988,0.09974174


In [21]:
results_df_new.to_csv(f"/Users/victoriagao/local_docs/NEST/stored_variables/LR_invidual_LogisticRegressions/big_regression_table.csv")