# Clean MNL script combining multiple PDAC samples
### Quantifies the association between each ligand-receptor (LR) pair and the four subtypes
### ***Need to make this script more modular


In [None]:
import numpy as np
import csv
import pickle
import matplotlib
import math
import pandas as pd
import matplotlib
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression

In [None]:
def readCsv(x):
    """Load a comma-separated file into a pandas DataFrame.

    Args:
        x: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The parsed DataFrame.
    """
    return pd.read_csv(x, sep=",")

def preprocessDf(df):
    """Add a combined ``ligand-receptor`` column to ``df``.

    The new column is the ``ligand`` and ``receptor`` values joined with a
    hyphen. The frame is modified in place and also returned. Any existing
    ``component`` column is left untouched; the previous self-assignment of
    that column was a no-op and has been removed, so frames without a
    ``component`` column are accepted as well.

    Args:
        df: DataFrame with string ``ligand`` and ``receptor`` columns.

    Returns:
        The same DataFrame object, with the ``ligand-receptor`` column added.

    Raises:
        KeyError: If ``ligand`` or ``receptor`` is missing.
    """
    df["ligand-receptor"] = df["ligand"] + '-' + df["receptor"]
    return df

In [None]:
# Load the per-sample subtype labels and NEST outputs.
#
# Barcodes from both sources are made unique by appending "-<file_index>".
# That suffix is the only thing tying a NEST barcode to its subtype label, so
# the label files and the NEST files MUST enumerate the samples in the same
# order. Previously the two lists were ordered differently, so each sample's
# NEST barcodes received a suffix that belonged to another sample's labels.
# Keeping each sample's two files side by side makes the pairing explicit.
# NOTE(review): the pairing below is inferred from the sample names in the
# paths (PDAC_64630, exp1_C1, exp2_A1, exp2_B1) -- confirm.
sample_files = [
    # (subtype label file, NEST output file)
    (
        '/Users/victoriagao/local_docs/schwartz_data/PDAC_64630_subtype.csv',
        "/Users/victoriagao/local_docs/NEST/output/From_Fatema/NEST_combined_output_PDAC_64630.csv",
    ),
    (
        '/Users/victoriagao/Documents/MSc/Schwartz_lab/experiment_data/Deisha/exp1_C1/cell_type_prediction.csv',
        "/Users/victoriagao/local_docs/NEST/output/From_Fatema/exp1_C1_top20percent.csv",
    ),
    (
        '/Users/victoriagao/Documents/MSc/Schwartz_lab/experiment_data/Deisha/exp2_A1/cell_type_prediction.csv',
        "/Users/victoriagao/local_docs/NEST/output/From_Fatema/exp2_A1_top20percent.csv",
    ),
    (
        '/Users/victoriagao/Documents/MSc/Schwartz_lab/experiment_data/Deisha/exp2_B1/cell_type_prediction.csv',
        "/Users/victoriagao/local_docs/NEST/output/From_Fatema/exp2_B1_top20percent.csv",
    ),
]

# Both lists are derived from the pairs above so their order cannot drift apart.
subtype_label_files = [label_file for label_file, _ in sample_files]
filenames = [nest_file for _, nest_file in sample_files]

# Maps "<barcode>-<file_index>" to the subtype label in the second CSV column.
barcode_subtype_combined = {}

for file_index, subtype_label_file in enumerate(subtype_label_files, start=1):
    # newline="" is what the csv module expects for files it reads.
    with open(subtype_label_file, newline="") as file:
        csv_file = csv.reader(file, delimiter=",")
        next(csv_file, None)  # Skip the first (header) row; tolerate an empty file
        for line in csv_file:
            # Add a file-unique identifier to the barcode before storing it in the dictionary
            unique_barcode = f"{line[0]}-{file_index}"
            barcode_subtype_combined[unique_barcode] = line[1]


# Load each NEST output file and tag its barcodes with the same per-sample suffix.
dfs = []
for file_index, filename in enumerate(filenames, start=1):
    df = pd.read_csv(filename, sep=",")

    # Add file-unique identifier to the barcodes in "from_cell" and "to_cell" columns
    if "from_cell" in df.columns:
        df["from_cell"] = df["from_cell"].astype(str) + f"-{file_index}"
    if "to_cell" in df.columns:
        df["to_cell"] = df["to_cell"].astype(str) + f"-{file_index}"

    dfs.append(df)

# One frame holding the interactions of every sample, with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

# Adds the combined "ligand-receptor" column.
df_processed = preprocessDf(combined_df)


In [None]:
# Drop ligand-receptor pairs that occur in only a single row.
lr_is_repeated = df_processed['ligand-receptor'].duplicated(keep=False)
df_processed_filtered = df_processed[lr_is_repeated]

### Optional further pruning: keep only the pairs whose row count is at or
### above the 90th percentile of all per-pair row counts.
lr_counts = df_processed_filtered['ligand-receptor'].value_counts()
threshold = lr_counts.quantile(0.90)
top_percent_lrs = lr_counts.loc[lr_counts >= threshold].index
is_top_lr = df_processed_filtered['ligand-receptor'].isin(top_percent_lrs)
df_top_percent = df_processed_filtered[is_top_lr]

In [None]:
# Keep only interactions whose sender and receiver barcodes both have a subtype label.
labelled_barcodes = set(barcode_subtype_combined)
sender_is_labelled = df_top_percent["from_cell"].isin(labelled_barcodes)
receiver_is_labelled = df_top_percent["to_cell"].isin(labelled_barcodes)
LR_df_filtered = df_top_percent.loc[sender_is_labelled & receiver_is_labelled]

### Make feature matrix

In [None]:
# Count, per spot barcode, how often each ligand-receptor pair involves that
# spot. Every interaction row contributes twice: once for its sender
# ("from_cell") and once for its receiver ("to_cell").
sender_rows = LR_df_filtered[['from_cell', 'ligand-receptor']]
receiver_rows = LR_df_filtered[['to_cell', 'ligand-receptor']].rename(columns={'to_cell': 'from_cell'})
df_long = pd.concat([sender_rows, receiver_rows])

# Rows: spot barcodes; columns: ligand-receptor pairs; missing combinations are 0.
pair_counts = df_long.groupby(['from_cell', 'ligand-receptor']).size()
df_counts = pair_counts.unstack(fill_value=0).rename_axis('spot_barcode', axis='index')


# Feature matrix (same object as df_counts)
X = df_counts

In [None]:
# Target vector: the subtype label of each row of the feature matrix, in row order.
barcodes_in_X = list(df_counts.index)
y = pd.Series(barcodes_in_X).map(barcode_subtype_combined)

### Define model
### Fit model
### Calc p-value

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from scipy.stats import chi2

def fit_model(X, y, max_iter=1000):
    """Fit a multinomial logistic regression of ``y`` on ``X``.

    Args:
        X: Feature matrix (rows are samples, columns are features).
        y: Class label for each row of ``X``.
        max_iter: Iteration cap passed to the lbfgs solver. The library
            default of 100 was hitting the limit on this data (see the
            ConvergenceWarning in the notebook output), which leaves the
            coefficients un-converged. Raise it further, or scale the
            features, if the warning persists.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    model = LogisticRegression(
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=max_iter,
    )
    model.fit(X, y)
    return model

def bootstrap_standard_errors(X, y, n_iterations, model):
    """Estimate coefficient standard errors by bootstrap resampling.

    Each iteration draws a resample of ``(X, y)`` with ``resample`` and fits a
    fresh, unfitted copy of ``model`` on it. The passed-in ``model`` is never
    refit, so its ``coef_`` and ``classes_`` still describe the fit the caller
    made. (Refitting the caller's estimator in place used to leave it holding
    the coefficients of the last resample.)

    Only iterations that fit successfully and produce a coefficient matrix of
    the same shape as ``model.coef_`` contribute to the estimate. Failed or
    mismatched iterations are dropped instead of being counted as all-zero
    coefficients, which would distort the standard deviation.

    Args:
        X: Feature matrix used to fit ``model``.
        y: Class labels aligned with the rows of ``X``.
        n_iterations: Number of bootstrap resamples to draw.
        model: An already fitted scikit-learn estimator exposing ``coef_``.

    Returns:
        Array with the same shape as ``model.coef_`` holding the standard
        deviation of each coefficient across the usable resamples. All
        entries are NaN when no resample was usable.
    """
    # Local import keeps this block self-contained; sklearn is already a
    # dependency of this file.
    from sklearn.base import clone

    expected_shape = model.coef_.shape
    usable_coefs = []
    for _ in range(n_iterations):
        X_sample, y_sample = resample(X, y)
        replicate = clone(model)  # same hyper-parameters, no fitted state
        try:
            replicate.fit(X_sample, y_sample)
        except ValueError:
            # The fit rejected this resample; skip it.
            continue
        # A coefficient matrix of a different shape is not comparable
        # element-wise with the others, so it is skipped too.
        if replicate.coef_.shape == expected_shape:
            usable_coefs.append(replicate.coef_)

    if not usable_coefs:
        return np.full(expected_shape, np.nan)
    return np.std(np.stack(usable_coefs), axis=0)

def calculate_p_values(weights, bootstrap_standard_errors):
    """Compute Wald-test p-values for coefficients.

    The Wald statistic ``(weight / standard_error) ** 2`` is compared against
    a chi-squared distribution with one degree of freedom.

    The upper tail is taken with ``chi2.sf`` rather than ``1 - chi2.cdf``:
    for large statistics the CDF rounds to 1.0 and the subtraction collapses
    to exactly 0, whereas the survival function keeps the small p-value.

    Args:
        weights: Coefficient estimates (array-like).
        bootstrap_standard_errors: Standard errors, broadcastable against
            ``weights``. A zero standard error yields ``inf``/``nan``
            statistics, as plain NumPy division does.

    Returns:
        NumPy array of p-values with the broadcast shape of the inputs.
    """
    weights = np.asarray(weights, dtype=float)
    standard_errors = np.asarray(bootstrap_standard_errors, dtype=float)
    wald_stats = (weights / standard_errors) ** 2
    p_values = chi2.sf(wald_stats, 1)
    return p_values

def calculate_confidence_intervals(bootstrap_coefs, confidence_level=0.95):
    """Return percentile confidence bounds taken along the first axis.

    Args:
        bootstrap_coefs: Array whose first axis indexes bootstrap replicates.
        confidence_level: Central coverage of the interval, as a fraction.

    Returns:
        Array whose first axis has length 2 (lower bound, then upper bound),
        followed by the remaining axes of ``bootstrap_coefs``.
    """
    # Split the excluded probability mass equally between the two tails.
    percentile_bounds = [
        (1 - confidence_level) / 2.0 * 100,
        (1 + confidence_level) / 2.0 * 100,
    ]
    return np.percentile(bootstrap_coefs, percentile_bounds, axis=0)



In [None]:
# Fit on the full data set (X and y come from the cells above).
model = fit_model(X, y)

# Capture the full-data estimates BEFORE bootstrapping: the bootstrap is handed
# this same estimator, and if it refits it, coef_/classes_ would afterwards
# describe the last resample rather than the full data.
class_names = model.classes_
full_data_coefs = model.coef_.copy()

# Stored under its own name so the bootstrap_standard_errors function is not
# overwritten by its result (which made this cell fail when run a second time).
bootstrap_ses = bootstrap_standard_errors(X, y, 1000, model)

feature_names = df_counts.columns
coefficients_dfs = {}
output_dir = "/Users/victoriagao/local_docs/NEST/stored_variables/MNL_subtype_coefficients"

for index, class_name in enumerate(class_names):
    coefs = full_data_coefs[index]
    p_values = calculate_p_values(coefs, bootstrap_ses[index])

    # One row per ligand-receptor feature: its coefficient for this class and the p-value
    class_df = pd.DataFrame({
        "Coefficient": coefs,
        "p_Value": p_values
    }, index=feature_names)

    # Sort by the absolute values of the coefficients, largest first
    class_df = class_df.reindex(class_df.Coefficient.abs().sort_values(ascending=False).index)

    # Keep the table in memory, keyed by class
    coefficients_dfs[class_name] = class_df

    # Save to CSV, one file per class
    class_df.to_csv(f"{output_dir}/new_aggregated_coefficients_pvalues_for_{class_name}.csv")



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt