# Clean MNL script combining multiple PDAC samples
### Quantifies the association between each ligand-receptor (LR) pair and the four subtypes


In [2]:
import numpy as np
import csv
import pickle
import matplotlib
import math
import pandas as pd
import matplotlib

In [3]:
def readCsv(x):
  """Read a comma-separated file into a pandas DataFrame.

  Parameters
  ----------
  x : path or file-like object
      Anything ``pandas.read_csv`` accepts as its first argument.

  Returns
  -------
  pandas.DataFrame
      The parsed table; the first row of the file supplies the column names.
  """
  return pd.read_csv(x, sep=",")

def preprocessDf(df):
  """Add a combined ``ligand-receptor`` label column to a NEST output frame.

  The frame is modified in place and the very same object is returned, so the
  caller's variable and the return value refer to one DataFrame.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain ``ligand``, ``receptor`` and ``component`` columns.
      ``component`` is only checked for presence; its values are not changed.

  Returns
  -------
  pandas.DataFrame
      ``df`` itself, with ``ligand-receptor`` set to ``"<ligand>-<receptor>"``.

  Raises
  ------
  KeyError
      If any required column is absent. The check runs before anything is
      written, so a failed call leaves ``df`` untouched.
  """
  required = ("ligand", "receptor", "component")
  missing = [col for col in required if col not in df.columns]
  if missing:
    raise KeyError(f"preprocessDf: missing required column(s): {missing}")

  df["ligand-receptor"] = df["ligand"] + '-' + df["receptor"]

  return df

In [26]:
# Input locations. Every path below is built from these roots, so moving the
# data only requires editing this one place.
NEST_DIR = "/Users/victoriagao/local_docs/NEST"
STORED_VARIABLES_DIR = NEST_DIR + "/stored_variables"
NEST_OUTPUT_DIR = NEST_DIR + "/output/From_Fatema"
SUBTYPE_LABEL_DIR = "/Users/victoriagao/local_docs/schwartz_data"

# Load gene_ids: one entry per line, with surrounding whitespace (including the
# trailing newline) stripped.
with open(STORED_VARIABLES_DIR + "/gene_ids.txt", 'r') as file:
    gene_ids = [line.strip() for line in file]

# Load coordinates
coordinates = np.load(STORED_VARIABLES_DIR + "/coordinates.npy")

# Load cell_barcode
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point this at files produced by a trusted pipeline.
with open(STORED_VARIABLES_DIR + "/cell_barcode.pkl", 'rb') as file:
    cell_barcode = pickle.load(file)

# # Load connected_components (assume already have it)
# # by subtype
# with open("/Users/victoriagao/local_docs/NEST/stored_variables/PDAC_connected_subtypes/PDAC_64630_filtered_connected_subtype_components.pkl", 'rb') as file:
#     filtered_connected_components = pickle.load(file)

# Load subtype label. The csv module expects the file to be opened with
# newline='' so that it can do its own newline handling.
subtype_label_file = SUBTYPE_LABEL_DIR + "/PDAC_64630_subtype.csv"
with open(subtype_label_file, newline='') as file:
    subtype_label = list(csv.reader(file, delimiter=","))

# Map first column -> second column, skipping the header row. Rows with fewer
# than two fields (csv.reader returns [] for a blank line) are ignored instead
# of raising IndexError. If a key repeats, the last row wins.
barcode_subtype = {
    row[0]: row[1]
    for row in subtype_label[1:]
    if len(row) >= 2
}

# NEST output files, one per sample.
filenames = [
    NEST_OUTPUT_DIR + "/exp2_B1_NEST_combined_rank_product_output_PDAC_130355_B1_top20percent.csv",
    NEST_OUTPUT_DIR + "/exp2_A1_NEST_combined_rank_product_output_PDAC_130355_A1_top20percent.csv",
    NEST_OUTPUT_DIR + "/exp1_C1_NEST_combined_rank_product_output_PDAC_140694_top20percent.csv",
    NEST_OUTPUT_DIR + "/NEST_combined_output_PDAC_64630.csv",
]

# Read each file into its own DataFrame, then stack them row-wise with a fresh
# 0..n-1 index.
dfs = []
for filename in filenames:
    df = pd.read_csv(filename, sep=",")
    dfs.append(df)

# NOTE(review): the file of origin is not recorded on the rows, and the subtype
# labels above come from a single PDAC_64630 file. If the same barcode string
# can occur in more than one sample, later per-barcode grouping will merge
# them - TODO confirm this is intended.
combined_df = pd.concat(dfs, ignore_index=True)
# csv_record_final = combined_df.values.tolist()
# df_column_names = list(combined_df.columns)
# csv_record_final = [df_column_names] + csv_record_final

# Preprocess NEST output df. preprocessDf writes the new column into the frame
# it is given and returns that same frame, so combined_df and df_processed are
# one object from here on.
df_processed = preprocessDf(combined_df)


In [27]:
# Display the concatenated NEST output. It already carries the
# "ligand-receptor" column because preprocessDf added it to this frame in place.
combined_df

Unnamed: 0,from_cell,to_cell,ligand,receptor,attention_score,component,from_id,to_id,ligand-receptor
0,ATTATACTTTGCTCGT-1,TTAATCAGTACGTCAG-1,TGFB1,ITGB5,0.978623,-1,327,1279,TGFB1-ITGB5
1,ATTCAGGACCTATTTC-1,CTAGCCGATGTTATGA-1,TGFB1,ITGB6,0.978623,-1,330,608,TGFB1-ITGB6
2,ATTATGCCATAGGGAG-1,CTTGTACTTGTTGACT-1,TGFB1,ITGB5,0.978622,-1,328,673,TGFB1-ITGB5
3,GCGGGAACCAGGCCCT-1,CTTGTACTTGTTGACT-1,TGFB1,ITGB5,0.978622,-1,811,673,TGFB1-ITGB5
4,ACGCCAGATGATTTCT-1,TTAATCAGTACGTCAG-1,TGFB1,ITGB5,0.978622,-1,123,1279,TGFB1-ITGB5
...,...,...,...,...,...,...,...,...,...
1360309,AACGTCAGACTAGTGG-1,TTGGCTCGCATGAGAC-1,TGFB1,EGFR,0.839991,17,31,1389,TGFB1-EGFR
1360310,AGATTATAGGACGTTT-1,TTGTAATCCGTACTCG-1,TGFB1,ITGB5,0.829314,9,184,1394,TGFB1-ITGB5
1360311,AGATTATAGGACGTTT-1,TTGTAATCCGTACTCG-1,TGFB1,SDC2,0.855152,9,184,1394,TGFB1-SDC2
1360312,GAGAGGTGCATTCTGG-1,TTGTTTCCATACAACT-1,TGFB1,EGFR,0.830097,2,715,1404,TGFB1-EGFR


In [28]:
# Display the preprocessed frame. preprocessDf returns the frame it was given,
# so this is the same object as combined_df and shows the same contents.
df_processed

Unnamed: 0,from_cell,to_cell,ligand,receptor,attention_score,component,from_id,to_id,ligand-receptor
0,ATTATACTTTGCTCGT-1,TTAATCAGTACGTCAG-1,TGFB1,ITGB5,0.978623,-1,327,1279,TGFB1-ITGB5
1,ATTCAGGACCTATTTC-1,CTAGCCGATGTTATGA-1,TGFB1,ITGB6,0.978623,-1,330,608,TGFB1-ITGB6
2,ATTATGCCATAGGGAG-1,CTTGTACTTGTTGACT-1,TGFB1,ITGB5,0.978622,-1,328,673,TGFB1-ITGB5
3,GCGGGAACCAGGCCCT-1,CTTGTACTTGTTGACT-1,TGFB1,ITGB5,0.978622,-1,811,673,TGFB1-ITGB5
4,ACGCCAGATGATTTCT-1,TTAATCAGTACGTCAG-1,TGFB1,ITGB5,0.978622,-1,123,1279,TGFB1-ITGB5
...,...,...,...,...,...,...,...,...,...
1360309,AACGTCAGACTAGTGG-1,TTGGCTCGCATGAGAC-1,TGFB1,EGFR,0.839991,17,31,1389,TGFB1-EGFR
1360310,AGATTATAGGACGTTT-1,TTGTAATCCGTACTCG-1,TGFB1,ITGB5,0.829314,9,184,1394,TGFB1-ITGB5
1360311,AGATTATAGGACGTTT-1,TTGTAATCCGTACTCG-1,TGFB1,SDC2,0.855152,9,184,1394,TGFB1-SDC2
1360312,GAGAGGTGCATTCTGG-1,TTGTTTCCATACAACT-1,TGFB1,EGFR,0.830097,2,715,1404,TGFB1-EGFR


### Make count matrix

In [29]:
# Build a barcode x ligand-receptor count table.
# Each row of df_processed contributes twice: once under its from_cell barcode
# and once under its to_cell barcode (relabelled as 'from_cell' so both halves
# share one column name and can be stacked).
lr_col = 'ligand-receptor'
senders = df_processed[['from_cell', lr_col]]
receivers = df_processed[['to_cell', lr_col]].rename(columns={'to_cell': 'from_cell'})
df_long = pd.concat([senders, receivers])

# Count rows per (barcode, ligand-receptor) pair, then pivot the pair labels
# into columns; combinations that never occur are filled with 0.
pair_sizes = df_long.groupby(['from_cell', lr_col]).size()
df_counts = pair_sizes.unstack(fill_value=0)

# X is another name for the same count table, not a copy.
X = df_counts

In [30]:
# Display the count table: one row per barcode (the index is labelled
# 'from_cell' but also includes barcodes that appeared as to_cell), one column
# per ligand-receptor label.
X

ligand-receptor,A2M-LRP1,A2M-TNFRSF14,A2M-TYRO3,ADAM12-ITGA9,ADAM12-SDC4,ADAM15-ITGA5,ADAM15-ITGA9,ADAM17-GHR,ADAM17-IL1R2,ADAM17-IL6R,...,XCL2-TACR2,XCL2-TBXA2R,XCL2-XCR1,YARS-EGFR,YARS-RPSA,YARS-SORT1,YARS-VCAM1,ZP3-MERTK,ZP3-PCDH17,no-ligand-no-receptor
from_cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACAAGTATCTCCCA-1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACACCAATAACTGC-1,0,0,0,4,0,0,4,0,0,0,...,0,0,0,0,4,0,0,0,0,0
AAACAGGGTCTATATT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACATTTCCCGGATT-1,3,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCGGGTAGGTACC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTCAGTGTGCTAC-1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTGTTGTGTGTCAAGA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
TTGTTTCACATCCAGG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTGTTTCATTAGTCTA-1,3,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Just testing random things out

In [31]:
# Sanity check: how many distinct barcodes appear as either endpoint
# (from_cell or to_cell) in df_processed?
# To run this on a single sample instead, rebuild df_processed from that
# sample's CSV first, e.g.:
# df = readCsv("/Users/victoriagao/local_docs/NEST/output/From_Fatema/exp1_C1_NEST_combined_rank_product_output_PDAC_140694_top20percent.csv")
# df_processed = preprocessDf(df)

endpoint_columns = [df_processed[col] for col in ('from_cell', 'to_cell')]
all_values = pd.concat(endpoint_columns)
num_unique_values = all_values.nunique()
num_unique_values

2624