In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

In [2]:
# Load the per-model gene matrix; the unnamed first CSV column is renamed to ModelID.
gene_effect_df = pd.read_csv("GeneExpression.csv").rename(columns={'Unnamed: 0': 'ModelID'})
model_df = pd.read_csv("Model.csv")

# Attach model metadata. The inner join keeps only ModelIDs present in both files.
merged_df = pd.merge(
    gene_effect_df,
    model_df[['ModelID', 'CellLineName', 'OncotreeLineage']],
    on='ModelID',
    how='inner',
)

# Column order for the working frame: OncotreeLineage is placed at position 1,
# right after the first column of the gene matrix, followed by the remaining
# gene-matrix columns. CellLineName stays in merged_df but is not carried over.
new_cols = gene_effect_df.columns.tolist()
new_cols.insert(1, "OncotreeLineage")

# .copy() makes gene_df an independent frame, so columns can be added to it
# later without triggering pandas' SettingWithCopyWarning.
gene_df = merged_df[new_cols].copy()

In [3]:
# Lineage name -> integer code, numbered in order of first appearance.
categorical_mapping = {
    lineage: code
    for code, lineage in enumerate(gene_df['OncotreeLineage'].unique())
}

# .assign returns a new frame instead of writing into a frame derived from
# another one, which avoids SettingWithCopyWarning.
# NOTE(review): fillna(0) is applied to every column, so a missing
# OncotreeLineage value would also become 0 — confirm that is intended.
gene_df = (
    gene_df
    .assign(**{'Lineage as Number': gene_df['OncotreeLineage'].map(categorical_mapping)})
    .fillna(0)
)

# Guarded so that re-running this cell does not raise KeyError once ModelID
# has already been moved into the index.
if 'ModelID' in gene_df.columns:
    gene_df = gene_df.set_index('ModelID')

In [4]:
# Target vector: the integer-encoded lineage label.
y = gene_df['Lineage as Number']

# Feature matrix: everything in gene_df except the two lineage columns.
X = gene_df.drop(['OncotreeLineage', 'Lineage as Number'], axis='columns')

In [5]:
# Hold out 10% of the rows for evaluation. The split is seeded so the accuracy
# printed below (and the fitted coefficients) are reproducible across runs;
# the seed matches the random_state used for KFold in this notebook.
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X, y, test_size=0.1, random_state=1
)

# Multinomial logistic regression over the lineage codes.
# NOTE(review): recent scikit-learn releases deprecate the multi_class
# argument — confirm against the installed version before upgrading.
model_log = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 1000)
model_log.fit(X_train_log, y_train_log)

y_pred_log = model_log.predict(X_test_log)

# Fraction of held-out rows whose lineage code was predicted exactly.
accuracy_log = accuracy_score(y_test_log, y_pred_log)
print(accuracy_log)

0.8445945945945946


In [6]:
# Five shuffled folds over the full dataset, seeded for repeatable fold membership.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# One score per fold, using the estimator's default scorer.
cv_scores_log = cross_val_score(model_log, X, y, cv=kf)
avg_score_log = cv_scores_log.mean()

print(cv_scores_log)
print(avg_score_log)

[0.78378378 0.80405405 0.81418919 0.75675676 0.79322034]
0.7904008245533668


In [8]:
# Number of highest-|weight| features reported per lineage.
TOP_K = 3

# Integer class code -> lineage name, numbered in order of first appearance
# in gene_df['OncotreeLineage'].
reverse_map = dict(enumerate(gene_df['OncotreeLineage'].unique()))

coef_map = {}

coef = list(model_log.coef_)

# Rows of coef_ are ordered like model_log.classes_ (the labels seen during
# fit), not necessarily 0..N-1: a lineage absent from the training split has
# no row at all. Pairing each row with its class label keeps gene lists
# attached to the right lineage and avoids a hard-coded class count.
assert len(coef) == len(model_log.classes_), "coef_ rows do not line up with classes_"

for class_code, class_weights in zip(model_log.classes_, coef):
    # Indices of the TOP_K largest absolute coefficients; the stable sort
    # keeps the lower column index first when magnitudes tie.
    top_indices = np.argsort(-np.abs(class_weights), kind='stable')[:TOP_K]
    # Map those positions back to the feature (column) names of X.
    coef_map[reverse_map[class_code]] = X.columns[top_indices].tolist()

for category, top_features in coef_map.items():
    print(f"Top {TOP_K} features for {category}: {top_features}")

Top 5 features for Lung: ['XAGE1A (653220)', 'XAGE1B (653067)', 'RNF212 (285498)']
Top 5 features for CNS/Brain: ['LTBR (4055)', 'GREM1 (26585)', 'CTSZ (1522)']
Top 5 features for Skin: ['UCHL1 (7345)', 'KRT19 (3880)', 'G0S2 (50486)']
Top 5 features for Biliary Tract: ['PXDN (7837)', 'LY6K (54742)', 'BMP4 (652)']
Top 5 features for Bladder/Urinary Tract: ['RPL39L (116832)', 'IFITM2 (10581)', 'HOXA9 (3205)']
Top 5 features for Bowel: ['HOXA9 (3205)', 'PITX2 (5308)', 'KRT7 (3855)']
Top 5 features for Lymphoid: ['CNN3 (1266)', 'LAPTM4B (55353)', 'MZB1 (51237)']
Top 5 features for Soft Tissue: ['COL6A3 (1293)', 'GREM1 (26585)', 'IFI27 (3429)']
Top 5 features for Ampulla of Vater: ['TSPAN8 (7103)', 'MMP7 (4316)', 'IFI27 (3429)']
Top 5 features for Uterus: ['TMEM101 (84336)', 'C2orf74 (339804)', 'HSPA1A (3303)']
Top 5 features for Kidney: ['IGFN1 (91156)', 'SPP1 (6696)', 'SERPINE1 (5054)']
Top 5 features for Pancreas: ['BTBD6 (90135)', 'UCHL1 (7345)', 'LDHB (3945)']
Top 5 features for Esopha

In [9]:
# One row per lineage, one column per rank position; the lineage names move
# from the index into a regular 'Cell Lineage' column.
df_coef = (
    pd.DataFrame(coef_map)
    .T
    .fillna('')
    .reset_index()
    .rename(columns={'index': 'Cell Lineage'})
)

# Relabel the positional gene columns (everything after 'Cell Lineage') by rank.
new_columns = {
    col: f"#{rank} Most Weighted Gene"
    for rank, col in enumerate(df_coef.columns[1:], start=1)
}
df_coef = df_coef.rename(columns=new_columns)

df_coef

Unnamed: 0,Cell Lineage,#1 Most Weighted Gene,#2 Most Weighted Gene,#3 Most Weighted Gene
0,Lung,XAGE1A (653220),XAGE1B (653067),RNF212 (285498)
1,CNS/Brain,LTBR (4055),GREM1 (26585),CTSZ (1522)
2,Skin,UCHL1 (7345),KRT19 (3880),G0S2 (50486)
3,Biliary Tract,PXDN (7837),LY6K (54742),BMP4 (652)
4,Bladder/Urinary Tract,RPL39L (116832),IFITM2 (10581),HOXA9 (3205)
5,Bowel,HOXA9 (3205),PITX2 (5308),KRT7 (3855)
6,Lymphoid,CNN3 (1266),LAPTM4B (55353),MZB1 (51237)
7,Soft Tissue,COL6A3 (1293),GREM1 (26585),IFI27 (3429)
8,Ampulla of Vater,TSPAN8 (7103),MMP7 (4316),IFI27 (3429)
9,Uterus,TMEM101 (84336),C2orf74 (339804),HSPA1A (3303)
