In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

In [2]:
# Load the per-model gene matrix; its unnamed first column holds the model
# identifier, so rename it to 'ModelID' to match the key used in Model.csv.
gene_effect_df = pd.read_csv("GeneCN.csv").rename(columns={'Unnamed: 0': 'ModelID'})
model_df = pd.read_csv("Model.csv")

# Inner join: keep only models present in both tables.
merged_df = pd.merge(
    gene_effect_df,
    model_df[['ModelID', 'CellLineName', 'OncotreeLineage']],
    on='ModelID',
    how='inner',
)

# Column order: ModelID, OncotreeLineage, then every gene column.
new_cols = gene_effect_df.columns.tolist()
new_cols.insert(1, "OncotreeLineage")

# Take an explicit copy so that later cells can add columns to gene_df
# without pandas raising SettingWithCopyWarning on a selection of merged_df.
gene_df = merged_df[new_cols].copy()

In [3]:
# Encode each lineage name as an integer, numbered in order of first appearance.
lineage_names = gene_df['OncotreeLineage'].unique()
categorical_mapping = {name: code for code, name in enumerate(lineage_names)}

# Build the modelling frame without mutating the incoming frame in place:
# .assign returns a new DataFrame, which avoids SettingWithCopyWarning when
# gene_df was produced by a column selection.
gene_df = (
    gene_df
    .assign(**{'Lineage as Number': gene_df['OncotreeLineage'].map(categorical_mapping)})
    .fillna(0)              # missing values anywhere in the frame become 0
    .set_index('ModelID')   # ModelID is an identifier, not a feature
)

In [4]:
# Target: the integer-encoded lineage. Features: every remaining column once
# both lineage columns (encoded and original) are removed.
y = gene_df['Lineage as Number']
X = gene_df.drop(['Lineage as Number', 'OncotreeLineage'], axis=1)

In [5]:
# Hold out 10% of the rows for a quick accuracy check. The split is seeded
# (same seed as the KFold used for cross-validation) so that the fitted model,
# its coefficients and the printed accuracy are reproducible across runs.
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X, y, test_size=0.1, random_state=1
)

# Multinomial logistic regression over all lineage classes.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# confirm the installed version before removing it.
model_log = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model_log.fit(X_train_log, y_train_log)

y_pred_log = model_log.predict(X_test_log)

accuracy_log = accuracy_score(y_test_log, y_pred_log)
print(accuracy_log)

0.5028571428571429


In [6]:
# Shuffled 5-fold cross-validation with a fixed seed; cross_val_score is given
# model_log as the estimator and returns one score per fold.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

cv_scores_log = cross_val_score(model_log, X, y, cv=kf)
avg_score_log = np.mean(cv_scores_log)

# Per-fold scores first, then their mean.
for cv_result in (cv_scores_log, avg_score_log):
    print(cv_result)

[0.48857143 0.48571429 0.46857143 0.46285714 0.48857143]
0.4788571428571428


In [11]:
# Number of highest-|coefficient| features to report per lineage.
TOP_N = 3

# Integer label -> lineage name (inverse of the encoding used for y).
reverse_map = {idx: item for idx, item in enumerate(list(gene_df['OncotreeLineage'].unique()))}

coef_map = {}

coef = list(model_log.coef_)

# Rows of coef_ follow model_log.classes_, which holds only the labels seen
# during fit. Pair each row with its own label instead of assuming that row i
# belongs to label i (and instead of hard-coding the number of classes), so a
# lineage absent from the training split cannot shift the names that follow it.
for class_label, class_coef in zip(model_log.classes_, coef):
    # Indices of the TOP_N coefficients with the largest absolute value;
    # a stable sort keeps ties in original column order.
    top_indices = np.argsort(-np.abs(class_coef), kind='stable')[:TOP_N]
    # Translate column positions into feature (gene) names.
    top_features = [X.columns[index] for index in top_indices]
    coef_map[reverse_map[class_label]] = top_features

for category, top_features in coef_map.items():
    print(f"Top {TOP_N} features for {category}: {top_features}")

Top 5 features for Lung: ['WASIR2 (100132169)', 'EYA1 (2138)', 'CASC11 (100270680)']
Top 5 features for Head and Neck: ['MIR1268A (100302233)', 'CHEK2P2 (646096)', 'RNVU1-17 (101954269)']
Top 5 features for Ovary/Fallopian Tube: ['KLRC2 (3822)', 'ORM1 (5004)', 'UGT2B28 (54490)']
Top 5 features for Breast: ['FAM230C (26080)', 'OR2T10 (127069)', 'OR2T11 (127077)']
Top 5 features for Bone: ['CDKN2A (1029)', 'LINC01676 (101928476)', 'PRMT6 (55170)']
Top 5 features for Bowel: ['ACOT1 (641371)', 'UGT2B17 (7367)', 'CDKN2A (1029)']
Top 5 features for Lymphoid: ['AC244502.1 (105370401)', 'MIR4436A (100616399)', 'PRSS1 (5644)']
Top 5 features for Skin: ['ORM1 (5004)', 'ACOT1 (641371)', 'WASIR2 (100132169)']
Top 5 features for Uterus: ['CCL4L2 (9560)', 'CCL3L3 (414062)', 'MIR4275 (100422937)']
Top 5 features for Eye: ['HLA-DQB1 (3119)', 'HLA-DQB1-AS1 (106480429)', 'HLA-DQA1 (3117)']
Top 5 features for Esophagus/Stomach: ['CCL4L2 (9560)', 'CCL3L3 (414062)', 'UGT2B17 (7367)']
Top 5 features for Ple

In [13]:
# One row per lineage, one column per rank: build the frame from coef_map
# (lineages as columns), flip it, blank out any gaps, and expose the lineage
# name as a regular column.
df_coef = (
    pd.DataFrame(coef_map)
    .transpose()
    .fillna('')
    .reset_index()
    .rename(columns={'index': 'Cell Lineage'})
)

# Give the positional rank columns (everything after 'Cell Lineage') readable headers.
new_columns = {col: f"#{i+1} Most Weighted Gene" for i, col in enumerate(df_coef.columns[1:])}
df_coef = df_coef.rename(columns=new_columns)

df_coef

Unnamed: 0,Cell Lineage,#1 Most Weighted Gene,#2 Most Weighted Gene,#3 Most Weighted Gene
0,Lung,WASIR2 (100132169),EYA1 (2138),CASC11 (100270680)
1,Head and Neck,MIR1268A (100302233),CHEK2P2 (646096),RNVU1-17 (101954269)
2,Ovary/Fallopian Tube,KLRC2 (3822),ORM1 (5004),UGT2B28 (54490)
3,Breast,FAM230C (26080),OR2T10 (127069),OR2T11 (127077)
4,Bone,CDKN2A (1029),LINC01676 (101928476),PRMT6 (55170)
5,Bowel,ACOT1 (641371),UGT2B17 (7367),CDKN2A (1029)
6,Lymphoid,AC244502.1 (105370401),MIR4436A (100616399),PRSS1 (5644)
7,Skin,ORM1 (5004),ACOT1 (641371),WASIR2 (100132169)
8,Uterus,CCL4L2 (9560),CCL3L3 (414062),MIR4275 (100422937)
9,Eye,HLA-DQB1 (3119),HLA-DQB1-AS1 (106480429),HLA-DQA1 (3117)
