In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

In [2]:
# Load the dependency table and the model metadata. The column pandas labels
# 'Unnamed: 0' is renamed to ModelID so it can serve as the join key.
gene_effect_df = pd.read_csv("CRISPRGeneDependency.csv").rename(columns={'Unnamed: 0': 'ModelID'})
model_df = pd.read_csv("Model.csv")

# Inner join: keep only models present in both tables.
merged_df = pd.merge(
    gene_effect_df,
    model_df[['ModelID', 'CellLineName', 'OncotreeLineage']],
    on='ModelID',
    how='inner',
)

# Column order for the working frame: the first dependency-table column,
# then the lineage label, then the remaining dependency-table columns.
dependency_cols = gene_effect_df.columns.tolist()
new_cols = dependency_cols[:1] + ["OncotreeLineage"] + dependency_cols[1:]

# Take an explicit copy: gene_df gets a new column assigned later, and
# assigning into a column selection of merged_df triggers pandas'
# SettingWithCopyWarning.
gene_df = merged_df[new_cols].copy()

In [3]:
# Encode each lineage label as an integer, numbered in order of first appearance.
categorical_mapping = {
    item: idx for idx, item in enumerate(merged_df['OncotreeLineage'].unique())
}

# Build gene_df from merged_df in a single chain instead of mutating the
# previous gene_df. This keeps the cell idempotent (re-running it no longer
# fails on the already-consumed 'ModelID' column) and avoids assigning a column
# on a slice, which raises SettingWithCopyWarning.
gene_df = (
    merged_df[new_cols]
    .assign(**{'Lineage as Number': lambda frame: frame['OncotreeLineage'].map(categorical_mapping)})
    # NOTE(review): fillna(0) applies to every column, including the lineage
    # label columns, not only the gene columns — confirm this is intended.
    .fillna(0)
    .set_index('ModelID')
)

In [4]:
# Target: the integer-encoded lineage. Features: every remaining column once
# both lineage columns (encoded and original label) are removed.
label_column = 'Lineage as Number'
y = gene_df[label_column]
X = gene_df.drop(columns=[label_column, 'OncotreeLineage'])

In [5]:
# Hold out 10% of the rows for a quick accuracy check. The split is seeded so
# the reported accuracy (and the coefficients inspected later) are reproducible
# across kernel restarts; the seed matches the one used by the KFold splitter.
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X, y, test_size=0.1, random_state=1
)

# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
# (1.5+), where lbfgs already fits a multinomial model for multi-class targets;
# drop the argument once the environment's version is confirmed.
model_log = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 1000)
model_log.fit(X_train_log, y_train_log)

# Score the fitted model on the held-out rows.
y_pred_log = model_log.predict(X_test_log)

accuracy_log = accuracy_score(y_test_log, y_pred_log)
print(accuracy_log)

0.6636363636363637


In [6]:
# 5-fold cross-validation with a shuffled, seeded splitter.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# One score per fold, computed on the full feature matrix and labels.
cv_scores_log = cross_val_score(model_log, X, y, cv=kf)
avg_score_log = cv_scores_log.mean()

for result in (cv_scores_log, avg_score_log):
    print(result)

[0.66818182 0.59090909 0.65909091 0.65454545 0.70454545]
0.6554545454545455


In [8]:
# Number of highest-|coefficient| features to report per lineage.
top_n = 3

# Integer label -> lineage name (inverse of the encoding applied to y).
reverse_map = {idx: item for idx, item in enumerate(gene_df['OncotreeLineage'].unique())}

coef = list(model_log.coef_)

coef_map = {}

# Row k of coef_ belongs to model_log.classes_[k], not necessarily to label k:
# a lineage that is absent from the training split has no row at all. Pairing
# rows with classes_ keeps every row attached to the right lineage name and
# removes the hard-coded class count.
for class_label, class_coef in zip(model_log.classes_, coef):
    # Stable sort on descending magnitude; ties keep their column order.
    top_indices = np.argsort(-np.abs(class_coef), kind='stable')[:top_n]
    coef_map[reverse_map[class_label]] = X.columns[top_indices].tolist()

for category, top_features in coef_map.items():
    print(f"Top {top_n} features for {category}: {top_features}")

Top 3 features for Ovary/Fallopian Tube: ['PAX8 (7849)', 'STRAP (11171)', 'STRIP1 (85369)']
Top 3 features for Myeloid: ['MYB (4602)', 'CBFB (865)', 'LMO2 (4005)']
Top 3 features for Bowel: ['CTNNB1 (1499)', 'TCF7L2 (6934)', 'FERMT2 (10979)']
Top 3 features for Bladder/Urinary Tract: ['FAM50A (9130)', 'PPARG (5468)', 'SOX4 (6659)']
Top 3 features for Lung: ['NFE2L2 (4780)', 'KNTC1 (9735)', 'RPP25L (138716)']
Top 3 features for Breast: ['ZFX (7543)', 'TRPS1 (7227)', 'UMPS (7372)']
Top 3 features for Pancreas: ['KRAS (3845)', 'RAF1 (5894)', 'RPP25L (138716)']
Top 3 features for Lymphoid: ['IRF4 (3662)', 'NMNAT1 (64802)', 'MBNL1 (4154)']
Top 3 features for CNS/Brain: ['RPP25L (138716)', 'PRKAR1A (5573)', 'ARHGEF7 (8874)']
Top 3 features for Soft Tissue: ['CDS2 (8760)', 'RPP25L (138716)', 'MYOD1 (4654)']
Top 3 features for Bone: ['FLI1 (2313)', 'SUB1 (10923)', 'CDK6 (1021)']
Top 3 features for Esophagus/Stomach: ['GPX4 (2879)', 'ATP6V0E1 (8992)', 'SLC39A10 (57181)']
Top 3 features for Peri

In [9]:
# One row per lineage: transpose the {lineage: [genes]} mapping, blank out any
# missing cells, and expose the lineage name as a regular column.
df_coef = (
    pd.DataFrame(coef_map)
    .T
    .fillna('')
    .reset_index()
    .rename(columns={'index': 'Cell Lineage'})
)

# Label the remaining columns by rank, starting from #1.
new_columns = {
    col: f"#{rank} Most Weighted Gene"
    for rank, col in enumerate(df_coef.columns[1:], start=1)
}
df_coef = df_coef.rename(columns=new_columns)

df_coef

Unnamed: 0,Cell Lineage,#1 Most Weighted Gene,#2 Most Weighted Gene,#3 Most Weighted Gene
0,Ovary/Fallopian Tube,PAX8 (7849),STRAP (11171),STRIP1 (85369)
1,Myeloid,MYB (4602),CBFB (865),LMO2 (4005)
2,Bowel,CTNNB1 (1499),TCF7L2 (6934),FERMT2 (10979)
3,Bladder/Urinary Tract,FAM50A (9130),PPARG (5468),SOX4 (6659)
4,Lung,NFE2L2 (4780),KNTC1 (9735),RPP25L (138716)
5,Breast,ZFX (7543),TRPS1 (7227),UMPS (7372)
6,Pancreas,KRAS (3845),RAF1 (5894),RPP25L (138716)
7,Lymphoid,IRF4 (3662),NMNAT1 (64802),MBNL1 (4154)
8,CNS/Brain,RPP25L (138716),PRKAR1A (5573),ARHGEF7 (8874)
9,Soft Tissue,CDS2 (8760),RPP25L (138716),MYOD1 (4654)
