In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.linear_model import LogisticRegression


import matplotlib.pyplot as plt

In [2]:
gene_effect_df = pd.read_csv("CRISPRGeneDependency.csv").rename(columns = {'Unnamed: 0': 'ModelID'})
model_df = pd.read_csv("Model.csv")

merged_df = pd.merge(gene_effect_df, model_df[['ModelID', 'OncotreePrimaryDisease']], on='ModelID', how='inner')

new_cols = gene_effect_df.columns.tolist()
new_cols.insert(1, "OncotreePrimaryDisease")

gene_df = merged_df[new_cols]

In [3]:
categorical_mapping = {item: idx for idx, item in enumerate(list(gene_df['OncotreePrimaryDisease'].unique()))}
gene_df['Disease as Number'] = gene_df['OncotreePrimaryDisease'].map(categorical_mapping)

gene_df = gene_df.fillna(0)

gene_df = gene_df.set_index('ModelID')

# gene_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_df['Disease as Number'] = gene_df['OncotreePrimaryDisease'].map(categorical_mapping)


In [4]:
X = gene_df.drop(columns = ['Disease as Number', 'OncotreePrimaryDisease'])
y = gene_df['Disease as Number']

In [5]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X, y, test_size=0.1)

model_log = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 1000)
model_log.fit(X_train_log, y_train_log)

y_pred_log = model_log.predict(X_test_log)

accuracy_log = accuracy_score(y_test_log, y_pred_log)
print(accuracy_log)

0.6


In [6]:
kf = KFold(n_splits = 5, random_state = 1, shuffle=True)

cv_scores_log = cross_val_score(model_log, X, y, cv=kf)
avg_score_log = np.mean(cv_scores_log)
print(cv_scores_log)
print(avg_score_log)

[0.61363636 0.55       0.62727273 0.62272727 0.62727273]
0.6081818181818182


In [8]:
reverse_map = {idx: item for idx, item in enumerate(list(gene_df['OncotreePrimaryDisease'].unique()))}

coef_map = {}

coef = list(model_log.coef_)

for i in range(72):
    # Get indices of top 5 coefficients
    top_5_indices = sorted(range(len(coef[i])), key=lambda x: abs(coef[i][x]), reverse=True)[:5]
    # Map top 5 coefficients to their corresponding features
    top_5_features = [X.columns[index] for index in top_5_indices]
    # Store in coef_map
    coef_map[reverse_map[i]] = top_5_features

# Print the coef_map
for category, top_5_features in coef_map.items():
    print(f"Top 5 features for {category}: {top_5_features}")

Top 5 features for Ovarian Epithelial Tumor: ['PAX8 (7849)', 'PARD6B (84612)', 'STRAP (11171)', 'PARD3 (56288)', 'STRIP1 (85369)']
Top 5 features for Acute Myeloid Leukemia: ['MYB (4602)', 'SPI1 (6688)', 'CCND1 (595)', 'ZEB2 (9839)', 'CBFB (865)']
Top 5 features for Colorectal Adenocarcinoma: ['CTNNB1 (1499)', 'TCF7L2 (6934)', 'CHMP4B (128866)', 'FERMT2 (10979)', 'GPX4 (2879)']
Top 5 features for Bladder Urothelial Carcinoma: ['SOX4 (6659)', 'FAM50A (9130)', 'PPARG (5468)', 'ZNF143 (7702)', 'RXRA (6256)']
Top 5 features for Non-Small Cell Lung Cancer: ['NFE2L2 (4780)', 'UROD (7389)', 'SMARCA2 (6595)', 'RPP25L (138716)', 'TMX2 (51075)']
Top 5 features for Invasive Breast Carcinoma: ['ZFX (7543)', 'UMPS (7372)', 'CUL2 (8453)', 'GATA3 (2625)', 'SPDEF (25803)']
Top 5 features for Pancreatic Adenocarcinoma: ['KRAS (3845)', 'FLII (2314)', 'RAB10 (10890)', 'DOCK5 (80005)', 'DYNLL1 (8655)']
Top 5 features for Mature B-Cell Neoplasms: ['POU2AF1 (5450)', 'GRB2 (2885)', 'PIM2 (11040)', 'MBNL1 (41

In [9]:
df_coef = pd.DataFrame(coef_map)

df_coef = df_coef.transpose()

df_coef.fillna('', inplace=True)

df_coef.reset_index(inplace=True)

df_coef.rename(columns={'index': 'Primary Disease'}, inplace=True)

new_columns = {col: f"#{i+1} Most Weighted Gene" for i, col in enumerate(df_coef.columns[1:])}
df_coef.rename(columns=new_columns, inplace=True)

df_coef

Unnamed: 0,Primary Disease,#1 Most Weighted Gene,#2 Most Weighted Gene,#3 Most Weighted Gene,#4 Most Weighted Gene,#5 Most Weighted Gene
0,Ovarian Epithelial Tumor,PAX8 (7849),PARD6B (84612),STRAP (11171),PARD3 (56288),STRIP1 (85369)
1,Acute Myeloid Leukemia,MYB (4602),SPI1 (6688),CCND1 (595),ZEB2 (9839),CBFB (865)
2,Colorectal Adenocarcinoma,CTNNB1 (1499),TCF7L2 (6934),CHMP4B (128866),FERMT2 (10979),GPX4 (2879)
3,Bladder Urothelial Carcinoma,SOX4 (6659),FAM50A (9130),PPARG (5468),ZNF143 (7702),RXRA (6256)
4,Non-Small Cell Lung Cancer,NFE2L2 (4780),UROD (7389),SMARCA2 (6595),RPP25L (138716),TMX2 (51075)
...,...,...,...,...,...,...
67,Glassy Cell Carcinoma of the Cervix,FDFT1 (2222),DUSP4 (1846),IRF4 (3662),BRAF (673),MITF (4286)
68,Mucosal Melanoma of the Vulva/Vagina,GJA3 (2700),YRDC (79693),KDM8 (79831),SAFB (6294),TRIR (79002)
69,Nerve Sheath Tumor,EIF2B5 (8893),MED22 (6837),RPLP2 (6181),TFAP2C (7022),DERL1 (79139)
70,Extra Gonadal Germ Cell Tumor,GNB1 (2782),EDF1 (8721),ERBB3 (2065),FIS1 (51024),ATP6V0D1 (9114)
