In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

In [2]:
gene_effect_df = pd.read_csv("GeneMutation.csv").rename(columns = {'Unnamed: 0': 'ModelID'})
model_df = pd.read_csv("Model.csv")

merged_df = pd.merge(gene_effect_df, model_df[['ModelID','CellLineName','OncotreeLineage']], on='ModelID', how='inner')

new_cols = gene_effect_df.columns.tolist()
new_cols.insert(1, "OncotreeLineage")
# new_cols.insert(2, "CellLineName")

gene_df = merged_df[new_cols]

  gene_effect_df = pd.read_csv("GeneMutation.csv").rename(columns = {'Unnamed: 0': 'ModelID'})


In [3]:
categorical_mapping = {item: idx for idx, item in enumerate(list(gene_df['OncotreeLineage'].unique()))}
gene_df['Lineage as Number'] = gene_df['OncotreeLineage'].map(categorical_mapping)

gene_df = gene_df.fillna(0)

gene_df = gene_df.set_index('ModelID')

# gene_df.head()

In [4]:
X = gene_df.drop(columns = ['Lineage as Number', 'OncotreeLineage'])
y = gene_df['Lineage as Number']

In [11]:
categorical_cols = X.select_dtypes(include=['object']).columns
X = X.drop(columns = categorical_cols)

In [12]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X, y, test_size=0.1)

model_log = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 1000)
model_log.fit(X_train_log, y_train_log)

y_pred_log = model_log.predict(X_test_log)

accuracy_log = accuracy_score(y_test_log, y_pred_log)
print(accuracy_log)

0.1680303871965409


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Couldn't complete due to long run time
# kf = KFold(n_splits = 5, random_state = 1, shuffle=True)

# cv_scores_log = cross_val_score(model_log, X, y, cv=kf)
# avg_score_log = np.mean(cv_scores_log)
# print(cv_scores_log)
# print(avg_score_log)

In [15]:
reverse_map = {idx: item for idx, item in enumerate(list(gene_df['OncotreeLineage'].unique()))}

coef_map = {}

coef = list(model_log.coef_)

for i in range(30):
    # Get indices of top 5 coefficients
    top_5_indices = sorted(range(len(coef[i])), key=lambda x: abs(coef[i][x]), reverse=True)[:3]
    # Map top 5 coefficients to their corresponding features
    top_5_features = [X.columns[index] for index in top_5_indices]
    # Store in coef_map
    coef_map[reverse_map[i]] = top_5_features

for category, top_5_features in coef_map.items():
    print(f"Top 3 features for {category}: {top_5_features}")


Top 3 features for Lung: ['Pos', 'GwasPmID', 'EntrezGeneID']
Top 3 features for Esophagus/Stomach: ['Pos', 'GwasPmID', 'EntrezGeneID']
Top 3 features for Breast: ['GwasPmID', 'Pos', 'EntrezGeneID']
Top 3 features for Head and Neck: ['Pos', 'EntrezGeneID', 'GwasPmID']
Top 3 features for Prostate: ['GwasPmID', 'PS', 'Pos']
Top 3 features for CNS/Brain: ['GwasPmID', 'Pos', 'EntrezGeneID']
Top 3 features for Ovary/Fallopian Tube: ['Pos', 'EntrezGeneID', 'PS']
Top 3 features for Liver: ['GwasPmID', 'Pos', 'EntrezGeneID']
Top 3 features for Uterus: ['Pos', 'GwasPmID', 'PS']
Top 3 features for Bowel: ['Pos', 'EntrezGeneID', 'GwasPmID']
Top 3 features for Skin: ['Pos', 'EntrezGeneID', 'GwasPmID']
Top 3 features for Kidney: ['GwasPmID', 'Pos', 'PS']
Top 3 features for Lymphoid: ['Pos', 'GwasPmID', 'EntrezGeneID']
Top 3 features for Bladder/Urinary Tract: ['GwasPmID', 'Pos', 'EntrezGeneID']
Top 3 features for Testis: ['Pos', 'EntrezGeneID', 'GwasPmID']
Top 3 features for Eye: ['GwasPmID', 'Pos',

In [16]:
df_coef = pd.DataFrame(coef_map)

df_coef = df_coef.transpose()

df_coef.fillna('', inplace=True)

df_coef.reset_index(inplace=True)

df_coef.rename(columns={'index': 'Cell Lineage'}, inplace=True)

new_columns = {col: f"#{i+1} Most Weighted Gene" for i, col in enumerate(df_coef.columns[1:])}
df_coef.rename(columns=new_columns, inplace=True)

df_coef

Unnamed: 0,Cell Lineage,#1 Most Weighted Gene,#2 Most Weighted Gene,#3 Most Weighted Gene
0,Lung,Pos,GwasPmID,EntrezGeneID
1,Esophagus/Stomach,Pos,GwasPmID,EntrezGeneID
2,Breast,GwasPmID,Pos,EntrezGeneID
3,Head and Neck,Pos,EntrezGeneID,GwasPmID
4,Prostate,GwasPmID,PS,Pos
5,CNS/Brain,GwasPmID,Pos,EntrezGeneID
6,Ovary/Fallopian Tube,Pos,EntrezGeneID,PS
7,Liver,GwasPmID,Pos,EntrezGeneID
8,Uterus,Pos,GwasPmID,PS
9,Bowel,Pos,EntrezGeneID,GwasPmID
