In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.linear_model import LogisticRegression


import matplotlib.pyplot as plt

In [2]:
gene_effect_df = pd.read_csv("GeneCN.csv").rename(columns = {'Unnamed: 0': 'ModelID'})
model_df = pd.read_csv("Model.csv")

merged_df = pd.merge(gene_effect_df, model_df[['ModelID', 'OncotreePrimaryDisease']], on='ModelID', how='inner')

new_cols = gene_effect_df.columns.tolist()
new_cols.insert(1, "OncotreePrimaryDisease")

gene_df = merged_df[new_cols]

In [3]:
categorical_mapping = {item: idx for idx, item in enumerate(list(gene_df['OncotreePrimaryDisease'].unique()))}
gene_df['Disease as Number'] = gene_df['OncotreePrimaryDisease'].map(categorical_mapping)

gene_df = gene_df.fillna(0)

gene_df = gene_df.set_index('ModelID')

# gene_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_df['Disease as Number'] = gene_df['OncotreePrimaryDisease'].map(categorical_mapping)


In [4]:
X = gene_df.drop(columns = ['Disease as Number', 'OncotreePrimaryDisease'])
y = gene_df['Disease as Number']

In [5]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X, y, test_size=0.1)

model_log = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 1000)
model_log.fit(X_train_log, y_train_log)

y_pred_log = model_log.predict(X_test_log)

accuracy_log = accuracy_score(y_test_log, y_pred_log)
print(accuracy_log)

0.4514285714285714


In [6]:
kf = KFold(n_splits = 5, random_state = 1, shuffle=True)

cv_scores_log = cross_val_score(model_log, X, y, cv=kf)
avg_score_log = np.mean(cv_scores_log)
print(cv_scores_log)
print(avg_score_log)

[0.47142857 0.46       0.43714286 0.44285714 0.45428571]
0.4531428571428572


In [38]:
reverse_map = {idx: item for idx, item in enumerate(list(gene_df['OncotreePrimaryDisease'].unique()))}

coef_map = {}

coef = list(model_log.coef_)

for i in range(80):
    # Get indices of top 5 coefficients
    top_5_indices = sorted(range(len(coef[i])), key=lambda x: abs(coef[i][x]), reverse=True)[:5]
    # Map top 5 coefficients to their corresponding features
    top_5_features = [X.columns[index] for index in top_5_indices]
    # Store in coef_map
    coef_map[reverse_map[i]] = top_5_features

for category, top_5_features in coef_map.items():
    print(f"Top 5 features for {category}: {top_5_features}")

Top 5 features for Lung Neuroendocrine Tumor: ['OR2T10 (127069)', 'OR2T11 (127077)', 'MYC (4609)', 'EYA1 (2138)', 'CASC11 (100270680)']
Top 5 features for Head and Neck Squamous Cell Carcinoma: ['MIR1268A (100302233)', 'RNVU1-17 (101954269)', 'RNVU1-18 (26863)', 'CHEK2P2 (646096)', 'CDH10 (1008)']
Top 5 features for Ovarian Epithelial Tumor: ['ORM1 (5004)', 'KLRC2 (3822)', 'UGT2B28 (54490)', 'KIR2DL4 (3805)', 'OR2T10 (127069)']
Top 5 features for Invasive Breast Carcinoma: ['OR2T10 (127069)', 'OR2T11 (127077)', 'MIR3675 (100500876)', 'FAM230C (26080)', 'CHEK2P2 (646096)']
Top 5 features for Osteosarcoma: ['FAM230C (26080)', 'CCT8L2 (150160)', 'TPTEP1 (387590)', 'XKR3 (150165)', 'TPTE (7179)']
Top 5 features for Colorectal Adenocarcinoma: ['UGT2B17 (7367)', 'ACOT1 (641371)', 'CDKN2A (1029)', 'CDKN2B (1030)', 'ADAM6 (8755)']
Top 5 features for Non-Hodgkin Lymphoma: ['MIR4436A (100616399)', 'AC093655.1 (105375341)', 'ADAM6 (8755)', 'CCL4L2 (9560)', 'CCL3L3 (414062)']
Top 5 features for Me

In [45]:
df_coef = pd.DataFrame(coef_map)

df_coef = df_coef.transpose()

df_coef.fillna('', inplace=True)

df_coef.reset_index(inplace=True)

df_coef.rename(columns={'index': 'Primary Disease'}, inplace=True)

new_columns = {col: f"#{i+1} Most Weighted Gene" for i, col in enumerate(df_coef.columns[1:])}
df_coef.rename(columns=new_columns, inplace=True)

df_coef

Unnamed: 0,Primary Disease,#1 Most Weighted Gene,#2 Most Weighted Gene,#3 Most Weighted Gene,#4 Most Weighted Gene,#5 Most Weighted Gene
0,Lung Neuroendocrine Tumor,OR2T10 (127069),OR2T11 (127077),MYC (4609),EYA1 (2138),CASC11 (100270680)
1,Head and Neck Squamous Cell Carcinoma,MIR1268A (100302233),RNVU1-17 (101954269),RNVU1-18 (26863),CHEK2P2 (646096),CDH10 (1008)
2,Ovarian Epithelial Tumor,ORM1 (5004),KLRC2 (3822),UGT2B28 (54490),KIR2DL4 (3805),OR2T10 (127069)
3,Invasive Breast Carcinoma,OR2T10 (127069),OR2T11 (127077),MIR3675 (100500876),FAM230C (26080),CHEK2P2 (646096)
4,Osteosarcoma,FAM230C (26080),CCT8L2 (150160),TPTEP1 (387590),XKR3 (150165),TPTE (7179)
...,...,...,...,...,...,...
75,Acute Leukemias of Ambiguous Lineage,GLI2 (2736),LINC00114 (400866),AP001042.3 (101928398),LINC01700 (101928435),ETS2 (2114)
76,Poorly Differentiated Thyroid Cancer,CFHR1 (3078),CFHR3 (10878),ADAM6 (8755),TUBA3E (112714),MTAP (4507)
77,Myelodysplastic Syndromes,CFHR3 (10878),CFHR1 (3078),MIR4477B (100616194),MIR876 (100126310),MIR873 (100126316)
78,"Head and Neck Carcinoma, Other",ALDH8A1 (64577),MIR3662 (100500880),HBS1L (10767),MYB (4602),MIR548A2 (693126)
