In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.linear_model import LogisticRegression


import matplotlib.pyplot as plt

In [2]:
gene_effect_df = pd.read_csv("GeneExpression.csv").rename(columns = {'Unnamed: 0': 'ModelID'})
model_df = pd.read_csv("Model.csv")

merged_df = pd.merge(gene_effect_df, model_df[['ModelID', 'OncotreePrimaryDisease']], on='ModelID', how='inner')

new_cols = gene_effect_df.columns.tolist()
new_cols.insert(1, "OncotreePrimaryDisease")

gene_df = merged_df[new_cols]

In [3]:
categorical_mapping = {item: idx for idx, item in enumerate(list(gene_df['OncotreePrimaryDisease'].unique()))}
gene_df['Disease as Number'] = gene_df['OncotreePrimaryDisease'].map(categorical_mapping)

gene_df = gene_df.fillna(0)

gene_df = gene_df.set_index('ModelID')

# gene_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_df['Disease as Number'] = gene_df['OncotreePrimaryDisease'].map(categorical_mapping)


In [4]:
X = gene_df.drop(columns = ['Disease as Number', 'OncotreePrimaryDisease'])
y = gene_df['Disease as Number']

In [5]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X, y, test_size=0.1)

model_log = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs', max_iter = 1000)
model_log.fit(X_train_log, y_train_log)

y_pred_log = model_log.predict(X_test_log)

accuracy_log = accuracy_score(y_test_log, y_pred_log)
print(accuracy_log)

0.777027027027027


In [6]:
kf = KFold(n_splits = 5, random_state = 1, shuffle=True)

cv_scores_log = cross_val_score(model_log, X, y, cv=kf)
avg_score_log = np.mean(cv_scores_log)
print(cv_scores_log)
print(avg_score_log)

[0.74662162 0.77702703 0.76013514 0.72297297 0.73559322]
0.748469995419148


In [10]:
reverse_map = {idx: item for idx, item in enumerate(list(gene_df['OncotreePrimaryDisease'].unique()))}

coef_map = {}

coef = list(model_log.coef_)

for i in range(75):
    # Get indices of top 5 coefficients
    top_5_indices = sorted(range(len(coef[i])), key=lambda x: abs(coef[i][x]), reverse=True)[:5]
    # Map top 5 coefficients to their corresponding features
    top_5_features = [X.columns[index] for index in top_5_indices]
    # Store in coef_map
    coef_map[reverse_map[i]] = top_5_features

# Print the coef_map
for category, top_5_features in coef_map.items():
    print(f"Top 5 features for {category}: {top_5_features}")

Top 5 features for Non-Small Cell Lung Cancer: ['XAGE1A (653220)', 'XAGE1B (653067)', 'RNF212 (285498)', 'BST2 (684)', 'HOXB2 (3212)']
Top 5 features for Embryonal Tumor: ['CHRNA9 (55584)', 'MGP (4256)', 'TSTD1 (100131187)', 'S100A4 (6275)', 'HENMT1 (113802)']
Top 5 features for Melanoma: ['UCHL1 (7345)', 'MEST (4232)', 'LCP1 (3936)', 'S100B (6285)', 'GJA1 (2697)']
Top 5 features for Intraductal Papillary Neoplasm of the Bile Duct: ['SPP1 (6696)', 'BMP4 (652)', 'PXDN (7837)', 'CFH (3075)', 'CPE (1363)']
Top 5 features for Bladder Urothelial Carcinoma: ['LCP1 (3936)', 'RPL39L (116832)', 'IFITM2 (10581)', 'HOXA9 (3205)', 'PXDN (7837)']
Top 5 features for Colorectal Adenocarcinoma: ['HOXA9 (3205)', 'PITX2 (5308)', 'RAB34 (83871)', 'HOXB8 (3218)', 'OSMR (9180)']
Top 5 features for Mature T and NK Neoplasms: ['GZMB (3002)', 'CD96 (10225)', 'PTPRC (5788)', 'TNFRSF8 (943)', 'SLA (6503)']
Top 5 features for Liposarcoma: ['MDM2 (4193)', 'MARCHF9 (92979)', 'CDK4 (1019)', 'B4GALNT1 (2583)', 'TSPA

In [13]:
df_coef = pd.DataFrame(coef_map)

df_coef = df_coef.transpose()

df_coef.fillna('', inplace=True)

df_coef.reset_index(inplace=True)

df_coef.rename(columns={'index': 'Primary Disease'}, inplace=True)

new_columns = {col: f"#{i+1} Most Weighted RNA" for i, col in enumerate(df_coef.columns[1:])}
df_coef.rename(columns=new_columns, inplace=True)

df_coef

Unnamed: 0,Primary Disease,#1 Most Weighted RNA,#2 Most Weighted RNA,#3 Most Weighted RNA,#4 Most Weighted RNA,#5 Most Weighted RNA
0,Non-Small Cell Lung Cancer,XAGE1A (653220),XAGE1B (653067),RNF212 (285498),BST2 (684),HOXB2 (3212)
1,Embryonal Tumor,CHRNA9 (55584),MGP (4256),TSTD1 (100131187),S100A4 (6275),HENMT1 (113802)
2,Melanoma,UCHL1 (7345),MEST (4232),LCP1 (3936),S100B (6285),GJA1 (2697)
3,Intraductal Papillary Neoplasm of the Bile Duct,SPP1 (6696),BMP4 (652),PXDN (7837),CFH (3075),CPE (1363)
4,Bladder Urothelial Carcinoma,LCP1 (3936),RPL39L (116832),IFITM2 (10581),HOXA9 (3205),PXDN (7837)
...,...,...,...,...,...,...
70,Prostate Small Cell Carcinoma,FGB (2244),SPP1 (6696),ALDH1A1 (216),SLC17A3 (10786),CXCL5 (6374)
71,Epithelioid Sarcoma,MMP1 (4312),CXCL6 (6372),SMR3B (10879),MMP13 (4322),TIE1 (7075)
72,Poorly Differentiated Thyroid Cancer,CALCA (796),SST (6750),TSPAN8 (7103),TPH1 (7166),GCNT3 (9245)
73,Pancreatic Neuroendocrine Tumor,HBG2 (3048),PRG2 (5553),HLA-DRA (3122),FAM178B (51252),AVP (551)
