In [13]:
import pandas as pd

expr = pd.read_csv("E-MTAB-3610_matrix.csv", sep=",", index_col=0)
ic50 = pd.read_csv("gdsc1_ic50_cleaned.csv", sep=",", index_col=False)



In [14]:
expr = expr.transpose()
expr.reset_index(inplace=True)
expr.rename(columns={"index": "CELL_LINE_NAME"}, inplace=True)


In [15]:
expr["CELL_LINE_NAME"] = expr["CELL_LINE_NAME"].str.upper().str.strip()
ic50["CELL_LINE_NAME"] = ic50["CELL_LINE_NAME"].str.upper().str.strip()


In [16]:
top_cancer_genes = [
    "TP53", "EGFR", "BRCA1", "BRCA2", "KRAS", "PIK3CA", "PTEN", "ALK", "BRAF", "MYC",
    "CDKN2A", "RB1", "ARID1A", "CTNNB1", "ATM", "IDH1", "IDH2", "SMAD4", "NRAS", "VHL",
    "APC", "MDM2", "FGFR1", "FGFR2", "FGFR3", "NF1", "MET", "ERBB2", "ERBB3", "CDH1",
    "NTRK1", "NTRK2", "NTRK3", "AKT1", "AKT2", "AKT3", "MTOR", "PDGFRA", "KIT", "ROS1",
    "RET", "CHEK2", "MLH1", "MSH2", "MSH6", "TSC1", "TSC2", "CDK4", "CDK6", "CCND1",
    "NOTCH1", "NOTCH2", "GNAS", "SMARCA4", "SMARCB1", "STK11", "EZH2", "EP300", "CREBBP", 
    "FOXA1", "GATA3", "MAP2K1", "MAP2K2", "HRAS", "NFE2L2", "POLE", "FANCA", "FANCD2",
    "SUFU", "TERT", "WT1", "ZFHX3", "CHD4", "FAT1", "HNF1A", "KMT2A", "KMT2C", "KMT2D",
    "AR", "PDCD1", "CD274", "JAK1", "JAK2", "IL7R", "SOCS1", "STAT3", "CXCR4", "TNFAIP3",
    "BCL2", "BCL6", "CD79B", "TNFRSF14", "TRAF3", "NFKBIA", "NFKB2", "REL", "FOXP1", "IKZF1"
]
selected_genes = ["CELL_LINE_NAME"] + [gene for gene in top_cancer_genes if gene in expr.columns]

expr = expr[selected_genes]


In [17]:
data = pd.merge(ic50, expr, on="CELL_LINE_NAME")
print("Merged shape:", data.shape)


Merged shape: (63937, 101)


In [18]:
data['labels'] = data.groupby('DRUG_NAME')['LN_IC50'].transform(lambda x: (x < x.median()).astype(int))


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

models = {}
genes = [g for g in expr.columns if g != "CELL_LINE_NAME"]

for drug in data['DRUG_NAME'].unique():
    subset = data[data['DRUG_NAME'] == drug]
    X = subset[genes]
    y = subset['labels']

    if len(y.unique()) < 2:
        print(f"Skipping drug {drug} due to insufficient classes.")
        continue
   
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)
    
    acc = accuracy_score(y_test, model.predict(X_test))
    print(f"{str(drug):<20} Accuracy: {acc:.2f}")
    
    models[drug] = model

Erlotinib            Accuracy: 0.57
Rapamycin            Accuracy: 0.67
Sunitinib            Accuracy: 0.57
PHA-665752           Accuracy: 0.57
MG-132               Accuracy: 0.36
Paclitaxel           Accuracy: 0.38
Cyclopamine          Accuracy: 0.57
AZ628                Accuracy: 0.50
Sorafenib            Accuracy: 0.21
Tozasertib           Accuracy: 0.54
Imatinib             Accuracy: 0.43
NVP-TAE684           Accuracy: 0.64
Crizotinib           Accuracy: 0.36
Saracatinib          Accuracy: 0.57
S-Trityl-L-cysteine  Accuracy: 0.21
Z-LLNle-CHO          Accuracy: 0.43
Dasatinib            Accuracy: 0.50
GNF-2                Accuracy: 0.69
CGP-60474            Accuracy: 0.79
CGP-082996           Accuracy: 0.29
A-770041             Accuracy: 0.57
WH-4-023             Accuracy: 0.57
WZ-1-84              Accuracy: 0.64
BI-2536              Accuracy: 0.36
BMS-536924           Accuracy: 0.49
BMS-509744           Accuracy: 0.57
CMK                  Accuracy: 0.43
Pyrimethamine        Accurac

In [48]:
tumor_sample = {
    "TP53": 8.2, "EGFR": 6.3, "KRAS": 7.2, "BRCA1": 4.8,
    "PIK3CA": 11.2, "BRAF": 5.6, "MYC": 8.8, "PTEN": 6.9,
    "CDKN2A": 6.1, "RB1": 6.5
}

for g in genes:
    tumor_sample.setdefault(g, 0)

X_new = pd.DataFrame([tumor_sample])[genes]

In [49]:
results = []

for drug, model in models.items():
    prob = model.predict_proba(X_new)[0][1]
    results.append((drug, prob))

results = sorted(results, key=lambda x: x[1], reverse=True)
print("\nTop recomended treatments:")
for drug, score in results[:10]:
    print(f"{drug:<20} Predicted Sensitivity: {score:.2%}")


Top recomended treatments:
Rapamycin            Predicted Sensitivity: 65.00%
NSC319726            Predicted Sensitivity: 63.00%
FR-180204            Predicted Sensitivity: 62.00%
AZD2014              Predicted Sensitivity: 62.00%
MIM1                 Predicted Sensitivity: 61.00%
Methotrexate         Predicted Sensitivity: 61.00%
Vismodegib           Predicted Sensitivity: 61.00%
Palbociclib          Predicted Sensitivity: 61.00%
Erlotinib            Predicted Sensitivity: 60.00%
Masitinib            Predicted Sensitivity: 60.00%


In [50]:
# Save the models
import joblib
joblib.dump(models, "models.pkl")
joblib.dump(genes, "genes.pkl")

['genes.pkl']