In [2]:
import pandas as pd

expr = pd.read_csv("E-MTAB-3610_matrix.csv", sep=",", index_col=0)
ic50 = pd.read_csv("gdsc1_ic50_cleaned.csv", sep=",", index_col=False)



In [3]:
expr = expr.transpose()
expr.reset_index(inplace=True)
expr.rename(columns={"index": "CELL_LINE_NAME"}, inplace=True)


In [4]:
expr["CELL_LINE_NAME"] = expr["CELL_LINE_NAME"].str.upper().str.strip()
ic50["CELL_LINE_NAME"] = ic50["CELL_LINE_NAME"].str.upper().str.strip()


In [5]:
top_cancer_genes = [
    "TP53", "EGFR", "BRCA1", "BRCA2", "KRAS", "PIK3CA", "PTEN", "ALK", "BRAF", "MYC",
    "CDKN2A", "RB1", "ARID1A", "CTNNB1", "ATM", "IDH1", "IDH2", "SMAD4", "NRAS", "VHL",
    "APC", "MDM2", "FGFR1", "FGFR2", "FGFR3", "NF1", "MET", "ERBB2", "ERBB3", "CDH1",
    "NTRK1", "NTRK2", "NTRK3", "AKT1", "AKT2", "AKT3", "MTOR", "PDGFRA", "KIT", "ROS1",
    "RET", "CHEK2", "MLH1", "MSH2", "MSH6", "TSC1", "TSC2", "CDK4", "CDK6", "CCND1",
    "NOTCH1", "NOTCH2", "GNAS", "SMARCA4", "SMARCB1", "STK11", "EZH2", "EP300", "CREBBP", 
    "FOXA1", "GATA3", "MAP2K1", "MAP2K2", "HRAS", "NFE2L2", "POLE", "FANCA", "FANCD2",
    "SUFU", "TERT", "WT1", "ZFHX3", "CHD4", "FAT1", "HNF1A", "KMT2A", "KMT2C", "KMT2D",
    "AR", "PDCD1", "CD274", "JAK1", "JAK2", "IL7R", "SOCS1", "STAT3", "CXCR4", "TNFAIP3",
    "BCL2", "BCL6", "CD79B", "TNFRSF14", "TRAF3", "NFKBIA", "NFKB2", "REL", "FOXP1", "IKZF1"
]
selected_genes = ["CELL_LINE_NAME"] + [gene for gene in top_cancer_genes if gene in expr.columns]

expr = expr[selected_genes]


In [6]:
data = pd.merge(ic50, expr, on="CELL_LINE_NAME")
print("Merged shape:", data.shape)


Merged shape: (63937, 101)


In [7]:
data['labels'] = data.groupby('DRUG_NAME')['LN_IC50'].transform(lambda x: (x < x.median()).astype(int))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

models1 = {}

genes = [g for g in expr.columns if g != "CELL_LINE_NAME" and g not in ["DRUG_NAME", "LN_IC50"]]

for drug in data['DRUG_NAME'].unique():
    subset = data[data['DRUG_NAME'] == drug]
    X = subset[genes]
    y = subset['labels']

    if len(y.unique()) < 2:
        continue

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    train_index = X_train.index
    test_index = X_test.index

    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=genes, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=genes, index=X_test.index)


    model = LogisticRegression(max_iter=1000, solver='lbfgs')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, model.predict(X_test))
    print(f"{str(drug):<20} Accuracy: {acc:.2f}")
    
    models1[drug] = (model, scaler)

Erlotinib            Accuracy: 0.57
Rapamycin            Accuracy: 0.75
Sunitinib            Accuracy: 0.43
PHA-665752           Accuracy: 0.50
MG-132               Accuracy: 0.50
Paclitaxel           Accuracy: 0.23
Cyclopamine          Accuracy: 0.57
AZ628                Accuracy: 0.50
Sorafenib            Accuracy: 0.21
Tozasertib           Accuracy: 0.46
Imatinib             Accuracy: 0.29
NVP-TAE684           Accuracy: 0.79
Crizotinib           Accuracy: 0.29
Saracatinib          Accuracy: 0.57
S-Trityl-L-cysteine  Accuracy: 0.43
Z-LLNle-CHO          Accuracy: 0.29
Dasatinib            Accuracy: 0.50
GNF-2                Accuracy: 0.54
CGP-60474            Accuracy: 0.64
CGP-082996           Accuracy: 0.29
A-770041             Accuracy: 0.71
WH-4-023             Accuracy: 0.64
WZ-1-84              Accuracy: 0.57
BI-2536              Accuracy: 0.43
BMS-536924           Accuracy: 0.63
BMS-509744           Accuracy: 0.50
CMK                  Accuracy: 0.14
Pyrimethamine        Accurac

In [49]:
# random forest 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

models2 = {}

genes = [g for g in expr.columns if g != "CELL_LINE_NAME" and g not in ["DRUG_NAME", "LN_IC50"]]

for drug in data['DRUG_NAME'].unique():
    subset = data[data['DRUG_NAME'] == drug]
    X = subset[genes]
    y = subset['labels']

    if len(y.unique()) < 2:
        continue

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    train_index = X_train.index
    test_index = X_test.index

    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=genes, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=genes, index=X_test.index)


    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, model.predict(X_test))
    print(f"{str(drug):<20} Accuracy: {acc:.2f}")
    
    models2[drug] = (model, scaler)

Erlotinib            Accuracy: 0.50
Rapamycin            Accuracy: 0.58
Sunitinib            Accuracy: 0.43
PHA-665752           Accuracy: 0.64
MG-132               Accuracy: 0.64
Paclitaxel           Accuracy: 0.38
Cyclopamine          Accuracy: 0.50
AZ628                Accuracy: 0.57
Sorafenib            Accuracy: 0.29
Tozasertib           Accuracy: 0.38
Imatinib             Accuracy: 0.50
NVP-TAE684           Accuracy: 0.64
Crizotinib           Accuracy: 0.21
Saracatinib          Accuracy: 0.71
S-Trityl-L-cysteine  Accuracy: 0.43
Z-LLNle-CHO          Accuracy: 0.50
Dasatinib            Accuracy: 0.71
GNF-2                Accuracy: 0.62
CGP-60474            Accuracy: 0.50
CGP-082996           Accuracy: 0.43
A-770041             Accuracy: 0.71
WH-4-023             Accuracy: 0.64
WZ-1-84              Accuracy: 0.50
BI-2536              Accuracy: 0.43
BMS-536924           Accuracy: 0.59
BMS-509744           Accuracy: 0.50
CMK                  Accuracy: 0.57
Pyrimethamine        Accurac

In [50]:
# Save the models
import joblib
joblib.dump(models2, "models.pkl")
joblib.dump(genes, "genes.pkl")

['genes.pkl']

In [62]:
tumor_sample = {
    "TP53": 3.0,     # LOW (loss of tumor suppression)
    "EGFR": 12.0,    # HIGH (epidermal growth factor receptor overexpressed)
    "KRAS": 10.5,    # HIGH (activated MAPK pathway)
    "BRCA1": 2.0,    # LOW (DNA repair defect)
    "PIK3CA": 8.5,   # HIGH (PI3K pathway activation)
    "BRAF": 4.0,     # Normal
    "MYC": 7.0,      # Moderate
    "PTEN": 3.5,     # LOW (tumor suppressor lost)
    "CDKN2A": 6.2,   # Normal
    "RB1": 6.0       # Normal
}

for g in genes:
    tumor_sample.setdefault(g, 0)

X_new = pd.DataFrame([tumor_sample])[genes]

In [63]:

bien = []
for drug, (model, scaler) in models2.items():
    prob = model.predict_proba(X)[0][1]
    bien.append((drug, prob))

    bien.sort(key=lambda x: x[1], reverse=True)
print("Top Predicted Treatments:")
for drug, score in bien[:10]:
    print(f"{drug} - Predicted Sensitivity: {score:.2%}")

Top Predicted Treatments:
Kobe2602 - Predicted Sensitivity: 69.00%
N24798-49-A1 - Predicted Sensitivity: 68.00%
CUDC-101 - Predicted Sensitivity: 67.00%
Tivozanib - Predicted Sensitivity: 67.00%
UNC1215 - Predicted Sensitivity: 67.00%
Ruxolitinib - Predicted Sensitivity: 66.00%
AST-1306 - Predicted Sensitivity: 66.00%
ACY-1215 - Predicted Sensitivity: 65.00%
CX-5461 - Predicted Sensitivity: 65.00%
PIK-93 - Predicted Sensitivity: 65.00%
