In [1]:
import pandas as pd
from src import DatasetManager, MinMaxScaling, IdentityLineFeaturizer, IdentityDrugFeaturizer, IdentityPipeline
from GDSC import GDSCPreprocessingPipeline, GDSCRawPreprocessingPipeline
from PRISM import PRISMPreprocessingPipeline
from CTRPv2 import CTRPv2PreprocessingPipeline
from NI60 import NI60PreprocessingPipeline
import os
import numpy as np
from fingerprints import FingerprintFeaturizer



In [2]:
# Load the PaccMann gene list (headerless text file) as a plain Python list.
# It is later passed as `gene_subset` to the GDSC preprocessing pipeline.
# NOTE(review): squeeze() assumes the file has a single column with more than
# one row; confirm if the list is ever regenerated.
paccmann_genes = (
    pd.read_csv(
        "https://raw.githubusercontent.com/prassepaul/mlmed_ranking/main/data/gdsc_data/paccmann_gene_list.txt",
        index_col=None,
        header=None,
    )
    .to_numpy()
    .squeeze()
    .tolist()
)

In [3]:
# Build the dataset manager on top of the GDSC preprocessing pipeline, using
# LN_IC50 as the target and restricting genes to the PaccMann list.
manager = DatasetManager(
    processing_pipeline=GDSCPreprocessingPipeline(
        target="LN_IC50",
        gene_subset=paccmann_genes,
        cell_lines="expression&mutation",
    ),
    target_processor=IdentityPipeline(),
    # presumably partitions are formed by grouping on this column -- TODO confirm
    partition_column="DRUG_ID",
    # presumably the number of partitions; the evaluation loop requests indices 0..24
    k=25,
    drug_featurizer=FingerprintFeaturizer(),
    line_featurizer=IdentityLineFeaturizer(),
)

In [4]:
# Fetch the cell-line and drug collections once; both are handed to
# manager.get_tabular_dataset() for every partition in the evaluation loop.
line_dict, drug_dict = manager.get_cell_lines(), manager.get_drugs()

In [5]:
from sklearn.linear_model import Ridge
from scipy.stats import pearsonr
from tqdm import tqdm

In [6]:
# For each of the 25 partitions: fit a default Ridge model on the training
# split and correlate its predictions with the observed validation targets.
# The test split is unpacked but not used in this evaluation.
rs = []
for i in tqdm(range(25)):
    train, val, test = manager.get_partition(i)

    data_train = manager.get_tabular_dataset(train, line_dict, drug_dict)
    data_val = manager.get_tabular_dataset(val, line_dict, drug_dict)

    # Column "Y" is the regression target; every other column is a feature.
    X_train, y_train = data_train.drop(columns="Y"), data_train["Y"]
    X_val, y_val = data_val.drop(columns="Y"), data_val["Y"]

    # fit() returns the estimator itself, so construction and fitting chain.
    clf = Ridge().fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    # pearsonr yields (statistic, p-value); the whole result is stored so the
    # summary cell can pick out the statistic by position.
    r = pearsonr(y_pred, y_val)
    rs.append(r)

100%|███████████████████████████████████████████| 25/25 [12:24<00:00, 29.80s/it]


In [9]:
# Average the Pearson statistic (element 0 of each stored result) over all partitions.
print(f"Cross-validated R (observed-predicted) using Ridge: {np.mean([result[0] for result in rs])}")

Cross-validated R (observed-predicted) using Ridge: 0.4131462927113253
