This notebook performs downstream Drug Response Prediction with the augmented cell line data and patient datasets. Uses a neural network to perform DRP.

In [1]:
import pandas as pd
import numpy as np

In [2]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader

In [3]:
import sys
sys.path.append("../src/tab_ddpm/")

In [4]:
device = torch.device("cuda:0")

#### Load Datasets

In [5]:
tcga_train_df = pd.read_csv("../data/diffusion_pretraining/tcga_diffusion_train_sample0.csv", index_col=0)
tcga_train_df.shape

(476, 7776)

In [6]:
tcga_test_df = pd.read_csv("../data/diffusion_pretraining/tcga_diffusion_test_sample0.csv", index_col=0)
tcga_test_df.shape

(120, 7776)

In [7]:
# augmented_cl_df = pd.read_csv("/data/ajayago/druid/intermediate/cs6220/augmented_cl_joint_UDA.csv", index_col=0)
augmented_cl_df = pd.read_csv("/data/ajayago/druid/intermediate/cs6220/augmented_cl_clconditioned.csv", index_col=0)
print(augmented_cl_df.shape)
augmented_cl_df

(1744, 7776)


Unnamed: 0,ABL1_piu_max,ACVR1B_piu_max,AKT1_piu_max,AKT2_piu_max,AKT3_piu_max,ALK_piu_max,ALOX12B_piu_max,APC_piu_max,AR_piu_max,ARAF_piu_max,...,U2AF1_benign_count,VEGFA_benign_count,VHL_benign_count,WHSC1_benign_count,WHSC1L1_benign_count,WT1_benign_count,XPO1_benign_count,XRCC2_benign_count,ZNF217_benign_count,ZNF703_benign_count
ACH-002277,-0.087760,0.032957,0.190852,-0.271057,0.194915,-0.012064,0.266465,-0.083992,0.042592,0.113135,...,0.270248,0.103604,-0.021781,0.160761,0.015747,-0.002556,0.234677,-0.070766,-0.124407,-0.184209
ACH-000046,-0.202440,0.228575,0.069315,0.095176,0.068469,-0.148772,-0.023000,0.280832,0.106937,-0.233486,...,0.041994,-0.113767,0.268398,0.173079,0.183591,-0.232520,-0.271787,-0.086695,0.016713,-0.224163
ACH-000674,-0.058647,0.015405,-0.004479,0.070857,-0.353355,0.029231,0.223145,0.241796,0.071190,0.397939,...,0.213629,0.251534,-0.083437,-0.077655,-0.004640,-0.048375,0.219733,-0.199291,-0.044677,0.005390
ACH-000557,-0.070440,-0.307093,0.047039,-0.129194,-0.069649,0.148327,0.112893,-0.201333,0.062023,0.265155,...,0.051457,0.337542,-0.300728,-0.305295,0.279003,0.141817,-0.175442,-0.096936,-0.175513,-0.167278
ACH-001642,-0.115449,-0.172094,0.130430,-0.138235,0.186060,-0.084510,0.301509,-0.214375,-0.287137,-0.195233,...,0.114214,0.011537,-0.168359,-0.132326,-0.156321,-0.087420,-0.227914,-0.064403,-0.039033,-0.012837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACH-002163,-0.007283,0.180049,-0.166796,0.027630,-0.109326,0.036422,-0.035981,-0.000312,0.121353,-0.144500,...,-0.212862,0.116564,-0.041704,0.189265,0.271412,-0.124896,0.136480,-0.165535,-0.374068,-0.115348
ACH-001616,0.132993,0.227633,0.209337,-0.187066,-0.053906,-0.060422,0.268335,0.022922,-0.153017,-0.045944,...,0.095875,0.408897,-0.222391,-0.104127,0.194359,-0.229589,-0.024475,0.102094,-0.051742,0.171267
ACH-001532,0.085234,-0.062251,-0.115823,0.220286,-0.047148,0.207629,0.018631,-0.064137,0.220210,-0.272371,...,-0.072619,-0.031038,-0.092965,-0.071206,-0.167836,-0.040338,0.044246,0.025731,0.019505,-0.065175
ACH-000993,-0.216540,-0.238687,0.096561,-0.162753,0.218944,0.076891,-0.018378,-0.141628,0.030709,0.076241,...,-0.265679,0.123608,-0.023183,-0.021311,0.259382,0.194676,0.146829,0.168473,-0.187869,0.333681


#### Load response files

In [8]:
cl_responses_df = pd.read_csv("/data/ajayago/copied_from_cdal1/yiming_data_folder/dataset/CellLine/patient_auc.csv")
cl_responses_df

Unnamed: 0,depmap_id,drug_name,auc
0,ACH-000001,JW-7-24-1,0.778432
1,ACH-000002,JW-7-24-1,0.788327
2,ACH-000004,JW-7-24-1,0.737630
3,ACH-000006,JW-7-24-1,0.176396
4,ACH-000007,JW-7-24-1,0.569751
...,...,...,...
226791,ACH-001702,SL 0101-1,0.958591
226792,ACH-001703,SL 0101-1,0.497201
226793,ACH-001711,SL 0101-1,0.815928
226794,ACH-001715,SL 0101-1,0.981441


In [9]:
patient_responses_df = pd.read_csv("/data/ajayago/copied_from_cdal1/ajayago_home_folder/processed/TCGA_drug_response_010222.csv")
patient_responses_df

Unnamed: 0,patient.arr,drug.name,response,response_cat,drug
0,TCGA-G2-A2EC,Methotrexate,Partial Response,1,METHOTREXATE
1,TCGA-G2-A2EC,Doxorubicin,Partial Response,1,DOXORUBICIN
2,TCGA-G2-A2EC,Vinblastine,Partial Response,1,VINBLASTINE
3,TCGA-G2-A2EC,Cisplatin,Partial Response,1,CISPLATIN
4,TCGA-G2-A2EJ,Paclitaxel,Stable Disease,0,PACLITAXEL
...,...,...,...,...,...
1244,TCGA-BG-A0VZ,Cisplatin,Complete Response,1,CISPLATIN
1245,TCGA-BG-A0VZ,Paclitaxel,Complete Response,1,PACLITAXEL
1246,TCGA-BG-A0VZ,Doxorubicin,Complete Response,1,DOXORUBICIN
1247,TCGA-BG-A0VT,Carboplatin,Complete Response,1,CARBOPLATIN


In [10]:
patient_responses_df["drug"].value_counts()

drug
CISPLATIN             206
5-FLUOROURACIL        125
CARBOPLATIN           117
PACLITAXEL            113
CYCLOPHOSPHAMIDE       69
                     ... 
E7389                   1
CARMUSTINE              1
TEMOZOLOMIDE            1
ERIBULIN                1
METHYLPREDNISOLONE      1
Name: count, Length: 69, dtype: int64

In [11]:
patient_responses_df[patient_responses_df["patient.arr"].isin(tcga_test_df.index)].drug.value_counts()

drug
CISPLATIN           44
5-FLUOROURACIL      29
CARBOPLATIN         25
GEMCITABINE         16
PACLITAXEL          16
DOXORUBICIN         14
LEUCOVORIN          14
CYCLOPHOSPHAMIDE    13
DOCETAXEL           10
OXALIPLATIN          8
ETOPOSIDE            7
CAPECITABINE         7
VINBLASTINE          6
METHOTREXATE         5
CETUXIMAB            5
DACARBAZINE          5
IRINOTECAN           3
TAMOXIFEN            3
EPIRUBICIN           3
VINORELBINE          3
GEFITINIB            2
DABRAFENIB           2
TRASTUZUMAB          2
LEUPROLIDE           1
PAMIDRONATE          1
IFOSFAMIDE           1
MITOMYCIN-C          1
PEMETREXED           1
REGORAFENIB          1
VEMURAFENIB          1
FOTEMUSTINE          1
LOMUSTINE            1
TRAMETINIB           1
AXITINIB             1
CARMUSTINE           1
Name: count, dtype: int64

In [12]:
patient_responses_df[patient_responses_df["patient.arr"].isin(tcga_train_df.index)].drug.value_counts()

drug
CISPLATIN            162
PACLITAXEL            97
5-FLUOROURACIL        96
CARBOPLATIN           92
CYCLOPHOSPHAMIDE      56
                    ... 
FOTEMUSTINE            1
RESIQUIMOD             1
TOREMIFENE             1
ERIBULIN               1
MYCOPHENOLIC ACID      1
Name: count, Length: 66, dtype: int64

In [13]:
train_tcga_with_response = patient_responses_df[patient_responses_df["patient.arr"].isin(tcga_train_df.index)].reset_index(drop=True)
train_tcga_with_response

Unnamed: 0,patient.arr,drug.name,response,response_cat,drug
0,TCGA-G2-A2EJ,Paclitaxel,Stable Disease,0,PACLITAXEL
1,TCGA-G2-A2EJ,Mycophenolic acid,Stable Disease,0,MYCOPHENOLIC ACID
2,TCGA-G2-A2EJ,Cisplatin,Stable Disease,0,CISPLATIN
3,TCGA-G2-A2EJ,Ifosfamide,Stable Disease,0,IFOSFAMIDE
4,TCGA-G2-A2EF,Methotrexate,Partial Response,1,METHOTREXATE
...,...,...,...,...,...
990,TCGA-BG-A0VZ,Cisplatin,Complete Response,1,CISPLATIN
991,TCGA-BG-A0VZ,Paclitaxel,Complete Response,1,PACLITAXEL
992,TCGA-BG-A0VZ,Doxorubicin,Complete Response,1,DOXORUBICIN
993,TCGA-BG-A0VT,Carboplatin,Complete Response,1,CARBOPLATIN


In [14]:
test_tcga_with_response = patient_responses_df[patient_responses_df["patient.arr"].isin(tcga_test_df.index)].reset_index(drop=True)
test_tcga_with_response

Unnamed: 0,patient.arr,drug.name,response,response_cat,drug
0,TCGA-G2-A2EC,Methotrexate,Partial Response,1,METHOTREXATE
1,TCGA-G2-A2EC,Doxorubicin,Partial Response,1,DOXORUBICIN
2,TCGA-G2-A2EC,Vinblastine,Partial Response,1,VINBLASTINE
3,TCGA-G2-A2EC,Cisplatin,Partial Response,1,CISPLATIN
4,TCGA-BT-A2LA,Carboplatin,Complete Response,1,CARBOPLATIN
...,...,...,...,...,...
249,TCGA-A5-A1OH,Carboplatin,Complete Response,1,CARBOPLATIN
250,TCGA-A5-A3LP,Carboplatin,Complete Response,1,CARBOPLATIN
251,TCGA-AJ-A3OK,Carboplatin,Complete Response,1,CARBOPLATIN
252,TCGA-AJ-A3OK,Doxorubicin,Complete Response,1,DOXORUBICIN


In [15]:
drug_fp = pd.read_csv("/data/ajayago/copied_from_cdal1/ajayago_home_folder/processed/drug_morgan_fingerprints.csv", index_col = 0)
drug_fp

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
JW-7-24-1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KIN001-260,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NSC-87877,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
GNE-317,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NAVITOCLAX,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LGH447,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRASTUZUMAB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
WNT974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRIFLURIDINE,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
set(train_tcga_with_response["drug"]) - set(drug_fp.index)

{'ALDESLEUKIN',
 'ANASTROZOLE',
 'BCG',
 'CAPECITABINE',
 'CARBOPLATIN',
 'CYCLOSPORINE',
 'DEXAMETHASONE',
 'DIDOX',
 'E7389',
 'ERIBULIN',
 'EXEMESTANE',
 'FOLFIRI',
 'FOLFOX',
 'FOTEMUSTINE',
 'GOSERELIN',
 'GP100',
 'HYDROCORTISONE',
 'IFOSFAMIDE',
 'LETROZOLE',
 'LEUCOVORIN',
 'LEUPROLIDE',
 'LOMUSTINE',
 'MELPHALAN',
 'METHYLPREDNISOLONE',
 'MYCOPHENOLIC ACID',
 'ONDANSETRON',
 'PAMIDRONATE',
 'PEGFILGRASTIM',
 'PNU-159548',
 'PX-866',
 'RESIQUIMOD',
 'TOREMIFENE',
 'VEMURAFENIB'}

In [17]:
set(test_tcga_with_response["drug"]) - set(drug_fp.index)

{'CAPECITABINE',
 'CARBOPLATIN',
 'FOTEMUSTINE',
 'IFOSFAMIDE',
 'LEUCOVORIN',
 'LEUPROLIDE',
 'LOMUSTINE',
 'PAMIDRONATE',
 'REGORAFENIB',
 'VEMURAFENIB'}

Plan:

* Baseline: Train a model on train patient data + drug fingerprints and predict the response for the (patient, drug) pair.
* Add all cell lines and assign pseudo labels using the patient model from baseline - concat both datasets and train another model.
* Weak supervision + subset selection on cell lines - use this with patient data to train another model

Evaluation: Classification metrics on the patient test dataset.

In [18]:
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score

In [19]:
# Held out TCGA dataset
test_features = []
test_y = []
for idx, row in test_tcga_with_response.iterrows():
    if row["drug"] in drug_fp.index:
        row_inp = []
        row_inp.extend(tcga_test_df.loc[row["patient.arr"]].values)
        row_inp.extend(drug_fp.loc[row["drug"]].values)
        test_y.append(row["response_cat"])
        test_features.append(row_inp)

In [20]:
test_features_tensor = torch.tensor(test_features)
test_y_tensor = torch.tensor(test_y)
print(test_features_tensor.shape, test_y_tensor.shape)

torch.Size([201, 9824]) torch.Size([201])


#### Baseline Model

In [21]:
train_features_all = []
train_y_all = []
for idx, row in train_tcga_with_response.iterrows():
    if row["drug"] in drug_fp.index:
        row_inp = []
        row_inp.extend(tcga_train_df.loc[row["patient.arr"]].values)
        row_inp.extend(drug_fp.loc[row["drug"]].values)
        train_y_all.append(row["response_cat"])
        train_features_all.append(row_inp)

In [22]:
val_features = train_features_all[int(0.8 * len(train_features_all)):]
val_y = train_y_all[int(0.8 * len(train_features_all)):]
train_features = train_features_all[:int(0.8 * len(train_features_all))]
train_y = train_y_all[:int(0.8 * len(train_features_all))]

In [23]:
len(train_features), len(test_features), len(val_features)

(608, 201, 153)

In [24]:
train_features_tensor = torch.tensor(train_features)
train_y_tensor = torch.tensor(train_y)
print(train_features_tensor.shape, train_y_tensor.shape)

torch.Size([608, 9824]) torch.Size([608])


In [25]:
val_features_tensor = torch.tensor(val_features)
val_y_tensor = torch.tensor(val_y)
print(val_features_tensor.shape, val_y_tensor.shape)

torch.Size([153, 9824]) torch.Size([153])


In [26]:
class BaseLineNN(nn.Module):
    def __init__(self, in_dim = 9824, hidden_size=64, out_dim = 1):
        super().__init__()
        self.linear1 = nn.Linear(in_dim, hidden_size)
        self.linear2 = nn.Linear(hidden_size, out_dim)

    def forward(self, x):
        x = self.linear1(x)
        x = nn.ReLU()(x)
        x = self.linear2(x)
        return x

In [27]:
nn_baseline = BaseLineNN()

In [28]:
criterion = nn.BCEWithLogitsLoss()
optim = torch.optim.Adam(nn_baseline.parameters(), lr = 1e-6)

In [29]:
train_y_tensor.dtype

torch.int64

In [30]:
# training 
for i in range(100):
    optim.zero_grad()
    y_pred = nn_baseline(train_features_tensor.to(dtype=torch.float32))
    loss = criterion(y_pred, train_y_tensor.view(-1, 1).to(dtype=torch.float32))
    loss.backward()
    optim.step()

    with torch.no_grad():
        val_loss = criterion(nn_baseline(val_features_tensor.to(dtype=torch.float32)), val_y_tensor.view(-1, 1).to(dtype=torch.float32))

    print(f"Epoch {i}: Training loss: {loss.detach().item()} |  Validation loss: {val_loss.detach().item()}")

Epoch 0: Training loss: 0.6907088756561279 |  Validation loss: 0.6894047856330872
Epoch 1: Training loss: 0.6906402707099915 |  Validation loss: 0.689350962638855
Epoch 2: Training loss: 0.6905719041824341 |  Validation loss: 0.6892969012260437
Epoch 3: Training loss: 0.6905034780502319 |  Validation loss: 0.6892428994178772
Epoch 4: Training loss: 0.6904351711273193 |  Validation loss: 0.6891888380050659
Epoch 5: Training loss: 0.6903668642044067 |  Validation loss: 0.6891348361968994
Epoch 6: Training loss: 0.6902985572814941 |  Validation loss: 0.6890810132026672
Epoch 7: Training loss: 0.6902304291725159 |  Validation loss: 0.6890270709991455
Epoch 8: Training loss: 0.6901623010635376 |  Validation loss: 0.6889733076095581
Epoch 9: Training loss: 0.6900942921638489 |  Validation loss: 0.6889195442199707
Epoch 10: Training loss: 0.6900264024734497 |  Validation loss: 0.6888658404350281
Epoch 11: Training loss: 0.689958393573761 |  Validation loss: 0.6888121962547302
Epoch 12: Traini

In [31]:
# inference
nn_baseline.eval()
y_test_pred = nn.Sigmoid()(nn_baseline(test_features_tensor.to(dtype=torch.float32)))
y_test_pred.shape

torch.Size([201, 1])

In [32]:
predictions = y_test_pred.detach().numpy().reshape(-1)
predictions.shape

(201,)

In [33]:
auroc = roc_auc_score(test_y, predictions)
auprc = average_precision_score(test_y, predictions)
print(f"AUROC: {auroc}, AUPRC: {auprc}")

AUROC: 0.5573959052219922, AUPRC: 0.7141680705382046


#### Model with all cell lines + pseudo labels and train patient data

In [34]:
cl_aug_train_features = []
cl_aug_train_y = []
for idx, row in cl_responses_df.iterrows():
    if row["drug_name"] in drug_fp.index:
        row_inp = []
        row_inp.extend(augmented_cl_df.loc[row["depmap_id"]].values)
        row_inp.extend(drug_fp.loc[row["drug_name"]].values)
        cl_aug_train_y.append(row["auc"])
        cl_aug_train_features.append(row_inp)

In [35]:
len(cl_aug_train_features)

226796

In [68]:
cl_aug_train_features_tensor = torch.tensor(cl_aug_train_features)
cl_aug_train_features_tensor.shape

torch.Size([226796, 9824])

In [69]:
# np.save("/data/ajayago/druid/intermediate/cs6220/augmented_cl_features_rf_joint_UDA.npy", np.array(cl_aug_train_features))

In [70]:
# Assign pseudo labels for cell lines using above NN Model
nn_baseline.eval()
cl_aug_train_y_pseudo = nn.Sigmoid()(nn_baseline(cl_aug_train_features_tensor.to(dtype=torch.float32))).detach()
cl_aug_train_y_pseudo.shape

torch.Size([226796, 1])

In [71]:
# pd.DataFrame(cl_aug_train_y_pseudo).to_csv("/data/ajayago/druid/intermediate/cs6220/augmented_cl_pseudolabels_rf_joint_UDA.csv", index=False, header=False)

In [72]:
# Use pseudo labeled cell lines and patient train data to train
X_train = torch.cat((train_features_tensor, cl_aug_train_features_tensor), dim=0)
X_train

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.1051, -0.1956,  0.0713,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1925, -0.3728,  0.2480,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0713,  0.0008, -0.0503,  ...,  0.0000,  0.0000,  0.0000]],
       dtype=torch.float64)

In [73]:
X_train.shape

torch.Size([227404, 9824])

In [74]:
y_train = torch.cat((train_y_tensor.view(-1, 1), cl_aug_train_y_pseudo), dim = 0)
y_train.shape

torch.Size([227404, 1])

In [81]:
nn_augmented_full = BaseLineNN()

In [82]:
criterion_aug = nn.BCEWithLogitsLoss()
optim_aug = torch.optim.Adam(nn_augmented_full.parameters(), lr = 1e-4)

In [83]:
# training 
for i in range(100):
    optim_aug.zero_grad()
    y_pred_aug = nn_augmented_full(X_train.to(dtype=torch.float32))
    loss_aug = criterion_aug(y_pred_aug, y_train.view(-1, 1).to(dtype=torch.float32))
    loss_aug.backward()
    optim_aug.step()

    with torch.no_grad():
        val_loss_aug = criterion_aug(nn_augmented_full(val_features_tensor.to(dtype=torch.float32)), val_y_tensor.view(-1, 1).to(dtype=torch.float32))

    print(f"Epoch {i}: Training loss: {loss_aug.detach().item()} |  Validation loss: {val_loss_aug.detach().item()}")

Epoch 0: Training loss: 0.6927016973495483 |  Validation loss: 0.6968560218811035
Epoch 1: Training loss: 0.6920368671417236 |  Validation loss: 0.6930705308914185
Epoch 2: Training loss: 0.691814661026001 |  Validation loss: 0.6897245645523071
Epoch 3: Training loss: 0.6918461918830872 |  Validation loss: 0.6870479583740234
Epoch 4: Training loss: 0.6919033527374268 |  Validation loss: 0.6850740313529968
Epoch 5: Training loss: 0.6918943524360657 |  Validation loss: 0.6836845278739929
Epoch 6: Training loss: 0.6918405890464783 |  Validation loss: 0.6827390789985657
Epoch 7: Training loss: 0.691782534122467 |  Validation loss: 0.6821140050888062
Epoch 8: Training loss: 0.6917439699172974 |  Validation loss: 0.6816973686218262
Epoch 9: Training loss: 0.6917290687561035 |  Validation loss: 0.681390106678009
Epoch 10: Training loss: 0.6917304396629333 |  Validation loss: 0.6811368465423584
Epoch 11: Training loss: 0.691737174987793 |  Validation loss: 0.6808949112892151
Epoch 12: Training

In [84]:
# inference
nn_augmented_full.eval()
y_test_pred = nn.Sigmoid()(nn_augmented_full(test_features_tensor.to(dtype=torch.float32)))
y_test_pred.shape

torch.Size([201, 1])

In [85]:
predictions = y_test_pred.detach().numpy().reshape(-1)
predictions.shape

(201,)

In [86]:
auroc = roc_auc_score(test_y, predictions)
auprc = average_precision_score(test_y, predictions)
print(f"AUROC: {auroc}, AUPRC: {auprc}")

AUROC: 0.6004715896020245, AUPRC: 0.7532748867625334


#### Augmented Cell Lines with AUDRC binarized

In [87]:
# convert existing labels for AUDRC into binary labels (1 being responder and 0 being non-responder)
# Divide based on median AUDRC per drug

In [28]:
cl_per_drug_median = cl_responses_df.set_index("depmap_id", drop=True).groupby(["drug_name"]).agg(np.median)

  cl_per_drug_median = cl_responses_df.set_index("depmap_id", drop=True).groupby(["drug_name"]).agg(np.median)


In [29]:
def categorise_audrc(x):
    median = cl_per_drug_median.loc[x["drug_name"]]["auc"]
    if x["auc"] < median:
        return 1
    else:
        return 0

In [30]:
cl_responses_df["binary_cat"] = cl_responses_df[["drug_name", "auc"]].apply(lambda x: categorise_audrc(x), axis=1)

In [31]:
cl_responses_df

Unnamed: 0,depmap_id,drug_name,auc,binary_cat
0,ACH-000001,JW-7-24-1,0.778432,0
1,ACH-000002,JW-7-24-1,0.788327,0
2,ACH-000004,JW-7-24-1,0.737630,0
3,ACH-000006,JW-7-24-1,0.176396,1
4,ACH-000007,JW-7-24-1,0.569751,1
...,...,...,...,...
226791,ACH-001702,SL 0101-1,0.958591,0
226792,ACH-001703,SL 0101-1,0.497201,1
226793,ACH-001711,SL 0101-1,0.815928,0
226794,ACH-001715,SL 0101-1,0.981441,0


In [32]:
cl_aug_train_features_labelled = []
cl_aug_train_y_labelled = []
for idx, row in cl_responses_df.iterrows():
    if row["drug_name"] in drug_fp.index:
        row_inp = []
        row_inp.extend(augmented_cl_df.loc[row["depmap_id"]].values)
        row_inp.extend(drug_fp.loc[row["drug_name"]].values)
        cl_aug_train_y_labelled.append(row["binary_cat"])
        cl_aug_train_features_labelled.append(row_inp)

In [33]:
cl_aug_train_features_labelled_tensor = torch.tensor(cl_aug_train_features_labelled)
cl_aug_train_y_labelled_tensor = torch.tensor(cl_aug_train_y_labelled)

In [34]:
# Use pseudo labeled cell lines and patient train data to train
X_train = torch.cat((train_features_tensor, cl_aug_train_features_labelled_tensor), dim = 0)
X_train

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.1051, -0.1956,  0.0713,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1925, -0.3728,  0.2480,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0713,  0.0008, -0.0503,  ...,  0.0000,  0.0000,  0.0000]],
       dtype=torch.float64)

In [35]:
X_train.shape

torch.Size([227404, 9824])

In [36]:
y_train = torch.cat((train_y_tensor, cl_aug_train_y_labelled_tensor), axis = 0)
y_train.shape

torch.Size([227404])

In [40]:
nn_augmented_full_labelled = BaseLineNN()

In [41]:
criterion_aug = nn.BCEWithLogitsLoss()
optim_aug = torch.optim.Adam(nn_augmented_full_labelled.parameters(), lr = 1e-4)

In [42]:
# training 
for i in range(100):
    optim_aug.zero_grad()
    y_pred_aug = nn_augmented_full_labelled(X_train.to(dtype=torch.float32))
    loss_aug = criterion_aug(y_pred_aug, y_train.view(-1, 1).to(dtype=torch.float32))
    loss_aug.backward()
    optim_aug.step()

    with torch.no_grad():
        val_loss_aug = criterion_aug(nn_augmented_full_labelled(val_features_tensor.to(dtype=torch.float32)), val_y_tensor.view(-1, 1).to(dtype=torch.float32))

    print(f"Epoch {i}: Training loss: {loss_aug.detach().item()} |  Validation loss: {val_loss_aug.detach().item()}")

Epoch 0: Training loss: 0.69393390417099 |  Validation loss: 0.6831134557723999
Epoch 1: Training loss: 0.6924335360527039 |  Validation loss: 0.6853835582733154
Epoch 2: Training loss: 0.6913213729858398 |  Validation loss: 0.6869404315948486
Epoch 3: Training loss: 0.6904622316360474 |  Validation loss: 0.6875239610671997
Epoch 4: Training loss: 0.689739465713501 |  Validation loss: 0.6873210072517395
Epoch 5: Training loss: 0.6890830993652344 |  Validation loss: 0.686587393283844
Epoch 6: Training loss: 0.6884605288505554 |  Validation loss: 0.6855635046958923
Epoch 7: Training loss: 0.6878677606582642 |  Validation loss: 0.6844591498374939
Epoch 8: Training loss: 0.6873098611831665 |  Validation loss: 0.6834331750869751
Epoch 9: Training loss: 0.6867886185646057 |  Validation loss: 0.6825655698776245
Epoch 10: Training loss: 0.6862990856170654 |  Validation loss: 0.6818968057632446
Epoch 11: Training loss: 0.685827374458313 |  Validation loss: 0.681475043296814
Epoch 12: Training l

In [43]:
# inference
nn_augmented_full_labelled.eval()
y_test_pred = nn.Sigmoid()(nn_augmented_full_labelled(test_features_tensor.to(dtype=torch.float32)))
y_test_pred.shape

torch.Size([201, 1])

In [44]:
predictions = y_test_pred.detach().numpy().reshape(-1)
predictions.shape

(201,)

In [45]:
auroc = roc_auc_score(test_y, predictions)
auprc = average_precision_score(test_y, predictions)
print(f"AUROC: {auroc}, AUPRC: {auprc}")

AUROC: 0.6295146077754773, AUPRC: 0.7756366349482967


#### Subset Selection over Weakly labelled cell lines - Memory issues!!!

In [19]:
import sys
sys.path.append("../src/")
from cutstat import get_cutstat_subset, get_cutstat_inds

In [20]:
import torch

In [46]:
cl_aug_train_features = np.load("/data/ajayago/druid/intermediate/cs6220/augmented_cl_features_rf_joint_UDA.npy")
cl_aug_train_features.shape

(226796, 9824)

In [53]:
cl_aug_train_y_pseudo_df = pd.read_csv("/data/ajayago/druid/intermediate/cs6220/augmented_cl_pseudolabels_rf_joint_UDA.csv", header=None)
cl_aug_train_y_pseudo = cl_aug_train_y_pseudo_df[0].values
cl_aug_train_y_pseudo.shape

(226796,)

In [28]:
device = torch.device("cuda:0")

In [29]:
features = torch.FloatTensor(cl_aug_train_features).to(device)
labels = torch.LongTensor(cl_aug_train_y_pseudo).to(device)

In [30]:
subset_inds = get_cutstat_inds(features, labels, coverage=0.8, K=20, device=device)

OutOfMemoryError: CUDA out of memory. Tried to allocate 191.62 GiB. GPU 0 has a total capacity of 44.32 GiB of which 18.42 GiB is free. Process 3376797 has 490.00 MiB memory in use. Including non-PyTorch memory, this process has 25.41 GiB memory in use. Of the allocated memory 24.91 GiB is allocated by PyTorch, and 18.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)