In [1]:
# import libraries

import os
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import mygene
import sklearn.preprocessing as sk
import seaborn as sns
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from torch.utils.data.sampler import WeightedRandomSampler
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score, precision_recall_curve
import random
from random import randint
from sklearn.model_selection import StratifiedKFold
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [2]:
# define directories
#
# Every location is derived from one base directory. It defaults to the original
# external-drive path and can be overridden with the THESIS_WORK_DIR environment
# variable, so the notebook can run on another machine without editing this cell.
# Each *_dir value keeps its trailing "/" because later cells build file paths by
# plain string concatenation (e.g. models_dir + "<file>.pt").

_base_dir = os.environ.get("THESIS_WORK_DIR", "/Volumes/Expansion/Thesis Work").rstrip("/")
_results_dir = _base_dir + "/Results/"
_transfer_dir = _results_dir + "Transfer Learning/Drugs with same pathways/TCGA_DNA_REPi/Expression_CNA/"

cell_line_dir = _base_dir + "/Supplementary Files/GDSC/"
models_dir = _transfer_dir + "Models/"
DEGs_dir = _results_dir + "GDSC_DEGs_inhibitors/DNA_REPi/"
dataset_dir = _results_dir + "preprocessed_results2/"
save_results_to = _transfer_dir + "Predictions/"

In [3]:
# set random seeds
#
# Seed every RNG the imported libraries can draw from: PyTorch, Python's
# `random`, and NumPy's global generator (previously left unseeded).

torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

In [4]:
# change directory to read GDSC cell line details
# (the next cell opens the workbook by bare file name, so it relies on this
# change of working directory)

os.chdir(cell_line_dir)
print(os.getcwd())

/Volumes/Expansion/Thesis Work/Supplementary Files/GDSC


In [5]:
# load the GDSC cell line annotation sheet and index it by COSMIC identifier
# (keep_default_na=False: blank cells are shown as empty strings in the preview,
# and the trailing "TOTAL:" summary row of the sheet is still part of the table)

GDSC_cell_line_details = pd.read_excel(
    "GDSC_Cell_Lines_Details.xlsx", keep_default_na=False
).set_index("COSMIC identifier")
GDSC_cell_line_details.tail()

Unnamed: 0_level_0,Sample Name,Whole Exome Sequencing (WES),Copy Number Alterations (CNA),Gene Expression,Methylation,Drug\nResponse,GDSC\nTissue descriptor 1,GDSC\nTissue\ndescriptor 2,Cancer Type\n(matching TCGA label),Microsatellite \ninstability Status (MSI),Screen Medium,Growth Properties
COSMIC identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1298218.0,KP-2,Y,N,Y,N,Y,pancreas,pancreas,PAAD,,D/F12,Adherent
1330932.0,KO52,Y,Y,N,N,N,leukemia,acute_myeloid_leukaemia,LAML,MSS/MSI-L,D/F12,
1331030.0,SC-1,Y,Y,N,N,N,lymphoma,B_cell_lymphoma,DLBC,MSS/MSI-L,R,
1503373.0,U-CH2,Y,Y,N,N,N,bone,bone_other,,MSS/MSI-L,D/F12,Adherent
,TOTAL:,1001,996,968,957,990,,,,,,


In [6]:
# change directory to read GDSC expression dataset (DNA_REPi)
# (dataset_dir already ends with "/", so the joined path contains "//";
# the printed working directory shows it resolves to the same folder)

os.chdir(dataset_dir + "/exprs/")
print(os.getcwd())

/Volumes/Expansion/Thesis Work/Results/preprocessed_results2/exprs


In [7]:
# read GDSC expression dataset (DNA_REPi) and flip it so that rows are cell lines
# and columns are Entrez gene IDs
# NOTE(review): the preview prints full-precision values with a "." decimal point
# although the file is parsed with decimal=","; confirm the columns are numeric.

GDSCE = pd.read_csv(
    "GDSC_exprs.z.DNA_REPi.tsv", sep="\t", index_col=0, decimal=","
).T
GDSCE.head(3)

ENTREZID,1,2,9,10,12,13,14,15,16,18,...,107984199,107984208,107984325,107984411,107984648,107984889,107984989,107986313,107986782,107986810
683665,-0.4711563767951986,-0.1857522957766964,0.9702409548662462,-0.4081281805960821,-0.4161511169999803,-0.4384159723467665,-1.154692246005183,0.5262132484107975,-0.7950924576033422,-0.6337442716864214,...,1.283534690590172,-0.3367265464586691,-0.0039762988723447,0.7378729872604209,0.9757614264575688,0.1494417647095801,2.062294063911948,-0.8140636761719959,0.2870333828833719,0.9524265085537148
684055,1.3565140948247052,-0.277542411913307,0.0887087882196536,-0.2754829982302413,-0.5153856346587746,-0.6248712076433911,0.1243408990866293,-1.2699667684674136,2.078836088638272,2.51979919974592,...,-0.5571028630154293,-1.724090733574364,-0.3749080365068901,-0.0016055431935905,-0.3788535663547365,-1.3502092798315848,0.6949790577550328,0.4333032198982747,-0.1280986270308099,-1.629249958712223
684057,0.6510004960254864,1.6578876382433665,-0.4977106390881842,-0.1212172092933644,-0.398455281697677,-0.2804901106672752,0.6455898116420014,0.215833091774836,-0.1723595788875009,-0.2170870770552223,...,-0.024863133395904,0.4073439017665206,-0.4440356638831038,0.8630347078663075,1.4486171428489227,-1.4777214830771732,0.5220400480464272,0.6441211958947753,0.9057518606131394,0.3018783753479829


In [8]:
# change directory to read GDSC CNA dataset (DNA_REPi)
# (the following read uses a bare file name and therefore depends on this cwd)

os.chdir(dataset_dir + "/CNA/")
print(os.getcwd())

/Volumes/Expansion/Thesis Work/Results/preprocessed_results2/CNA


In [9]:
# read GDSC CNA dataset (DNA_REPi)
#
# The table is transposed so that rows are cell lines and columns are gene IDs;
# when a gene ID occurs more than once only its first column is kept.
# A former `GDSCC.drop_duplicates(keep='last')` call was removed: its result was
# never assigned, so it had no effect on the data.

GDSCC = pd.read_csv("GDSC_CNA.DNA_REPi.tsv",
                    sep="\t", index_col=0, decimal=".")
GDSCC = GDSCC.T
GDSCC = GDSCC.loc[:, ~GDSCC.columns.duplicated()]

GDSCC.head()

gene_id,1,2,9,10,12,13,14,15,16,18,...,107985535,107985759,107986588,107986809,107986898,107987337,107987341,109731405,112441434,121676927
683665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.807355,0.0,0.0,0.0
684055,0.0,0.584963,0.584963,0.584963,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.584963,0.0,0.584963,0.584963,-4.32,-4.32,0.0,0.0,0.584963
684057,-0.415037,0.0,0.584963,0.584963,-1.0,0.0,0.0,0.0,-0.415037,-0.415037,...,-0.415037,0.0,0.0,0.0,0.807355,-1.0,-1.0,-0.415037,-0.415037,0.807355
684059,0.0,0.0,0.0,0.0,0.0,0.415037,0.0,0.0,0.0,0.415037,...,0.0,0.0,0.415037,0.0,0.415037,-4.32,-4.32,0.0,0.0,0.736966
684062,-0.415037,0.0,0.321928,0.321928,-0.415037,-0.415037,0.0,0.0,-0.415037,0.0,...,-0.415037,0.0,0.0,0.0,0.321928,-4.32,-4.32,-0.415037,-1.0,0.321928


In [10]:
# change directory to read GDSC response dataset (DNA_REPi)
# (the following read uses a bare file name and therefore depends on this cwd)

os.chdir(dataset_dir + "/response/")
print(os.getcwd())

/Volumes/Expansion/Thesis Work/Results/preprocessed_results2/response


In [11]:
# read GDSC response dataset (DNA_REPi)
# - rows with any missing value are dropped
# - sample identifiers in the index are turned into strings
# - the response label is encoded as R (resistant) -> 0, S (sensitive) -> 1

d = {"R": 0, "S": 1}
GDSCR = (
    pd.read_csv("GDSC_response.DNA_REPi.tsv", sep="\t", index_col=0, decimal=",")
    .dropna()
    .rename(index=str)
)
GDSCR["response"] = GDSCR["response"].map(lambda label: d[label])

GDSCR.head()

Unnamed: 0_level_0,response,logIC50,drug,exprs,CNA,mutations
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
683665,0,5.12075503641523,Pyrimethamine,1,1,1
683665,0,-3.03711635474651,Doxorubicin,1,1,1
683665,0,-0.712119156515401,Etoposide,1,1,1
683665,0,-4.40897202584936,Gemcitabine,1,1,1
683665,0,-1.59752443691324,Mitomycin C,1,1,1


In [12]:
# number of response records per drug (DNA_REPi); the response table holds one
# row per cell line / drug pair

GDSCR["drug"].value_counts()

drug
Etoposide         775
SN-38             771
Mitomycin C       767
Doxorubicin       766
5-Fluorouracil    764
Gemcitabine       761
Bleomycin         759
Temozolomide      752
Cisplatin         738
Methotrexate      738
Camptothecin      737
Cytarabine        735
Pyrimethamine     372
Name: count, dtype: int64

In [13]:
# change directory to read TCGA expression dataset homogenized with GDSC expression dataset (5-Fluorouracil)
# (the following read uses a bare file name and therefore depends on this cwd)

os.chdir(dataset_dir + "/exprs_homogenized/")
print(os.getcwd())

/Volumes/Expansion/Thesis Work/Results/preprocessed_results2/exprs_homogenized


In [14]:
# read TCGA expression dataset homogenized with GDSC expression dataset (5-Fluorouracil)
# and flip it so that rows are TCGA samples and columns are Entrez gene IDs

TCGAEfluorouracil = pd.read_csv(
    "TCGA_exprs.Fluorouracil.eb_with.GDSC_exprs.5-Fluorouracil.tsv",
    sep="\t",
    index_col=0,
    decimal=",",
).T
TCGAEfluorouracil.head(3)

ENTREZID,1,2,9,10,12,13,14,15,16,18,...,100188893,100190940,100190949,100191040,100192386,100268168,100271715,100287718,100288778,100289635
TCGA-3A-A9IC,5.72964960061116,4.41876362025309,4.86023990366878,2.47277876318537,4.81396331677419,2.6727941627861,8.72268461889477,2.5342747028707,8.30356599972612,2.66680810277045,...,9.61327758421809,2.69169602849574,2.74156299885291,3.14358416474115,3.63806017988108,2.83936659460498,2.84468471240513,2.9097524321588,4.74904572228446,4.63830114019886
TCGA-3A-A9IX,5.16820185705619,5.0656506669595,5.05015666335564,2.49748103264488,6.24486748738657,3.89184369807008,8.07246325142995,2.65471072107481,8.69196063245573,3.97461007293206,...,9.28039145658571,2.68523221938676,2.70770230320119,3.1831168236566,3.17601340812919,2.66266337139924,3.05740520413346,2.9097524321588,4.66674620630048,4.95684240309785
TCGA-AG-3593,4.09489404413517,3.56187701617794,5.74195136365074,3.52505514866919,0.688407410705727,2.04131612004455,8.50563761093224,2.83121226059543,8.98277420895832,4.7862922506412,...,9.44820752386781,3.3077275660044,3.04450667890368,3.53272687811095,2.90679846697165,2.45941479590968,2.65824917267036,2.9097524321588,5.80963066021836,4.86261821976041


In [15]:
# change directory to read TCGA CNA dataset (5-Fluorouracil)
# (same CNA folder as the GDSC CNA file; the following read depends on this cwd)

os.chdir(dataset_dir + "CNA")
os.getcwd()

'/Volumes/Expansion/Thesis Work/Results/preprocessed_results2/CNA'

In [16]:
# read TCGA CNA dataset (5-Fluorouracil)
#
# The table is transposed so that rows are TCGA samples and columns are gene IDs;
# when a gene ID occurs more than once only its first column is kept.
# A former `TCGACfluorouracil.drop_duplicates(keep='last')` call was removed: its
# result was never assigned, so it had no effect on the data.
# NOTE(review): the GDSC CNA file is parsed with decimal="." while this one uses
# decimal=","; confirm which separator the file really uses.

TCGACfluorouracil = pd.read_csv("TCGA_CNA.Fluorouracil.tsv",
                        sep="\t", index_col=0, decimal=",")
TCGACfluorouracil = TCGACfluorouracil.T
TCGACfluorouracil = TCGACfluorouracil.loc[:, ~TCGACfluorouracil.columns.duplicated()]

TCGACfluorouracil.head(3)

Unnamed: 0,9,10,11,24,27,29,32,34,35,36,...,728851,100130391,100290877,100310869,100420569,100421288,100422928,100506036,100506076,100506123
TCGA-3A-A9IC,-0.2644,-0.2644,-0.2644,0.0,0.3214,-0.2518,-0.2497,0.0,-0.2497,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-3A-A9IX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-AG-3593,-0.7299,-0.7299,-0.7299,-0.3147,0.4133,-0.7013,-0.294,-0.3147,-0.3051,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# variance threshold for GDSC Expression dataset (DNA_REPi)
#
# Keep only genes whose variance across cell lines exceeds 0.05.
# Only the fitted support mask is needed, so the selector is fitted without
# building the (previously discarded) transformed matrix, and the surviving
# columns are picked by position so the selection cannot be affected by
# repeated column labels.

selector = VarianceThreshold(0.05)
selector.fit(GDSCE)
GDSCE = GDSCE.iloc[:, selector.get_support(indices=True)]

In [18]:
# fill NA values and binarize GDSC CNA dataset (DNA_REPi):
# missing -> 0, any non-zero copy-number value -> 1
# NOTE(review): no matching binarization of TCGACfluorouracil is visible in this
# part of the notebook; confirm the TCGA CNA matrix is meant to stay continuous.

GDSCC = GDSCC.fillna(0)
GDSCC = GDSCC.mask(GDSCC != 0.0, 1)

In [19]:
# genes present in all four matrices (GDSC/TCGA x expression/CNA)

ls = (
    GDSCE.columns
    .intersection(GDSCC.columns)
    .intersection(TCGAEfluorouracil.columns)
    .intersection(TCGACfluorouracil.columns)
)
ls = pd.unique(ls)

print(f"GDSC and TCGA # of common genes: {ls.shape[0]}")

GDSC and TCGA # of common genes: 16389


In [20]:
# cell lines present in the GDSC expression, CNA and response datasets (DNA_REPi)

ls2 = GDSCE.index.intersection(GDSCC.index).intersection(GDSCR.index)

print(f"GDSC # of common cell lines: {ls2.shape[0]}")

GDSC # of common cell lines: 801


In [21]:
# restrict the GDSC expression and CNA matrices to the shared cell lines and genes,
# and the response table to the shared cell lines (it keeps one row per drug)

GDSCE, GDSCC = GDSCE.loc[ls2, ls], GDSCC.loc[ls2, ls]
GDSCR = GDSCR.loc[ls2, :]

print(f"GDSC # of common samples and genes (Expression): {GDSCE.shape}")
print(f"GDSC # of common samples and genes (CNA): {GDSCC.shape}")
print(f"GDSC # of samples (Response): {GDSCR.shape[0]}")

GDSC # of common samples and genes (Expression): (801, 16389)
GDSC # of common samples and genes (CNA): (801, 16389)
GDSC # of samples (Response): 9435


In [22]:
# select shared samples between TCGA expression and CNA datasets (5-Fluorouracil) 
# (ls3 is used by the next cell to subset both TCGA matrices to the same rows)

ls3 = TCGAEfluorouracil.index.intersection(TCGACfluorouracil.index)

print(f"TCGA # of common samples (5-Fluorouracil): {ls3.shape[0]}")

TCGA # of common samples (5-Fluorouracil): 33


In [23]:
# restrict both TCGA matrices (5-Fluorouracil) to the shared samples (ls3)
# and to the genes shared with GDSC (ls)

TCGAEfluorouracil, TCGACfluorouracil = (
    TCGAEfluorouracil.loc[ls3, ls],
    TCGACfluorouracil.loc[ls3, ls],
)

print(f"TCGA # of common samples and genes for Expression (5-Fluorouracil): {TCGAEfluorouracil.shape}")
print(f"TCGA # of common samples and genes for CNA (5-Fluorouracil): {TCGACfluorouracil.shape}\n")

TCGA # of common samples and genes for Expression (5-Fluorouracil): (33, 16389)
TCGA # of common samples and genes for CNA (5-Fluorouracil): (33, 16389)



In [24]:
# change directory to read DEGs (DNA_REPi)
# (the following read uses a bare file name and therefore depends on this cwd)

os.chdir(DEGs_dir)
print(os.getcwd())

/Volumes/Expansion/Thesis Work/Results/GDSC_DEGs_inhibitors/DNA_REPi


In [25]:
# read the list of differentially expressed genes shared by the
# DNA replication inhibitors ("Common DEGs" sheet)

DEGs_filtered_data = pd.read_excel(
    "DNA_REPi_Differentially_Expressed_Genes (EnsemblID).xlsx",
    sheet_name="Common DEGs",
)
print(f"There are {len(DEGs_filtered_data)} DEGs (p-adjusted < 0.05 and |logFC| > 1) among DNA replication inhibitors\n")

DEGs_filtered_data.head()

There are 858 DEGs (p-adjusted < 0.05 and |logFC| > 1) among DNA replication inhibitors



Unnamed: 0,Gene Symbol,Frequency
0,SLFN11,9
1,NCKAP1L,8
2,SDC4,7
3,KRT18,7
4,S100A14,7


In [26]:
# map the DEG gene symbols to Entrez gene IDs through the mygene service
# (symbols without an Entrez hit are dropped, duplicate IDs are collapsed;
# the last line shows how many distinct IDs remain)
# NOTE(review): the mygene documentation names this keyword `fields`; confirm that
# `field` is honoured and does not simply fall back to the default field set.

mg = mygene.MyGeneInfo()
DEGs_entrez_id = mg.querymany(DEGs_filtered_data["Gene Symbol"], 
                              species="human", 
                              scopes = "symbol",
                              field = "entrezgene",
                              as_dataframe=True)["entrezgene"]
DEGs_entrez_id = pd.unique(DEGs_entrez_id.dropna())
DEGs_entrez_id.shape[0]

1 input query terms found dup hits:	[('CLECL1P', 2)]


858

In [27]:
# give the GDSC datasets shorter working names
# (these are aliases, not copies: in-place changes made through `responses`
# are also visible through `GDSCR`)

exprs_z, cna, responses = GDSCE, GDSCC, GDSCR

In [28]:
# preview of the GDSC response dataset (one row per cell line / drug pair)

responses.head(3)

Unnamed: 0,response,logIC50,drug,exprs,CNA,mutations
683665,0,5.12075503641523,Pyrimethamine,1,1,1
683665,0,-3.03711635474651,Doxorubicin,1,1,1
683665,0,-0.712119156515401,Etoposide,1,1,1


In [29]:
# distinct drug names in the GDSC response dataset

drugs = set(responses["drug"])
drugs

{'5-Fluorouracil',
 'Bleomycin',
 'Camptothecin',
 'Cisplatin',
 'Cytarabine',
 'Doxorubicin',
 'Etoposide',
 'Gemcitabine',
 'Methotrexate',
 'Mitomycin C',
 'Pyrimethamine',
 'SN-38',
 'Temozolomide'}

In [30]:
# convert the cell line identifiers in the response index to integers
# (these are sample IDs, not Entrez gene IDs; the cast lets the next cell join
# the responses with the cell line details on their index)

responses.index = responses.index.astype(int)
responses.index

Index([ 683665,  683665,  683665,  683665,  683665,  683665,  683665,  683665,
        683665,  683665,
       ...
       1659818, 1659818, 1659818, 1659818, 1659819, 1659819, 1659819, 1659819,
       1659819,  687800],
      dtype='int64', length=9435)

In [31]:
# join the GDSC cell line details with the response records on their index
# and count the records per tissue

dat = GDSC_cell_line_details.merge(
    responses,
    left_index=True,
    right_index=True,
    how="inner",
)

dat["GDSC\nTissue descriptor 1"].value_counts()

GDSC\nTissue descriptor 1
lung_NSCLC           1037
urogenital_system     926
leukemia              829
lymphoma              769
aero_dig_tract        727
nervous_system        607
lung_SCLC             569
skin                  560
breast                524
digestive_system      466
large_intestine       447
bone                  421
neuroblastoma         332
pancreas              294
kidney                227
soft_tissue           191
myeloma               182
lung                  174
thyroid               153
Name: count, dtype: int64

In [32]:
# list count of cell lines as to tissues treated with 5-Fluorouracil
# (the row mask gets a descriptive name instead of `filter`, which would
# shadow Python's builtin of the same name)

is_fluorouracil = dat["drug"] == "5-Fluorouracil"
dat.loc[is_fluorouracil, "GDSC\nTissue descriptor 1"].value_counts()

GDSC\nTissue descriptor 1
lung_NSCLC           86
urogenital_system    78
leukemia             63
aero_dig_tract       59
lymphoma             57
nervous_system       50
skin                 47
lung_SCLC            46
breast               44
large_intestine      38
digestive_system     37
bone                 34
neuroblastoma        28
pancreas             25
kidney               18
lung                 15
soft_tissue          14
myeloma              13
thyroid              12
Name: count, dtype: int64

In [33]:
# convert the cell line identifiers in the response index back to strings
# (these are sample IDs, not Entrez gene IDs; the next cell uses them as row
# labels into the expression and CNA matrices)

responses.index = responses.index.astype(str)
responses.index

Index(['683665', '683665', '683665', '683665', '683665', '683665', '683665',
       '683665', '683665', '683665',
       ...
       '1659818', '1659818', '1659818', '1659818', '1659819', '1659819',
       '1659819', '1659819', '1659819', '687800'],
      dtype='object', length=9435)

In [34]:
# filter GDSC expression and CNA datasets (DNA_REPi) as to drugs
# subset selected DEGs
#
# Builds one feature row per response record: for every drug, the expression and
# CNA rows of the cell lines screened with that drug are collected, so a cell
# line appears once for each drug it has a response for.

expression_zscores = []
CNA = []
for drug in drugs:
    # index labels (cell line IDs) of the response rows belonging to this drug
    samples = responses.loc[responses["drug"] == drug, :].index.values
    e_z = exprs_z.loc[samples, :]
    c = cna.loc[samples, :]
    expression_zscores.append(e_z)
    CNA.append(c)

GDSCEv2 = pd.concat(expression_zscores, axis=0)
GDSCCv2 = pd.concat(CNA, axis=0)
GDSCRv2 = responses

# ls4: DEGs that are also columns of the expression matrix. The column order comes
# from a Python set and defines the feature order of every matrix built below.
# NOTE(review): presumably this order must match the one used when the saved
# models were trained - confirm before changing how ls4 is built.
ls4 = list(set(GDSCE.columns).intersection(set(DEGs_entrez_id.astype(int))))
# ls5: cell line labels shared by the expanded expression and CNA tables
ls5 = GDSCEv2.index.intersection(GDSCCv2.index)

# the same label list is applied to all three tables
GDSCEv2 = GDSCEv2.loc[ls5, ls4]
GDSCCv2 = GDSCCv2.loc[ls5, ls4]
GDSCRv2 = GDSCRv2.loc[ls5, :]

TCGAEfluorouracil = TCGAEfluorouracil.loc[:,ls4]
TCGACfluorouracil = TCGACfluorouracil.loc[:,ls4]

# Rewrites the index of `responses` to "<cell line>_<drug>". GDSCRv2 was taken
# with .loc before this line and keeps the plain cell line labels.
# NOTE(review): after this assignment the loop at the top of the cell can no
# longer look up the plain cell line IDs, so the cell cannot simply be re-run.
responses.index = responses.index.values + "_" + responses["drug"].values

print(f"GDSC # of common samples and genes (Expression): {GDSCEv2.shape}")
print(f"GDSC # of common samples and genes (CNA): {GDSCCv2.shape}")
print(f"GDSC # of common samples (Response): {GDSCRv2.shape[0]}\n")

print(f"TCGA # of common samples and genes for 5-Fluorouracil (Expression): {TCGAEfluorouracil.shape}")
print(f"TCGA # of common samples and genes for 5-Fluorouracil (CNA): {TCGACfluorouracil.shape}\n")

GDSC # of common samples and genes (Expression): (9435, 772)
GDSC # of common samples and genes (CNA): (9435, 772)
GDSC # of common samples (Response): 9435

TCGA # of common samples and genes for 5-Fluorouracil (Expression): (33, 772)
TCGA # of common samples and genes for 5-Fluorouracil (CNA): (33, 772)



In [35]:
# binary GDSC response labels (0 = resistant, 1 = sensitive), one per row of GDSCRv2

Y = GDSCRv2["response"].to_numpy()
print(Y)

[0 0 0 ... 0 0 1]


In [36]:
# change directory to read TCGA response dataset (5-Fluorouracil)
# (the following read uses a bare file name and therefore depends on this cwd)

os.chdir(dataset_dir + "response")
os.getcwd()

'/Volumes/Expansion/Thesis Work/Results/preprocessed_results2/response'

In [37]:
# read TCGA response dataset (5-Fluorouracil)
# - rows with any missing value are dropped
# - sample identifiers in the index are turned into strings
# - the response label is encoded as R (resistant) -> 0, S (sensitive) -> 1

d = {"R": 0, "S": 1}
TCGARfluorouracil = (
    pd.read_csv("TCGA_response.Fluorouracil.tsv", sep="\t", index_col=0, decimal=",")
    .dropna()
    .rename(index=str)
)
TCGARfluorouracil["response"] = TCGARfluorouracil["response"].map(lambda label: d[label])
Ytsfluorouracil = TCGARfluorouracil["response"].to_numpy()

print(f"There are {Ytsfluorouracil.shape[0]} samples with response data in the TCGA (5-Fluorouracil)\n")
Ytsfluorouracil

There are 33 samples with response data in the TCGA (5-Fluorouracil)



array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1])

In [38]:
# cohort label of every TCGA sample in the response dataset (5-Fluorouracil),
# followed by the number of samples per cohort

dat_tcga = TCGARfluorouracil["cohort"]

dat_tcga.value_counts()

cohort
STAD    18
READ     7
PAAD     6
ESCA     2
Name: count, dtype: int64

In [39]:
# filter STAD cohort samples (5-Fluorouracil)
# (the row mask gets a descriptive name instead of `filter`, which would
# shadow Python's builtin of the same name)

is_stad = TCGARfluorouracil["cohort"] == "STAD"
tcga_stad_samples = dat_tcga.loc[is_stad].index
tcga_stad_samples

Index(['TCGA-CG-4444', 'TCGA-D7-8579', 'TCGA-D7-A748', 'TCGA-F1-A448',
       'TCGA-FP-7829', 'TCGA-FP-8209', 'TCGA-HU-8243', 'TCGA-KB-A6F7',
       'TCGA-VQ-A8DT', 'TCGA-VQ-A8DU', 'TCGA-VQ-A8DZ', 'TCGA-VQ-A8E2',
       'TCGA-VQ-A8P3', 'TCGA-VQ-A91Q', 'TCGA-VQ-AA68', 'TCGA-VQ-AA6B',
       'TCGA-VQ-AA6F', 'TCGA-VQ-AA6G'],
      dtype='object')

In [40]:
# response labels of the STAD cohort samples (5-Fluorouracil),
# in the order of tcga_stad_samples

Ytsfluorouracil_stad = TCGARfluorouracil.loc[tcga_stad_samples,"response"].values
Ytsfluorouracil_stad 

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0])

In [41]:
# number of sensitive (1) and resistant (0) STAD samples (5-Fluorouracil)

TCGARfluorouracil.loc[tcga_stad_samples,"response"].value_counts()

response
1    13
0     5
Name: count, dtype: int64

In [42]:
# filter PAAD cohort samples (5-Fluorouracil)
# (the row mask gets a descriptive name instead of `filter`, which would
# shadow Python's builtin of the same name)

is_paad = TCGARfluorouracil["cohort"] == "PAAD"
tcga_paad_samples = dat_tcga.loc[is_paad].index
tcga_paad_samples

Index(['TCGA-3A-A9IC', 'TCGA-3A-A9IX', 'TCGA-HZ-7920', 'TCGA-HZ-A49I',
       'TCGA-IB-7888', 'TCGA-OE-A75W'],
      dtype='object')

In [43]:
# response labels of the PAAD cohort samples (5-Fluorouracil),
# in the order of tcga_paad_samples

Ytsfluorouracil_paad  = TCGARfluorouracil.loc[tcga_paad_samples,"response"].values
Ytsfluorouracil_paad 

array([0, 1, 1, 0, 0, 0])

In [44]:
# number of sensitive (1) and resistant (0) PAAD samples (5-Fluorouracil)

TCGARfluorouracil.loc[tcga_paad_samples,"response"].value_counts()

response
0    4
1    2
Name: count, dtype: int64

In [45]:
# define maximum iteration
# (the prediction cell loads one set of fine-tuned model files per iteration,
# numbered 1..max_iter)

max_iter = 50

In [46]:
# assign number of neurons and dropout rates found for layers from the saved models
# (in the prediction cell these values size the AEE / AEC / Classifier class
# definitions; the trained networks themselves are loaded from .pt files)

hdm1 = 64    # width of the expression encoder output
hdm2 = 32    # width of the CNA encoder output
rate1 = 0.5  # dropout rate of the expression encoder
rate2 = 0.6  # dropout rate of the CNA encoder
rate3 = 0.6  # dropout rate of the classifier

In [54]:
# load pre-trained models and make predictions
#
# Part 1 - build the model inputs.
# Expression is standardised with statistics fitted on GDSC only and the same
# scaler is applied to TCGA. CNA values are cast to float32 and NaNs are
# replaced by 0. Every array/tensor is built exactly once.

# The full-cohort TCGA matrices are used in row order, while the labels come from
# the response table; fail loudly if the two are not in the same sample order.
assert list(TCGAEfluorouracil.index) == list(TCGARfluorouracil.index), \
    "TCGA expression/CNA rows are not in the same order as the TCGA response labels"

scalerGDSC = sk.StandardScaler()
scalerGDSC.fit(GDSCEv2.values)
X_trainE = scalerGDSC.transform(GDSCEv2.values)

X_testEfluorouracil = scalerGDSC.transform(TCGAEfluorouracil.values)
X_testEfluorouracil_stad = scalerGDSC.transform(TCGAEfluorouracil.loc[tcga_stad_samples,].values)
X_testEfluorouracil_paad = scalerGDSC.transform(TCGAEfluorouracil.loc[tcga_paad_samples,].values)

# Cast to float32 *before* nan_to_num: np.nan_to_num returns non-floating arrays
# (e.g. object dtype) unchanged, so NaNs would otherwise survive until the tensor
# conversion below.
X_trainC = np.nan_to_num(GDSCCv2.values)
X_testCfluorouracil = np.nan_to_num(TCGACfluorouracil.values.astype(np.float32))
X_testCfluorouracil_stad = np.nan_to_num(TCGACfluorouracil.loc[tcga_stad_samples,].values.astype(np.float32))
X_testCfluorouracil_paad = np.nan_to_num(TCGACfluorouracil.loc[tcga_paad_samples,].values.astype(np.float32))

# name kept as in the original cell
ty_testEfluorouracilorouracil = torch.FloatTensor(Ytsfluorouracil.astype(int))

TX_testEfluorouracil = torch.FloatTensor(X_testEfluorouracil)
TX_testEfluorouracil_stad = torch.FloatTensor(X_testEfluorouracil_stad)
TX_testEfluorouracil_paad = torch.FloatTensor(X_testEfluorouracil_paad)

TX_testCfluorouracil = torch.FloatTensor(X_testCfluorouracil.astype(np.float32))
TX_testCfluorouracil_stad = torch.FloatTensor(X_testCfluorouracil_stad.astype(np.float32))
TX_testCfluorouracil_paad = torch.FloatTensor(X_testCfluorouracil_paad.astype(np.float32))

# input widths of the two encoders (number of selected genes)
n_sampE, IE_dim = X_trainE.shape
n_sampC, IC_dim = X_trainC.shape

# hidden sizes; the classifier consumes the concatenated encoder outputs
h_dim1 = hdm1
h_dim2 = hdm2
Z_in = h_dim1 + h_dim2 

class AEE(nn.Module):
    """Expression encoder: Linear -> BatchNorm1d -> ReLU -> Dropout.

    The input width, output width and dropout rate are read from the
    notebook-level names IE_dim, h_dim1 and rate1 when an instance is created.
    """

    def __init__(self):
        super().__init__()
        encoder_layers = [
            nn.Linear(IE_dim, h_dim1),
            nn.BatchNorm1d(h_dim1),
            nn.ReLU(),
            nn.Dropout(rate1),
        ]
        # attribute name kept as EnE (presumably the saved modules rely on it)
        self.EnE = nn.Sequential(*encoder_layers)

    def forward(self, x):
        """Return the encoded representation of x."""
        return self.EnE(x)


class AEC(nn.Module):
    """CNA encoder: Linear -> BatchNorm1d -> ReLU -> Dropout.

    The input width, output width and dropout rate are read from the
    notebook-level names IC_dim, h_dim2 and rate2 when an instance is created.
    """

    def __init__(self):
        super().__init__()
        encoder_layers = [
            nn.Linear(IC_dim, h_dim2),
            nn.BatchNorm1d(h_dim2),
            nn.ReLU(),
            nn.Dropout(rate2),
        ]
        # attribute name kept as EnC (presumably the saved modules rely on it)
        self.EnC = nn.Sequential(*encoder_layers)

    def forward(self, x):
        """Return the encoded representation of x."""
        return self.EnC(x)

class Classifier(nn.Module):
    """Response classifier head: Linear(Z_in -> 1) -> Dropout -> Sigmoid.

    The input width and dropout rate are read from the notebook-level names
    Z_in and rate3 when an instance is created.
    """

    def __init__(self):
        super().__init__()
        head_layers = (
            nn.Linear(Z_in, 1),
            nn.Dropout(rate3),
            nn.Sigmoid(),
        )
        # attribute name kept as FC (presumably the saved modules rely on it)
        self.FC = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return one value in (0, 1) per input row."""
        probabilities = self.FC(x)
        return probabilities

torch.cuda.manual_seed_all(42)

AutoencoderE = torch.load(models_dir + "Exprs_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA.pt")
AutoencoderC = torch.load(models_dir + "CNA_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA.pt")

Clas = torch.load(models_dir + "Class_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA.pt")

AutoencoderE.eval()
AutoencoderC.eval()
Clas.eval()

ZEX = AutoencoderE(torch.FloatTensor(X_trainE))
ZCX = AutoencoderC(torch.FloatTensor(X_trainC))
ZTX = torch.cat((ZEX, ZCX), 1)
ZTX = F.normalize(ZTX, p=2, dim=0)
PredX = Clas(ZTX)
AUCT = roc_auc_score(Y, PredX.detach().numpy())
#print(f"GDSC - Training AUC (TCGA_DNA_REPi): {round(AUCT,2)}")
AUCTaucpr = average_precision_score(Y, PredX.detach().numpy())
#print(f"GDSC - Training AUCPR (TCGA_DNA_REPi): {round(AUCTaucpr,2)}\n")

ZETfluorouracil = AutoencoderE(TX_testEfluorouracil)
ZCTfluorouracil = AutoencoderC(TX_testCfluorouracil)
ZTTfluorouracil = torch.cat((ZETfluorouracil, ZCTfluorouracil), 1)
ZTTfluorouracil = F.normalize(ZTTfluorouracil, p=2, dim=0)
PredTfluorouracil = Clas(ZTTfluorouracil)
AUCTfluorouracil = roc_auc_score(Ytsfluorouracil.astype(int), PredTfluorouracil.detach().numpy())
#print(f"TCGA - Test AUC (5-Fluorouracil): {round(AUCTfluorouracil,2)}")
AUCTfluorouracil_aucpr = average_precision_score(Ytsfluorouracil.astype(int), PredTfluorouracil.detach().numpy())
#print(f"TCGA - Test AUCPR (5-Fluorouracil): {round(AUCTfluorouracil_aucpr,2)}\n")

ZETfluorouracil_stad = AutoencoderE(TX_testEfluorouracil_stad )
ZCTfluorouracil_stad = AutoencoderC(TX_testCfluorouracil_stad )
ZTTfluorouracil_stad = torch.cat((ZETfluorouracil_stad , ZCTfluorouracil_stad ), 1)
ZTTfluorouracil_stad = F.normalize(ZTTfluorouracil_stad , p=2, dim=0)
PredTfluorouracil_stad = Clas(ZTTfluorouracil_stad )
AUCTfluorouracil_stad = roc_auc_score(Ytsfluorouracil_stad .astype(int), PredTfluorouracil_stad .detach().numpy())
#print(f"TCGA - Test AUC (5-Fluorouracil, STAD): {round(AUCTfluorouracil_stad ,2)}")
AUCTfluorouracil_stad_aucpr = average_precision_score(Ytsfluorouracil_stad .astype(int), PredTfluorouracil_stad .detach().numpy())
#print(f"TCGA - Test AUCPR (5-Fluorouracil, STAD): {round(AUCTfluorouracil_stad_aucpr,2)}\n")

# --- TCGA 5-Fluorouracil, PAAD subset: score the non-finetuned models ---
# Encode both omics views, fuse the embeddings along dim 1, L2-normalise along dim 0,
# then classify.
ZETfluorouracil_paad = AutoencoderE(TX_testEfluorouracil_paad)
ZCTfluorouracil_paad = AutoencoderC(TX_testCfluorouracil_paad)
ZTTfluorouracil_paad = F.normalize(
    torch.cat((ZETfluorouracil_paad, ZCTfluorouracil_paad), 1),
    p=2,
    dim=0,
)
PredTfluorouracil_paad = Clas(ZTTfluorouracil_paad)

# ROC-AUC first, then average precision (AUCPR), on the same labels and scores.
AUCTfluorouracil_paad, AUCTfluorouracil_paad_aucpr = (
    scorer(Ytsfluorouracil_paad.astype(int), PredTfluorouracil_paad.detach().numpy())
    for scorer in (roc_auc_score, average_precision_score)
)

# Evaluate every fine-tuned checkpoint (numbered 1..max_iter) on the GDSC training data
# and on the three TCGA 5-Fluorouracil cohorts (all samples, STAD only, PAAD only).
# Each dict below maps checkpoint number -> metric value.
AUCT_finetuned = {}
AUCTaucpr_finetuned = {}
AUCTfluorouracil_finetuned = {}
AUCTfluorouracil_stad_finetuned = {}
AUCTfluorouracil_paad_finetuned = {}
AUCTfluorouracil_aucpr_finetuned = {}
AUCTfluorouracil_stad_aucpr_finetuned = {}
AUCTfluorouracil_paad_aucpr_finetuned = {}

# Checkpoint files are numbered from 1, so iterate 1..max_iter directly instead of
# bumping the loop variable inside the body (which also shadowed the builtin `iter`).
for run_id in range(1, max_iter + 1):

    # NOTE(review): these files hold whole pickled module objects (they are called and
    # .eval()'d below); torch.load unpickles them, so only load trusted checkpoints.
    AutoencoderE_finetuned = torch.load(models_dir + f"Finetuned_Models/Exprs_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{run_id}.pt")
    AutoencoderC_finetuned = torch.load(models_dir + f"Finetuned_Models/CNA_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{run_id}.pt")
    Clas_finetuned = torch.load(models_dir + f"Finetuned_Models/Class_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{run_id}.pt")

    # switch the loaded modules to evaluation mode before scoring
    AutoencoderE_finetuned.eval()
    AutoencoderC_finetuned.eval()
    Clas_finetuned.eval()

    # Evaluation only: skip autograd bookkeeping for all forward passes.
    # NOTE(review): F.normalize(..., dim=0) normalises across dim 0; if rows are samples,
    # scores depend on which samples are scored together -- confirm this mirrors training.
    with torch.no_grad():
        # GDSC training data
        ZEX = AutoencoderE_finetuned(torch.FloatTensor(X_trainE))
        ZCX = AutoencoderC_finetuned(torch.FloatTensor(X_trainC))
        ZTX = torch.cat((ZEX, ZCX), 1)
        ZTX = F.normalize(ZTX, p=2, dim=0)
        PredX = Clas_finetuned(ZTX)
        AUCT_finetuned[run_id] = roc_auc_score(Y, PredX.detach().numpy())
        AUCTaucpr_finetuned[run_id] = average_precision_score(Y, PredX.detach().numpy())

        # TCGA 5-Fluorouracil, all samples
        ZETfluorouracil = AutoencoderE_finetuned(TX_testEfluorouracil)
        ZCTfluorouracil = AutoencoderC_finetuned(TX_testCfluorouracil)
        ZTTfluorouracil = torch.cat((ZETfluorouracil, ZCTfluorouracil), 1)
        ZTTfluorouracil = F.normalize(ZTTfluorouracil, p=2, dim=0)
        PredTfluorouracil = Clas_finetuned(ZTTfluorouracil)
        AUCTfluorouracil_finetuned[run_id] = roc_auc_score(Ytsfluorouracil.astype(int), PredTfluorouracil.detach().numpy())
        AUCTfluorouracil_aucpr_finetuned[run_id] = average_precision_score(Ytsfluorouracil.astype(int), PredTfluorouracil.detach().numpy())

        # TCGA 5-Fluorouracil, STAD subset
        ZETfluorouracil_stad = AutoencoderE_finetuned(TX_testEfluorouracil_stad)
        ZCTfluorouracil_stad = AutoencoderC_finetuned(TX_testCfluorouracil_stad)
        ZTTfluorouracil_stad = torch.cat((ZETfluorouracil_stad, ZCTfluorouracil_stad), 1)
        ZTTfluorouracil_stad = F.normalize(ZTTfluorouracil_stad, p=2, dim=0)
        PredTfluorouracil_stad = Clas_finetuned(ZTTfluorouracil_stad)
        AUCTfluorouracil_stad_finetuned[run_id] = roc_auc_score(Ytsfluorouracil_stad.astype(int), PredTfluorouracil_stad.detach().numpy())
        AUCTfluorouracil_stad_aucpr_finetuned[run_id] = average_precision_score(Ytsfluorouracil_stad.astype(int), PredTfluorouracil_stad.detach().numpy())

        # TCGA 5-Fluorouracil, PAAD subset
        ZETfluorouracil_paad = AutoencoderE_finetuned(TX_testEfluorouracil_paad)
        ZCTfluorouracil_paad = AutoencoderC_finetuned(TX_testCfluorouracil_paad)
        ZTTfluorouracil_paad = torch.cat((ZETfluorouracil_paad, ZCTfluorouracil_paad), 1)
        ZTTfluorouracil_paad = F.normalize(ZTTfluorouracil_paad, p=2, dim=0)
        PredTfluorouracil_paad = Clas_finetuned(ZTTfluorouracil_paad)
        AUCTfluorouracil_paad_finetuned[run_id] = roc_auc_score(Ytsfluorouracil_paad.astype(int), PredTfluorouracil_paad.detach().numpy())
        AUCTfluorouracil_paad_aucpr_finetuned[run_id] = average_precision_score(Ytsfluorouracil_paad.astype(int), PredTfluorouracil_paad.detach().numpy())

# For each cohort, keep the checkpoint number whose test AUCPR is highest
# (on ties the earliest-inserted checkpoint wins).
# NOTE(review): the checkpoint is chosen using the AUCPR of the same TCGA cohort that is
# then reported as "Test", so the fine-tuned figures may be optimistic -- confirm intended.
max_key = max(AUCTfluorouracil_aucpr_finetuned.items(), key=lambda entry: entry[1])[0]
max_key_stad = max(AUCTfluorouracil_stad_aucpr_finetuned.items(), key=lambda entry: entry[1])[0]
max_key_paad = max(AUCTfluorouracil_paad_aucpr_finetuned.items(), key=lambda entry: entry[1])[0]

# ---- metrics of the non-finetuned models ----
print(
    f"GDSC - Training AUC (DNA_REPi): {round(AUCT,2)}",
    f"GDSC - Training AUCPR (DNA_REPi): {round(AUCTaucpr,2)}",
    f"TCGA - Test AUC (Fluorouracil): {round(AUCTfluorouracil,2)}",
    f"TCGA - Test AUCPR (Fluorouracil): {round(AUCTfluorouracil_aucpr,2)}",
    f"TCGA - Test AUC (Fluorouracil, STAD): {round(AUCTfluorouracil_stad,2)}",
    f"TCGA - Test AUCPR (Fluorouracil, STAD): {round(AUCTfluorouracil_stad_aucpr,2)}",
    f"TCGA - Test AUC (Fluorouracil, PAAD): {round(AUCTfluorouracil_paad,2)}",
    f"TCGA - Test AUCPR (Fluorouracil, PAAD): {round(AUCTfluorouracil_paad_aucpr,2)}\n",
    sep="\n",
)

# ---- checkpoint with the best AUCPR on the full Fluorouracil cohort ----
print(
    f"GDSC - Training AUC (DNA_REPi, Finetuned): {round(AUCT_finetuned[max_key],2)}",
    f"GDSC - Training AUCPR (DNA_REPi, Finetuned): {round(AUCTaucpr_finetuned[max_key],2)}",
    f"TCGA - Test AUC (Fluorouracil, Finetuned): {round(AUCTfluorouracil_finetuned[max_key],2)}",
    f"TCGA - Test AUCPR (Fluorouracil, Finetuned): {round(AUCTfluorouracil_aucpr_finetuned[max_key],2)}\n",
    sep="\n",
)

# ---- checkpoint with the best AUCPR on the STAD subset ----
print(
    f"GDSC - Training AUC (DNA_REPi, Finetuned): {round(AUCT_finetuned[max_key_stad],2)}",
    f"GDSC - Training AUCPR (DNA_REPi, Finetuned): {round(AUCTaucpr_finetuned[max_key_stad],2)}",
    f"TCGA - Test AUC (Fluorouracil, STAD, Finetuned): {round(AUCTfluorouracil_stad_finetuned[max_key_stad],2)}",
    f"TCGA - Test AUCPR (Fluorouracil, STAD, Finetuned): {round(AUCTfluorouracil_stad_aucpr_finetuned[max_key_stad],2)}\n\n",
    sep="\n",
)

# ---- checkpoint with the best AUCPR on the PAAD subset ----
print(
    f"GDSC - Training AUC (DNA_REPi, Finetuned): {round(AUCT_finetuned[max_key_paad],2)}",
    f"GDSC - Training AUCPR (DNA_REPi, Finetuned): {round(AUCTaucpr_finetuned[max_key_paad],2)}",
    f"TCGA - Test AUC (Fluorouracil, PAAD, Finetuned): {round(AUCTfluorouracil_paad_finetuned[max_key_paad],2)}",
    f"TCGA - Test AUCPR (Fluorouracil, PAAD, Finetuned): {round(AUCTfluorouracil_paad_aucpr_finetuned[max_key_paad],2)}\n\n",
    sep="\n",
)

# ---- file names of the selected checkpoints, one group per cohort ----
print(
    "Models for maximum finetuning (Fluorouracil)",
    f"Exprs_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{max_key}.pt",
    f"CNA_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{max_key}.pt",
    f"Class_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{max_key}.pt",
    sep="\n",
)

print(
    "\nModels for maximum finetuning (Fluorouracil, STAD)",
    f"Exprs_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{max_key_stad}.pt",
    f"CNA_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{max_key_stad}.pt",
    f"Class_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{max_key_stad}.pt",
    sep="\n",
)

print(
    "\nModels for maximum finetuning (Fluorouracil, PAAD)",
    f"Exprs_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{max_key_paad}.pt",
    f"CNA_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{max_key_paad}.pt",
    f"Class_Fluorouracil_GDSC_TCGA_DNA_REPi_Expression_CNA_Finetuned_{max_key_paad}.pt",
    sep="\n",
)

GDSC - Training AUC (DNA_REPi): 0.75
GDSC - Training AUCPR (DNA_REPi): 0.27
TCGA - Test AUC (Fluorouracil): 0.53
TCGA - Test AUCPR (Fluorouracil): 0.77
TCGA - Test AUC (Fluorouracil, STAD): 0.46
TCGA - Test AUCPR (Fluorouracil, STAD): 0.76
TCGA - Test AUC (Fluorouracil, PAAD): 0.38
TCGA - Test AUCPR (Fluorouracil, PAAD): 0.37

GDSC - Training AUC (DNA_REPi, Finetuned): 0.76
GDSC - Training AUCPR (DNA_REPi, Finetuned): 0.27
TCGA - Test AUC (Fluorouracil, Finetuned): 0.52
TCGA - Test AUCPR (Fluorouracil, Finetuned): 0.77

GDSC - Training AUC (DNA_REPi, Finetuned): 0.76
GDSC - Training AUCPR (DNA_REPi, Finetuned): 0.27
TCGA - Test AUC (Fluorouracil, STAD, Finetuned): 0.43
TCGA - Test AUCPR (Fluorouracil, STAD, Finetuned): 0.79


GDSC - Training AUC (DNA_REPi, Finetuned): 0.76
GDSC - Training AUCPR (DNA_REPi, Finetuned): 0.27
TCGA - Test AUC (Fluorouracil, PAAD, Finetuned): 0.38
TCGA - Test AUCPR (Fluorouracil, PAAD, Finetuned): 0.37


Models for maximum finetuning (Fluorouracil)
Exprs_Flu

In [48]:
# create a dataframe with the prediction results
#
# Row 0 holds the non-finetuned metrics, row 1 the fine-tuned ones.
# Note that the fine-tuned row mixes checkpoints: the GDSC and overall TCGA columns use
# `max_key`, the STAD columns use `max_key_stad` and the PAAD columns use `max_key_paad`.

ec = {"Data": ["Expr + CNA", "Expr + CNA (Fine-tuned)"],
       "AUC (GDSC)": [round(AUCT, 2), round(AUCT_finetuned[max_key], 2)],
       "AUCPR (GDSC)": [round(AUCTaucpr,2), round(AUCTaucpr_finetuned[max_key],2)],
       "AUC (TCGA)": [round(AUCTfluorouracil,2), round(AUCTfluorouracil_finetuned[max_key],2)],
       "AUCPR (TCGA)":  [round(AUCTfluorouracil_aucpr,2), round(AUCTfluorouracil_aucpr_finetuned[max_key],2)],
       "AUC (TCGA-STAD)":  [round(AUCTfluorouracil_stad,2), round(AUCTfluorouracil_stad_finetuned[max_key_stad],2)],
       "AUCPR (TCGA-STAD)": [round(AUCTfluorouracil_stad_aucpr,2), round(AUCTfluorouracil_stad_aucpr_finetuned[max_key_stad],2)],
       "AUC (TCGA-PAAD)":  [round(AUCTfluorouracil_paad,2), round(AUCTfluorouracil_paad_finetuned[max_key_paad],2)],
       # key previously lacked its closing parenthesis, producing a malformed column header
       "AUCPR (TCGA-PAAD)": [round(AUCTfluorouracil_paad_aucpr,2), round(AUCTfluorouracil_paad_aucpr_finetuned[max_key_paad],2)],
       "Sample Size (GDSC)": [GDSCEv2.shape[0], GDSCEv2.shape[0]],
       "Feature Size (GDSC)": [GDSCEv2.shape[1], GDSCEv2.shape[1]]
}
# one row per dict key, then transpose so each metric becomes a column
ec_dataframe = pd.DataFrame.from_dict(ec, orient='index').transpose()

# persist the summary table as a tab-separated file without the row index
ec_dataframe.to_csv(save_results_to + "GDSC_TCGA_Expression_CNA_DNA_REPi_5-Fluorouracil.tsv",
                    sep = "\t",
                    index = False)

ec_dataframe

Unnamed: 0,Data,AUC (GDSC),AUCPR (GDSC),AUC (TCGA),AUCPR (TCGA),AUC (TCGA-STAD),AUCPR (TCGA-STAD),AUC (TCGA-PAAD),AUCPR (TCGA-PAAD,Sample Size (GDSC),Feature Size (GDSC)
0,Expr + CNA,0.75,0.27,0.53,0.77,0.46,0.76,0.38,0.37,9435,772
1,Expr + CNA (Fine-tuned),0.76,0.27,0.52,0.77,0.43,0.79,0.38,0.37,9435,772


In [49]:
# show the expression autoencoder module: its layers and their settings (not the learned weights)

AutoencoderE

AEE(
  (EnE): Sequential(
    (0): Linear(in_features=772, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
  )
)

In [50]:
# show the CNA autoencoder module: its layers and their settings (not the learned weights)

AutoencoderC

AEC(
  (EnC): Sequential(
    (0): Linear(in_features=772, out_features=32, bias=True)
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.6, inplace=False)
  )
)

In [51]:
# show the classifier module: its layers and their settings (not the learned weights)

Clas

Classifier(
  (FC): Sequential(
    (0): Linear(in_features=96, out_features=1, bias=True)
    (1): Dropout(p=0.6, inplace=False)
    (2): Sigmoid()
  )
)

In [52]:
# change the working directory to the predictions output folder (save_results_to),
# so that relative paths used afterwards resolve there, and print it to confirm

os.chdir(save_results_to)
print(os.getcwd())

/Volumes/Expansion/Thesis Work/Results/Transfer Learning/Drugs with same pathways/TCGA_DNA_REPi/Expression_CNA/Predictions


In [53]:
# save predictions
#
# Write the non-finetuned GDSC training and TCGA 5-Fluorouracil test metrics to a text
# report. The path is built from save_results_to (as for the TSV summary) rather than
# relying on the current working directory, and the `with` block guarantees the file is
# closed even if one of the writes raises.

with open(save_results_to + 'GDSC_TCGA_DEGs - TCGA_DNA_REPi_5-Fluorouracil Predictions (Expression and CNA).txt', 'w') as file:
    file.write(f"GDSC Training (EC) AUC (TCGA_DNA_REPi): {round(AUCT,2)}\n")
    file.write(f"GDSC Training (EC) AUCPR (TCGA_DNA_REPi): {round(AUCTaucpr,2)}\n\n")

    file.write(f"TCGA Test (EC) AUC (5-Fluorouracil): {round(AUCTfluorouracil,2)}\n")
    file.write(f"TCGA Test (EC) AUCPR (5-Fluorouracil): {round(AUCTfluorouracil_aucpr,2)}\n\n")