## Locating optimal model hyperparameters

This notebook does not require GPUs (except for the optional full re-run in Step 5, Option 1). However, the Google Drive mount in Step 1 and the default file paths are set up for execution on Google Colab. Change the mount step and the paths appropriately if you run the notebook elsewhere (e.g. in a local Jupyter Notebook).

In [2]:
import os 
from os.path import exists as file_exists

**Step 1.** Mount Google Drive so that the "runs/" directory is accessible

In [3]:
# Mount Google Drive at /content/drive so the project folders under MyDrive are reachable.
# This cell only mounts the drive; it does not change the working directory.
# force_remount=True asks Colab to mount again even if a mount is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


**Step 2.** Choose drug/tissue/dataset combination for which you wish to find the optimal hyperparameters 

In [5]:
%cd /content/drive/MyDrive/MBP1413H/tcrp_model/data/output/runs
drug = "GSK429286A"
tissue="liver"
dataset="GDSC"
run_name = dataset + '_' + tissue

/content/drive/MyDrive/MBP1413H/tcrp_model/data/output/runs


**Step 3.** Find the hyperparameter combination with the best zero-shot correlation

In [6]:
# Folder containing one sub-directory per hyperparameter combination.
pred_dir = f"{run_name}/predictions/{drug}/{tissue}/"

# file_list, dir_list and zero_cors are built together so that they stay
# index-aligned: position k in each list refers to the same run directory.
# Directories without a (non-empty) log.txt are left out of all three lists;
# otherwise the index of the best correlation would point at the wrong
# directory name.
file_list = []  # stripped lines of each log.txt
dir_list = []   # run directory names that produced a usable log.txt
for run_dir in os.listdir(pred_dir):
    log_path = f"{pred_dir}{run_dir}/log.txt"
    if not file_exists(log_path):
        continue
    # Context manager guarantees the handle is closed after reading.
    with open(log_path, "r") as log_fh:
        log_lines = [line.strip() for line in log_fh]
    if not log_lines:
        # An empty log has no last line to parse; skip it instead of crashing.
        continue
    file_list.append(log_lines)
    dir_list.append(run_dir)

# The zero correlation is the value after the last comma on the last log line.
zero_cors = [float(log_lines[-1].split(",")[-1]) for log_lines in file_list]

In [7]:
# Position of the highest zero correlation; the same position in dir_list
# is reported as the winning directory.
best_zero_idx = zero_cors.index(max(zero_cors))
print(f"Best zero corr is {dir_list[best_zero_idx]}")

Best zero corr is 0.1_0.01_2_12


**Step 4.** Find the run with the best few-shot correlation

In [8]:
# Every entry found in the MAML run-log folder for this tissue/drug pair.
maml_log_dir = f"/content/drive/MyDrive/MBP1413H/tcrp_model/data/results/MAML-run-logs/{tissue}/{drug}/"
files = os.listdir(maml_log_dir)

In [11]:
# Folder holding the MAML run logs for this tissue/drug pair.
log_dir = f"/content/drive/MyDrive/MBP1413H/tcrp_model/data/results/MAML-run-logs/{tissue}/{drug}/"

# mean_list must stay index-aligned with `files`: the winning log is looked up
# as files[mean_list.index(max(mean_list))]. Entries that are not readable log
# files (e.g. the '.ipynb_checkpoints' directory) therefore get a -inf
# placeholder instead of being skipped, so they keep their slot but can never
# be selected as the maximum.
mean_list = []
for log_entry in files:
    log_path = log_dir + log_entry
    if log_entry == '.ipynb_checkpoints' or not os.path.isfile(log_path):
        mean_list.append(float("-inf"))
        continue
    # Context manager guarantees the handle is closed after reading.
    with open(log_path, "r") as log_fh:
        log_lines = log_fh.read().splitlines()
    try:
        # The mean correlation is the last space-separated token of the last line.
        corr_mean = float(log_lines[-1].split(' ')[-1])
    except IndexError:
        # Empty log: no last line, so score the run as 0.
        corr_mean = 0
    mean_list.append(corr_mean)

In [14]:
# Highest mean correlation and the log file sitting at the same position.
best_mean = max(mean_list)
log_name = files[mean_list.index(best_mean)]
print(f"Max is in {log_name}:", best_mean)

Max is in liver_GSK429286A_0.1_0.01_1_12.log: 0.7065883785486221


**Step 5.** Replace the default TCRP performance output with the optimal TCRP performance output

In [10]:
# Log names look like "<tissue>_<drug>_<meta_lr>_<inner_lr>_<layer>_<tissue_num>.log".
# Strip the "<tissue>_<drug>_" prefix and the ".log" suffix to get the
# hyperparameter string, which is also used as the prediction folder name.
log_prefix = tissue+'_'+drug+'_'
log_stem = log_name.removeprefix(log_prefix)
log_folder = log_stem.split('.log')[0]

# Underscore-separated fields, in the order meta_lr, inner_lr, layer, tissue_num.
log_arr = log_folder.split('_')
meta_lr = float(log_arr[0])
inner_lr = float(log_arr[1])
layer = int(log_arr[2])
tissue_num = int(log_arr[3].split('.')[0])

*Step 5 - Option 1:* You can re-run the entire TCRP run to update the output performance metrics. This may take up to an hour and will require GPUs.

In [None]:
# Re-run TCRP with the optimal hyperparameters found above.
# meta_lr, inner_lr, layer and tissue_num come from the parsed log name;
# dataset, tissue, drug and run_name come from the Step 2 cell.
# K, num_trials and meta_batch_size are fixed at 10, 20 and 10 here.

# Change to the pipelines directory before invoking the module
# (presumably so that `model.MAML_DRUG` can be resolved -- confirm against the repo layout).
%cd /content/drive/MyDrive/MBP1413H/tcrp_model/pipelines
!python -m model.MAML_DRUG --dataset {dataset} --tissue {tissue} --drug {drug} --K 10 --num_trials 20 --tissue_num {tissue_num} --meta_batch_size 10 --meta_lr {meta_lr} --inner_lr {inner_lr} --layer {layer} --run_name {run_name}

*Step 5 - Option 2:* Alternatively, you can simply locate the "log.txt" file for the optimal TCRP run, extract the performance metrics, and use them to replace the stored output performance metrics.

In [12]:
%cd /content/drive/MyDrive/MBP1413H/tcrp_model/data/output/runs/
log_file = f"{run_name}/predictions/{drug}/{tissue}/{log_folder}/log.txt"

with open(log_file, "r") as f:
  lines = f.read().split()
  corr_best = lines[3:13]
  corr_zero = lines[13]


for i in range(len(corr_best)):
  if(i == 0):
    corr_best[i] = corr_best[i].split('[')[-1]
  if(i == len(corr_best)-1):
    corr_best[i] = corr_best[i].split(']')[0]

corr_best = [float(x) for x in corr_best] 
corr_zero = float(corr_zero.split(',')[-1])

results = {}
results["TCRP-zero"] = corr_zero
results["TCRP-fewshot"] = corr_best


/content/drive/MyDrive/MBP1413H/tcrp_model/data/output/runs


In [13]:
import numpy as np

# Folder that holds the TCRP performance archive for this dataset/drug/tissue.
base_line_outpath = f"/content/drive/MyDrive/MBP1413H/tcrp_model/data/results/{dataset}/TCRP_performances/{drug}/{tissue}/"

# np.savez does not create missing parent directories, so make sure the target
# folder exists (no-op when it is already there).
os.makedirs(base_line_outpath, exist_ok=True)

# np.savez appends ".npz" to the name, producing "TCRP_performance.npz".
# Each key of `results` becomes one named array in the archive.
np.savez(base_line_outpath + "TCRP_performance", **results)

**Step 6 (optional).** Verify that results were updated.

In [16]:
# Reload the archive that was just written and print both stored entries.
perf_path = f"{base_line_outpath}/TCRP_performance.npz"
arr = np.load(perf_path)
for metric_key in ("TCRP-zero", "TCRP-fewshot"):
    print(arr[metric_key])

array([0.39901213, 0.39933084, 0.39929508, 0.39926415, 0.39929679,
       0.39931253, 0.3993043 , 0.39929309, 0.39930657, 0.39928611])