In [None]:
### Notebook for Domain Adaptation in QSAR
# Based on data from the FLuID notebook
# TODO: find a way to pull the datasets from that notebook and use them here


import torch
import os
# Pick the best backend that is actually available. Forcing "mps"
# unconditionally makes every later tensor allocation fail on machines
# without Apple-silicon support.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
# Switch `device` to MPS when it is usable; otherwise explain why it is not.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.backends.mps.is_built():
    print("MPS not available because the current MacOS version is not 12.3+ "
          "and/or you do not have an MPS-enabled device on this machine.")
else:
    print("MPS not available because the current PyTorch install was not "
          "built with MPS enabled.")

# manual override, kept disabled:
# device = "cpu"

In [None]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw, AllChem
# Toggle the SVG flag on RDKit's IPython integration
# (presumably switches molecule rendering to SVG - confirm against RDKit docs).
IPythonConsole.ipython_useSVG=True 
# Smoke test: parse one SMILES string; as the last expression of the cell the
# resulting molecule object is what the notebook displays.
Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C")


In [None]:
# Center output
# NOTE: only the CSS string is defined here; the line that would inject it
# into the page (the HTML(...) call below) is commented out, so CSS is
# currently unused.
#from IPython.display import display, HTML
CSS = """
.output {
    align-items: center;
}
"""
#HTML('<style>{}</style>'.format(CSS))

# Enables large output display
#from IPython.core.display import display, HTML
#display(HTML("<style>div.output_scroll { height: 44em; }</style>"))

#from google.colab import data_table
#data_table.enable_dataframe_formatter()

from IPython.display import display, HTML

def show(obj):
  """Display `obj` in the notebook using its unescaped ``to_html`` markup."""
  markup = obj.to_html(escape=False)
  display(HTML(markup))


In [None]:
# First we need to load in the data from the FLuID notebook
# TODO: set up a script to generate and then pull data from the FLuID notebook,
# then load the data



In [None]:
from IPython.display import display
import FLuID as fluid
import plotly.express as px
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


# import parameters ### TODO actually extract parameters to a separate file

import parameters.base_parameters as parameters

print(parameters.lhasa_params)


# Number of clusters; also used to build the 'federated_student' label below.
k = 8

# Experiment configuration shared by the rest of the notebook.
# Keys are right-aligned on the colon; values are plain Python literals plus
# a few plotting objects (plotly colour list, matplotlib colormap).
params = {
    
    # experiment details
    'details' : 3,                  # level of detail of the experiment (low=1,medium=2,high=3,full=4)
    
    # datafiles
'training_data_file' : 'hERG_lhasa_training',
    'test_data_file' : 'hERG_lhasa_test',
'transfer_data_file' : 'FLuID_full',
  'fluid_label_file' : 'FLuID_labels',
    
    # data sampling
   'validation_ratio': 0.2,         # ratio validation/training
     'transfer_size' : 50000 ,      # sample for the transfer data (-1 = all)
         'test_size' : -1,          # sample for the test data (-1 = all)
     'training_size' : -1,          # sample for the training data (-1 = all)

    # number of teacher/clusters (kMean)
                 'k' : k,           # number of clusters (kMean)
     'smooth_factor' : 0.05,        # level of post-clustering mixing to avoid fully biased teachers
    
    # teachers
 'teacher_algorithm' : 'rf',        # algorithm used to build the teacher models
    
    # students
 'federated_student' : 'F' + str(k),
      'student_size' : 10000,                                              # size of the student (number of labelled Cronos data used)
      'student_sizes' : [100,250,500, 1000,2500,5000,10000,25000,50000],   # sizes of the student to study the impact of the size
 'student_algorithm' : 'rf',                                               # default algorithm used to build the student models
      'student_mode' : 'balanced',                                         # default mode used to select the student data 
    
    # random seed for reproducibility
      'random_state' : 42,

    # t-SNE settings
         'tsne_size' : 500,
   'tsne_iterations' : 1000,
    
    # replication level
    'replicate_count' : 3,
    
    # fonts
       'figure_font' : dict(family="Arial",size=14,color="black"),
 'small_figure_font' : dict(family="Arial",size=10,color="black"),

    # colors
'figure_color_scale' : [(0,"red"),(0.2,"orange"), (0.3,'yellow'),(1,'green')],
        'bar_colors' : px.colors.qualitative.Prism,
         'green_map' : plt.get_cmap('Greens')
}

# Neural-network settings layered on top of a shallow copy of `params`
# (nested lists/dicts are shared with `params`).
base_params = dict(params)

# fingerprint settings
base_params.update({
    "FP_type": "ECFP4",
    "FP_radius": 2,
    "FP_length": 2**11,
})

# network and optimisation settings
base_params.update({
    # layer widths: slightly modified from the paper to use powers of 2 for convenience
    "regressor_layers": [
        base_params["FP_length"],
        base_params["FP_length"],
        base_params["FP_length"]//2**2,
        base_params["FP_length"]//2**4,
        1,
    ],
    "regressor_dropout": [0.33],  # taken from paper
    "max_epochs": 100,
    "batch_size": 2**7,
    "learning_rate": 10**-4,  # TODO check this is correct - find this from the paper
    "convergence_threshold": 0.01,
    "convergence_criterion": "",
    "base_checkpoint_dir": "model_checkpoints",
    "base_results_dir": "model_results",
})
os.makedirs(base_params["base_results_dir"], exist_ok=True)

# one pickle path per dataset, all inside the data directory
base_params["data_dir"] = "data"
for dataset in ["training_data", "test_data", "transfer_data", "validation_data", "label_table", "federated_data"]:
  base_params[dataset] = os.path.join(base_params["data_dir"], dataset + ".pkl")

# separate (shallow) copy that the fine-tuning cells modify
FT_params = dict(base_params)



In [None]:
import importlib
# Re-execute the already-imported FLuID module so that edits made to it on
# disk are picked up by this kernel.
importlib.reload(fluid)
import numpy as np



In [None]:
### Load datasets -- TODO make this a separate script to pull from fluid notebook

# NOTE(security): pd.read_pickle unpickles the files, which can execute
# arbitrary code - only load pickles produced by a trusted source.
# All four paths come from base_params, which points into the data directory.

# Federated - load in transfer data
federated_data = pd.read_pickle(base_params["federated_data"])

# Clean - load in training data
clean_data = pd.read_pickle(base_params["training_data"])

# Validation - load in validation data
validation_data = pd.read_pickle(base_params["validation_data"])


# Target - load in test data
target_data = pd.read_pickle(base_params["test_data"])


#pre calculate fingerprints for all molecules

### Currently just computed within the fluid notebook


### split the data into training and validation sets

### Currently just using the split from the fluid notebook

In [None]:
# Inspect the federated labels: shows the CLASS column as a NumPy array.
federated_data.CLASS.to_numpy()

In [None]:
import numpy as np
from torch.nn.modules.module import Module
from torch import nn 
import torch.nn.functional as F

class Classifier(torch.nn.Module):
    """Fully connected binary classifier.

    Each hidden stage applies Linear -> ReLU -> dropout -> BatchNorm1d; the
    final Linear layer is followed by a sigmoid.

    Args:
        layersize: layer widths, ``[input, hidden..., output]``.
        dropout: dropout probability applied after every hidden ReLU.
    """

    def __init__(self, layersize=[2**11, 2**11, 2**9, 2**7, 2**0], dropout=0.33):
        super().__init__()
        self.hidden = nn.ModuleList()
        self.batchnorm = nn.ModuleList()
        self.dropout = dropout

        # Consecutive width pairs, leaving the last pair for the output layer.
        width_pairs = zip(layersize[:-2], layersize[1:-1])
        for depth, (n_in, n_out) in enumerate(width_pairs):
            linear = nn.Linear(n_in, n_out)
            linear.name = f"hidden_{depth}"
            norm_layer = nn.BatchNorm1d(n_out)
            norm_layer.name = f"batchnorm_{depth}"
            self.hidden.append(linear)
            self.batchnorm.append(norm_layer)

        # output layer for binary classification
        self.output = nn.Linear(layersize[-2], layersize[-1])
        self.output.name = "output"

    def forward(self, x):
        """Return sigmoid outputs for the batch `x`."""
        for linear, norm_layer in zip(self.hidden, self.batchnorm):
            activated = F.relu(linear(x))
            dropped = F.dropout(activated, self.dropout, training=self.training)
            x = norm_layer(dropped)
        return torch.sigmoid(self.output(x))




In [None]:
# Generate model 

# load Classifier???
# from models import Classifier

# Instantiate the classifier with its default layer sizes and dropout.
model = Classifier()

# NOTE: `model.parameters` is not called here, so this prints the repr of the
# bound method rather than the parameter tensors themselves.
print(model.parameters)

In [None]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Dataset yielding (fingerprint, label) float32 tensor pairs from a dataframe.

    The 'FP' and 'CLASS' columns are extracted once at construction; tensors
    are created on the notebook-level `device` when an item is requested.
    NOTE: a later cell defines another class with this same name.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.fp = dataframe['FP'].to_numpy()
        self.labels = dataframe['CLASS'].to_numpy()

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        def as_float_tensor(value):
            return torch.tensor(value, dtype=torch.float32, device=device)

        features = as_float_tensor(self.fp[index])
        label = as_float_tensor(self.labels[index])
        return features, label


In [None]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Dataset yielding fingerprint/label tensors, optionally with a rank value.

    Args:
        dataframe: frame with an 'FP' column (one fingerprint per row) and a
            'CLASS' column; a 'RANK' column is also required when `rank` is
            given.
        rank: any value other than None makes every item a
            ``(x, y, rank)`` triple instead of an ``(x, y)`` pair.

    Tensors are float32 and are created on the notebook-level `device`.
    """

    def __init__(self, dataframe, rank=None):
        self.dataframe = dataframe
        self.fp = dataframe['FP'].to_numpy()
        self.labels = dataframe['CLASS'].to_numpy()
        # Always define the attribute so __getitem__ can test it directly.
        self.rank = dataframe['RANK'].to_numpy() if rank is not None else None

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        x = torch.tensor(self.fp[index], dtype=torch.float32, device=device)
        y = torch.tensor(self.labels[index], dtype=torch.float32, device=device)

        # The previous check was hasattr(self, rank) with an undefined name
        # `rank`, which raised NameError on every item access.
        if self.rank is not None:
            r = torch.tensor(self.rank[index], dtype=torch.float32, device=device)
            return x, y, r
        return x, y


In [None]:
# broad tuning on federated dataset
FT_params["experiment_name"] = "broad_tuning"
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])
# make directory for checkpoints
os.makedirs(FT_params["checkpoint_dir"], exist_ok=True)

from torch.utils.data import DataLoader

# One shuffled DataLoader per data split.
# The federated split is served in 25000-row batches ...
N = 25000
federated_loader = DataLoader(MyDataset(federated_data), batch_size=N, shuffle=True)

# ... every other split in 128-row batches.
N = 128
validation_loader = DataLoader(MyDataset(validation_data), batch_size=N, shuffle=True)
training_loader = DataLoader(MyDataset(clean_data), batch_size=N, shuffle=True)
testing_loader = DataLoader(MyDataset(target_data), batch_size=N, shuffle=True)

In [None]:
# Print the position and width of every configured layer.
for idx, layer in enumerate(base_params["regressor_layers"]):
    print(idx, layer)

In [None]:
### TODO make this work to update the plot during each loop to show the progress
# may have to use interactive mode

def plot_losses(losses, logscale=True, ax=None, title=None, save=False, filename=None):
    """
    Plot training and validation losses next to a 0.99^epoch reference curve.

    Inputs:
    - losses ... pair of sequences: losses[0] = training, losses[1] = validation
    - logscale ... plot the natural log of all three curves (default=True)
    - ax ... existing matplotlib axes object to plot on (default=None -> current axes)
    - title ... title of the plot (default=None -> generic title)
    - save ... boolean to save the plot (default=False)
    - filename ... file name without extension; '.png' is appended (default=None)
    Outputs:
    - the axes object that was drawn on
    """
    title = "Losses on current dataset" if title is None else title
    ax = plt.gca() if ax is None else ax

    # reference curve: exponential decay, one value per epoch
    conv = 0.99
    convergence = [conv**epoch for epoch in range(len(losses[0]))]

    pre = ''
    if logscale is True:
        losses = np.log(losses)
        convergence = np.log(convergence)
        pre = 'log '

    curves = (
        (losses[0], 'train'),
        (losses[1], 'val'),
        (convergence, 'exp decay '+str(conv)+'^epoch'),
    )
    for values, curve_label in curves:
        ax.plot(values, label=curve_label)

    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(pre+'Loss')
    ax.legend()

    # saving only happens when both the flag and a file name are supplied
    if save is True and filename is not None:
        plt.savefig(filename+'.png')

    plt.show()
    return ax

In [None]:
# save model
import datetime
import glob
def save_model(model, params=None, path=None):
    """Save the model's state dict to ``<dir>/<name>_<timestamp>.pt``.

    Args:
        model: module whose ``state_dict()`` is written.
        params: optional parameter dictionary; its "experiment_name" and
            "checkpoint_dir" entries, when present, take precedence for the
            file name and the directory.
        path: optional target directory, used when `params` does not supply
            one; defaults to the current working directory.

    Returns:
        The full path of the written checkpoint file.
    """
    current_date_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    name = "model"
    prefix = path if path is not None else os.getcwd()

    if params is not None:
        # .get() keeps the defaults above when a key is missing instead of
        # failing with KeyError.
        name = params.get("experiment_name", name)
        prefix = params.get("checkpoint_dir", prefix)

    # torch.save does not create missing directories, so make sure the
    # target exists (several cells change the name without creating a dir).
    os.makedirs(prefix, exist_ok=True)

    path = os.path.join(prefix, name + "_" + current_date_time) + '.pt'
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")
    return path



def load_model(model, path, latest=False):
    """Load a saved state dict into `model` and return the model.

    Args:
        model: module that receives the weights (modified in place).
        path: checkpoint file, or a directory when `latest` is True.
        latest: when True, pick the ``*.pt`` file in `path` with the greatest
            ``os.path.getctime`` value.

    Raises:
        FileNotFoundError: `latest` is True but the directory holds no
            ``.pt`` files.
    """
    if latest:
        candidates = glob.glob(os.path.join(path, "*.pt"))
        if not candidates:
            raise FileNotFoundError(f"No .pt checkpoints found in {path}")
        path = max(candidates, key=os.path.getctime)
        print(f"Loading latest model from {path}")

    # map_location="cpu" lets a checkpoint written on one accelerator be read
    # on a machine without it; load_state_dict then copies the values into
    # the model's own tensors, wherever they live.
    model.load_state_dict(torch.load(path, map_location="cpu"))

    return model

In [None]:
import torch
import torch.optim as optim

import numpy as np
from torch.nn.modules.module import Module
from torch import nn 
import torch.nn.functional as F



def train_model(model, train_loader, val_loader, num_epochs=1, lr=0.001, weight_decay=0.0):
    """Train `model` with Adam and binary cross-entropy, validating every epoch.

    Args:
        model: module whose forward pass returns values suitable for
            ``nn.BCELoss`` (the loss is applied directly to its output).
        train_loader: sized iterable of ``(x, y)`` training batches.
        val_loader: sized iterable of ``(x, y)`` validation batches.
        num_epochs: number of passes over `train_loader`.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        ``(model, train_losses, val_losses)`` where the two lists hold the
        mean per-batch loss of every epoch.
    """
    # Define loss function and optimizer
    criterion = nn.BCELoss()  # CE for classification
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Track losses
    train_losses = []
    val_losses = []
    # Defined up front so the final report also works when num_epochs == 0.
    train_loss = val_loss = float("nan")

    print(device)
    model.to(device)

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0
        for x, y in train_loader:
            # Make sure the batch lives where the model does (a no-op when
            # the dataset already created its tensors on `device`).
            x, y = x.to(device), y.to(device)

            outputs = model(x)
            # reshape y to match output shape
            y = y.reshape(outputs.shape)
            loss = criterion(outputs, y)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # mean loss over the batches of this epoch
        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        # ---- validation pass ----
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                outputs = model(x)
                y = y.reshape(outputs.shape)
                val_loss += criterion(outputs, y).item()

        val_loss /= len(val_loader)
        val_losses.append(val_loss)

        # Print loss for epoch
        print(f"Epoch {epoch + 1}: Train loss = {train_loss:.4f}, Val loss = {val_loss:.4f}")

    # Print final losses
    print(f"Final: Train loss = {train_loss:.4f}, Val loss = {val_loss:.4f}")
    return model, train_losses, val_losses


In [None]:
# Broad training run on the federated data; checkpoints go into a directory
# named after the experiment.
# NOTE(review): the experiment name says "lr_1e-3", but the value passed is
# base_params["learning_rate"], which the configuration cell sets to 10**-4 -
# confirm which of the two is intended.
base_params["experiment_name"] = "lr_1e-3_broad"
base_params["checkpoint_dir"] = os.path.join(base_params["base_checkpoint_dir"], base_params["experiment_name"])
os.makedirs(base_params["checkpoint_dir"], exist_ok=True)
model = Classifier()
model, train_losses, val_losses = train_model(model, federated_loader, validation_loader, num_epochs=100, lr=base_params["learning_rate"])
plot_losses([train_losses, val_losses], title="Losses on federated dataset")
save_model(model, params=base_params)

In [None]:
# Federated run with lr 1e-6 and weight decay 0.42.
# NOTE(review): base_params["checkpoint_dir"] is not updated in this cell, so
# save_model writes into whatever directory an earlier cell configured; only
# the file name reflects this experiment.
base_params["weight_decay"]=0.42
base_params["learning_rate"]=1e-6
base_params["experiment_name"] = "lr_1e-6_wd_0.42_broad"

# train the model

# from models import train_model
model = Classifier()

model, train_losses, val_losses = train_model(model, federated_loader, validation_loader, num_epochs=100, lr=base_params["learning_rate"],weight_decay=base_params["weight_decay"])
plot_losses([train_losses, val_losses], title="Losses on federated dataset")

save_model(model, params=base_params)

In [None]:
# Federated run with lr 1e-4 and weight decay 0.22, 20 epochs.
# NOTE(review): base_params["checkpoint_dir"] is not updated in this cell, so
# the checkpoint lands in the directory configured by an earlier cell.
base_params["weight_decay"]=0.22
base_params["learning_rate"]=1e-4
base_params["experiment_name"] = "lr_1e-4_wd_0.22_broad_25K"

# train the model
model = Classifier()
# from models import train_model

model, train_losses, val_losses = train_model(model, federated_loader, validation_loader, num_epochs=20, lr=base_params["learning_rate"],weight_decay=base_params["weight_decay"])
plot_losses([train_losses, val_losses], title="Losses on federated dataset")

save_model(model, params=base_params)

In [None]:
# Federated run with lr 1e-5 and weight decay 0.1, 50 epochs.
# NOTE(review): base_params["checkpoint_dir"] is not updated in this cell, so
# the checkpoint lands in the directory configured by an earlier cell.
base_params["weight_decay"]=0.1
base_params["learning_rate"]=1e-5
base_params["experiment_name"] = "lr_1e-5_wd_0.1_broad_25k"
# train the model
model = Classifier()

# from models import train_model

model, train_losses, val_losses = train_model(model, federated_loader, validation_loader, num_epochs=50, lr=base_params["learning_rate"],weight_decay=base_params["weight_decay"])
plot_losses([train_losses, val_losses], title="Losses on federated dataset")

save_model(model, params=base_params)

In [None]:
# Baseline without the federated data: trains on training_loader only.
# NOTE(review): the experiment name says "lr_1e-1" while the learning rate
# set here is 1e-5 - confirm which is intended. As in the previous cells,
# base_params["checkpoint_dir"] is left unchanged.
base_params["weight_decay"]=0.1
base_params["learning_rate"]=1e-5
base_params["experiment_name"] = "lr_1e-1_wd_0.1_clean_only_25k"
# train the model
model = Classifier()

# from models import train_model

model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=100, lr=base_params["learning_rate"],weight_decay=base_params["weight_decay"])
plot_losses([train_losses, val_losses], title="Losses on training dataset")

save_model(model, params=base_params)

In [None]:
### Plot the losses of the most recent run (log scale is the default);
### nothing is saved in this cell
plot_losses([train_losses, val_losses])

In [None]:
# Same curves as above without the log transform.
plot_losses([train_losses, val_losses], logscale=False)

In [None]:

# Save the current `model` under FT_params' experiment name / checkpoint dir
# (whichever model an earlier cell trained last).
save_model(model, FT_params)

In [None]:
# Load the newest checkpoint from FT_params' checkpoint dir into `model`.
model = load_model(model, FT_params["checkpoint_dir"], latest=True)

In [None]:
# Hyperparameter search with Ax - being done in separate notebook
# Below: list index, requires_grad flag and shape of every parameter, first
# for the whole model and then per sub-module.
for _section_name, _section in (
    ("model", model),
    ("hidden", model.hidden),
    ("batchnorm", model.batchnorm),
    ("output", model.output),
):
    print(_section_name)
    for idx, param in enumerate(_section.parameters()):
        print(idx, param.requires_grad, param.shape)

In [None]:
# A plain nn.Module is not iterable (`for item in model` raises TypeError);
# iterate over its direct sub-modules instead.
for item in model.children():
    print(item)

In [None]:
# Number of epochs used by every fine-tuning run below.
FT_EPOCHS = 100

In [None]:
# no broad tuning: train a freshly initialised classifier on the clean data

# load model
model = Classifier()
# change params
FT_params["experiment_name"] = "clean_only"
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])
# make directory for checkpoints
os.makedirs(FT_params["checkpoint_dir"], exist_ok=True)


# NOTE(review): the learning rate is read from base_params, i.e. whatever
# value the most recently executed experiment cell assigned - it is not
# fixed by this cell.
FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=FT_EPOCHS, lr=base_params["learning_rate"])

save_model(FT_model, params=FT_params)
plot_losses([train_losses, val_losses])

In [None]:
# no broad tuning, learning rate lowered to 1e-6

# load model
model = Classifier()
# change params - separate name/directory so this run is not mixed up with
# the plain "clean_only" run
FT_params["experiment_name"] = "clean_only_lr_1e-6"
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])
# make directory for checkpoints
os.makedirs(FT_params["checkpoint_dir"], exist_ok=True)


# Pass the rate explicitly: base_params["learning_rate"] holds whatever an
# earlier cell last assigned, which would merely repeat the previous run.
FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=FT_EPOCHS, lr=1e-6)

save_model(FT_model, params=FT_params)
plot_losses([train_losses, val_losses])

In [None]:
# fine tuning on clean dataset, starting from the newest "broad_tuning" checkpoint
FT_params["experiment_name"] = "broad_tuning"
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])

# load model (weights are loaded into the existing `model` object)
model = load_model(model, FT_params["checkpoint_dir"], latest=True)
# change params so the fine-tuned checkpoint is stored separately
FT_params["experiment_name"] = "FT_clean"
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])
# make directory for checkpoints
os.makedirs(FT_params["checkpoint_dir"], exist_ok=True)


FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=FT_EPOCHS, lr=base_params["learning_rate"])

save_model(FT_model, params=FT_params)
plot_losses([train_losses, val_losses])

In [None]:
# compare the effect of batch sizes on training speed and accuracy
times = []
final_train_losses = []
final_val_losses = []
batch_sizes = []

import time

for index in range(10, 16):

    batch_size = 2**index
    batch_sizes.append(batch_size)

    print(batch_size)
    # create empty model
    model = Classifier()

    # Sweep-local loaders: rebinding the notebook-wide federated_loader /
    # validation_loader / training_loader here would leave every later cell
    # training with the last batch size of this loop (2**15).
    sweep_train_loader = DataLoader(MyDataset(federated_data), batch_size=batch_size, shuffle=True)
    sweep_val_loader = DataLoader(MyDataset(validation_data), batch_size=batch_size, shuffle=True)

    # start timer
    start = time.time()

    # train the model for 5 epochs
    model, train_losses, val_losses = train_model(model, sweep_train_loader, sweep_val_loader, num_epochs=5, lr=base_params["learning_rate"])
    # end timer
    end = time.time()

    # report and record the elapsed time
    print(end-start)

    plot_losses([train_losses, val_losses])

    times.append(end-start)

    # append final losses to list
    final_train_losses.append(train_losses[-1])
    final_val_losses.append(val_losses[-1])

# plot the results batch size vs time
plt.plot(batch_sizes, times)
plt.xlabel("Batch size")
plt.ylabel("Time (s)")
plt.title("Batch size vs time")
plt.show()

# plot the results batch size vs loss
plt.plot(batch_sizes, final_train_losses, label="Training loss")
plt.plot(batch_sizes, final_val_losses, label="Validation loss")
plt.xlabel("Batch size")
plt.ylabel("Loss")
plt.title("Batch size vs loss")
plt.legend()
plt.show()






In [None]:

# function to freeze the layers of a model up to a certain layer index
def freeze_layers(model, layer_index):
    """
    Freeze hidden layers 0..layer_index (inclusive) and their batch-norm layers.

    Inputs:
    - model ... module exposing `hidden` and `batchnorm` ModuleLists
    - layer_index ... 0-based index of the last hidden layer to freeze;
                      a negative value freezes nothing
    Outputs:
    - the same model, with requires_grad=False on the frozen parameters
    """
    # The previous implementation compared flattened parameter indices with
    # `i > layer_index`, which froze the layers AFTER the index rather than
    # the ones up to it. Working per sub-module avoids the index arithmetic.
    n_frozen = max(layer_index + 1, 0)
    for group in (model.hidden, model.batchnorm):
        for module in list(group)[:n_frozen]:
            for param in module.parameters():
                param.requires_grad = False

    return model


In [None]:
# experiment freezing layers

FT_params["experiment_name"] = "broad_tuning"
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])

# load the broad-tuned weights into a FRESH classifier: load_state_dict does
# not reset requires_grad, so reusing `model` would carry over layers frozen
# by a previously executed freeze experiment
model = load_model(Classifier(), FT_params["checkpoint_dir"], latest=True)

FT_params["experiment_name"] = "FT_clean_freeze_0"
FT_params["freeze_layers"] = 0
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])
# make directory for checkpoints
os.makedirs(FT_params["checkpoint_dir"], exist_ok=True)

model = freeze_layers(model, FT_params["freeze_layers"])

FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=FT_EPOCHS, lr=base_params["learning_rate"])

save_model(FT_model, params=FT_params)
plot_losses([train_losses, val_losses])

In [None]:

FT_params["experiment_name"] = "broad_tuning"
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])

# load the broad-tuned weights into a FRESH classifier: load_state_dict does
# not reset requires_grad, so reusing `model` would carry over layers frozen
# by a previously executed freeze experiment
model = load_model(Classifier(), FT_params["checkpoint_dir"], latest=True)

FT_params["experiment_name"] = "FT_clean_freeze_1"
FT_params["freeze_layers"] = 1
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])
# make directory for checkpoints
os.makedirs(FT_params["checkpoint_dir"], exist_ok=True)

model = freeze_layers(model, FT_params["freeze_layers"])

FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=FT_EPOCHS, lr=base_params["learning_rate"])

save_model(FT_model, params=FT_params)
plot_losses([train_losses, val_losses])

In [None]:

FT_params["experiment_name"] = "broad_tuning"
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])

# load the broad-tuned weights into a FRESH classifier: load_state_dict does
# not reset requires_grad, so reusing `model` would carry over layers frozen
# by a previously executed freeze experiment
model = load_model(Classifier(), FT_params["checkpoint_dir"], latest=True)

FT_params["experiment_name"] = "FT_clean_freeze_2"
FT_params["freeze_layers"] = 2
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])
# make directory for checkpoints
os.makedirs(FT_params["checkpoint_dir"], exist_ok=True)

model = freeze_layers(model, FT_params["freeze_layers"])

FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=FT_EPOCHS, lr=base_params["learning_rate"])

save_model(FT_model, params=FT_params)
plot_losses([train_losses, val_losses])

In [None]:

FT_params["experiment_name"] = "broad_tuning"
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])

# load the broad-tuned weights into a FRESH classifier: load_state_dict does
# not reset requires_grad, so reusing `model` would carry over layers frozen
# by a previously executed freeze experiment
model = load_model(Classifier(), FT_params["checkpoint_dir"], latest=True)

FT_params["experiment_name"] = "FT_clean_freeze_3"
FT_params["freeze_layers"] = 3
FT_params["checkpoint_dir"] = os.path.join(FT_params["base_checkpoint_dir"], FT_params["experiment_name"])
# make directory for checkpoints
os.makedirs(FT_params["checkpoint_dir"], exist_ok=True)

model = freeze_layers(model, FT_params["freeze_layers"])

FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=FT_EPOCHS, lr=base_params["learning_rate"])

save_model(FT_model, params=FT_params)
plot_losses([train_losses, val_losses])

In [None]:
# Losses of the most recent run, without the log transform.
plot_losses([train_losses, val_losses], logscale=False)

In [None]:
# Gradual fine tuning to come later
import math
# log2 of the (floored) size ratio federated/clean; the result is a float.
# NOTE: math.log2 raises ValueError when the floored ratio is 0, i.e. when
# the federated set is smaller than the clean set.
number_of_gradual_steps = math.log2(math.floor(len(federated_data)/len(clean_data)))

print(number_of_gradual_steps)

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDistGeom
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
#function that calculates the similarity score between two fingerprints
def calculate_similarity(FP, target_FP, simi_type = 'Tanimoto'):
    """
    Inputs:

    - FP ... fingerprint of input compound
    - target_FP ... fingerprint of target compound
    - simi_type ... type of similarity score to be calculated

    Outputs:
    - similarity_score ... similarity score between input and compound

    Raises:
    - NotImplementedError ... for any simi_type other than 'Tanimoto'
    """
    # `==`, not `is`: identity comparison with a string literal is unreliable.
    if simi_type == "Tanimoto":
        # Use the function imported above; the name `DataStructs` itself is
        # never imported in this notebook.
        similarity_score = TanimotoSimilarity(FP, target_FP)
        return similarity_score
    raise NotImplementedError("Only Tanimoto similarity is currently supported")


# function that calculates the average similarity score of a single compound to the Target set




def calculate_target_similarity(FP, target_set, simi_type = 'Tanimoto', mean = "mean"):
    """
    Similarity of one compound to every compound of the target set.

    Inputs:

    - FP ... fingerprint of input compound - MUST BE THE SAME TYPE AS THE TARGET FINGERPRINTS
    - target_set ... object whose `FP` attribute iterates over the target fingerprints
    - simi_type ... type of similarity score to be calculated
    - mean ... "mean" (default) or "median" to aggregate, None for the raw scores
               (the former default expression `None or "mean" or "median"`
               evaluated to "mean", so the default is unchanged)

    Outputs:
    - aggregated similarity score, or a numpy array of all scores when mean is None
    """
    # np.append returns a new array; the previous loop discarded that result
    # and therefore always aggregated an empty array.
    similarity_scores = np.array([
        calculate_similarity(FP, target_FP, simi_type = simi_type)
        for target_FP in target_set.FP
    ])

    if mean is None:
        return similarity_scores
    if mean == "mean":
        return np.mean(similarity_scores)
    if mean == "median":
        return np.median(similarity_scores)
    raise NotImplementedError("mean type must be None, 'mean', or 'median'")

In [None]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

def calculate_tanimoto_similarity(fp1, fp2):
    """Tanimoto similarity ``a.b / (|a|^2 + |b|^2 - a.b)`` of two fingerprints.

    Each argument is converted on its own: an object exposing ``to_numpy``
    (pandas slice) contributes its first element, anything else is passed to
    ``np.asarray``. Values are cast to float.

    NOTE: two all-zero fingerprints give 0/0, i.e. NaN.
    """
    def _as_vector(fp):
        # Converting per argument keeps a pandas slice and a plain list usable
        # together; the previous shared try-block fell back to np.asarray for
        # BOTH inputs as soon as one of them was not a pandas object.
        try:
            first = fp.to_numpy()[0]
        except Exception:
            first = fp
        return np.asarray(first, dtype=float)

    arr1 = _as_vector(fp1)
    arr2 = _as_vector(fp2)

    # dot(a, a) is the squared norm, without the sqrt/square round trip
    dot_prod = dot(arr1, arr2)
    similarity = dot_prod / (dot(arr1, arr1) + dot(arr2, arr2) - dot_prod)
    return similarity

# function that calculates the similarity score between two fingerprints
def calculate_similarity(FP, target_FP, simi_type='Tanimoto'):
    """Return the similarity of `FP` and `target_FP`; only 'Tanimoto' is supported."""
    if simi_type != 'Tanimoto':
        raise NotImplementedError("Only Tanimoto similarity is currently supported")
    return calculate_tanimoto_similarity(FP, target_FP)

# function that calculates the average similarity score of a single compound to the Target set
def calculate_target_similarity(FP, target_set, simi_type='Tanimoto', mean=None):
    """Similarity of `FP` to every fingerprint in ``target_set.FP``.

    Args:
        FP: fingerprint of the input compound.
        target_set: object whose ``FP`` attribute iterates over the target
            fingerprints.
        simi_type: similarity measure handed to ``calculate_similarity``.
        mean: None for the raw score array, 'mean' or 'median' to aggregate.

    Raises:
        NotImplementedError: for any other value of `mean`.
    """
    # np.append returns a new array; the previous loop discarded that result
    # and therefore always worked on an empty array.
    similarity_scores = np.array([
        calculate_similarity(FP, target_FP, simi_type=simi_type)
        for target_FP in target_set.FP
    ])

    if mean is None:
        return similarity_scores
    if mean == 'mean':
        return np.mean(similarity_scores)
    if mean == 'median':
        return np.median(similarity_scores)
    raise NotImplementedError("mean type must be None, 'mean', or 'median'")


In [None]:

#define a function that compiles the datasets based on specifications that we set for various combinations of domains (F, S0, T)
def dataset_compiler(F_dataset=None, S0_dataset=None, target_dataset=None, percentages=None, rank='Tanimoto', random_state=42):
    """
    Compiles the datasets into a single dataset that can then be loaded into the model

    Inputs:
    - F_dataset ... Federated dataset
    - S0_dataset ... Source dataset
    - target_dataset ... Target dataset

    Parameters:
    - percentages ... one sampling fraction per supplied dataset, in the order
                      (federated, source, target); default: 1 for each
    - rank ... name of the similarity/rank column; when a target dataset is
               given and the column is missing it is computed per row
               (default 'Tanimoto' - the former default expression
               `None or 'Tanimoto'` evaluated to the same value); None disables it
    - random_state ... seed used for the sampling

    Outputs:
    - dataset ... compiled dataset as a pandas dataframe

    The input dataframes are never modified.
    """
    # keep only the datasets that were actually supplied
    datasets = [d for d in (F_dataset, S0_dataset, target_dataset) if d is not None]
    if not datasets:
        raise ValueError("No datasets have been specified")

    # default: use every dataset completely
    if percentages is None:
        percentages = [1]*len(datasets)

    if len(datasets) != len(percentages):
        raise ValueError("The number of datasets must match the number of percentages")

    # rank based on the specified ranking
    prepared = []
    for dataset in datasets:
        if rank is not None and target_dataset is not None and rank not in dataset.columns:
            print("Rank {} not in dataset, calculating ranks".format(rank))
            try:
                # one score per row: mean similarity of that row's fingerprint
                # to the target set (previously the whole frame was passed in
                # as if it were a single fingerprint)
                scores = [
                    calculate_target_similarity(fp, target_dataset, rank, mean="mean")
                    for fp in dataset.FP
                ]
            except Exception as err:
                raise ValueError("The rank is not in the dataset and cannot be calculated") from err
            # assign/sort_values return new frames, so the caller's data is untouched
            dataset = dataset.assign(**{rank: scores}).sort_values(by=rank, ascending=False)
        prepared.append(dataset)

    # sample the datasets based on the percentages
    # NOTE(review): sampling is random, so the rank order established above
    # does not influence WHICH rows are kept - confirm whether a top-k
    # selection by rank was intended.
    sampled = []
    for dataset, percentage in zip(prepared, percentages):
        print("Initial size of dataset: {}".format(len(dataset)))
        print("Sampling {}% of the dataset".format(percentage*100))
        subset = dataset.sample(frac=percentage, random_state=random_state)
        print("Final size of dataset: {}".format(len(subset)))
        # keep the sampled frame; previously it was discarded and the full
        # datasets were concatenated, so `percentages` had no effect
        sampled.append(subset)

    # combine the datasets
    compiled_dataset = pd.concat(sampled, axis=0)

    return compiled_dataset

In [None]:
# Gradual fine tuning testing - each finetuning step is 1/2 the size of the previous step for the federated dataset
# Each step we compile a dataset with the federated data and the clean data and then train the model on that dataset
import copy
# start from a freshly initialised classifier (loading a checkpoint is commented out below)
model = Classifier()
#create placeholder for grad_FT_model
Grad_FT_model = None

# broad tuning on federated dataset
# NOTE(review): "checkpoint_dir" is only computed here; nothing in this part
# of the cell creates the directory or saves a checkpoint into it.
base_params["experiment_name"] = "broad_tuning"
base_params["checkpoint_dir"] = os.path.join(base_params["base_checkpoint_dir"], base_params["experiment_name"])
learning_rate = 1e-4
weight_decay = 0.0

# model = load_model(model, base_params["checkpoint_dir"], latest=True)
model, train_losses, val_losses = train_model(model, federated_loader, validation_loader, num_epochs=20, lr=learning_rate, weight_decay=weight_decay)
# NOTE(review): the title says "compiled dataset" although this run trains on
# federated_loader - confirm the intended label.
plot_losses([train_losses, val_losses], title="Losses on compiled dataset")



# set the parameters for the gradual fine tuning
# (deep copy, so nested values are not shared with base_params)
gradual_FT_params = copy.deepcopy(base_params)
gradual_FT_params["experiment_name"] = "gradual_FT"
gradual_FT_params["checkpoint_dir"] = os.path.join(gradual_FT_params["base_checkpoint_dir"], gradual_FT_params["experiment_name"])



gradual_FT_params["max_epochs"] = 10
batch_size = 2**10
learning_rate = 1e-3
weight_decay = 0.0
# make directory for checkpoints
os.makedirs(gradual_FT_params["checkpoint_dir"], exist_ok=True)

# number of halving steps = floor(log2(floor(len(federated) / len(clean))));
# `math` is imported in an earlier cell. math.log2 raises ValueError when the
# floored ratio is 0 (federated set smaller than the clean set).
number_of_gradual_steps = int(math.log2(math.floor(len(federated_data)/len(clean_data))))

for i in range(number_of_gradual_steps):
    temp_params = copy.deepcopy(gradual_FT_params)
    if Grad_FT_model is not None:
        model = Grad_FT_model
    i = i+1
    # calcualate percentages for the datasets
    federated_percentage = 1/(2**i)
    clean_percentage = 1
    percentages = [federated_percentage, clean_percentage]
    # compile the dataset
    compiled_dataset = dataset_compiler(F_dataset=federated_data, S0_dataset=clean_data, percentages=percentages)
    compiled_loader = MyDataset(compiled_dataset)
    compiled_loader = DataLoader(compiled_loader, batch_size=batch_size, shuffle=True)

    
    # train the model using the compiled dataset
    temp_params["experiment_name"] = gradual_FT_params["experiment_name"] + "_step_{}".format(i)
    Grad_FT_model, train_losses, val_losses = train_model(model, compiled_loader, validation_loader, num_epochs=gradual_FT_params["max_epochs"], lr=learning_rate, weight_decay=weight_decay)

    plot_losses([train_losses, val_losses], title="Losses on compiled dataset"+" "+temp_params["experiment_name"])

    save_model(model, params=temp_params)


# final step with the clean data
temp_params["experiment_name"] = gradual_FT_params["experiment_name"] + "_step_{}".format(-1)

Grad_FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=gradual_FT_params["max_epochs"], lr=1e-3)

plot_losses([train_losses, val_losses], title="Losses on compiled dataset"+" "+temp_params["experiment_name"])
save_model(model, params=temp_params)




In [None]:
# Gradual fine-tuning experiment (broad: lr=1e-4, wd=0.1; gradual: lr=1e-3, wd=0.0).
# Stage 1 - broad tuning: train a freshly initialised Classifier on federated_loader.
# Stage 2 - gradual steps: step k requests 1/2**k of the federated data plus all of the
#           clean data from dataset_compiler and trains on the result.
# Stage 3 - final step: train on training_loader only.
# Relies on names from earlier cells: Classifier, train_model, plot_losses, save_model,
# dataset_compiler, MyDataset, DataLoader, base_params, the data frames/loaders, math, os.
import copy

# start from a fresh model; loading the saved broad-tuned checkpoint is disabled below
model = Classifier()
# holds the most recently fine-tuned model once the gradual stage has run
Grad_FT_model = None

# broad tuning on federated dataset (note: this updates the shared base_params in place)
base_params["experiment_name"] = "broad_tuning"
base_params["checkpoint_dir"] = os.path.join(base_params["base_checkpoint_dir"], base_params["experiment_name"])
learning_rate = 1e-4
weight_decay = 0.1

# model = load_model(model, base_params["checkpoint_dir"], latest=True)
model, train_losses, val_losses = train_model(model, federated_loader, validation_loader, num_epochs=20, lr=learning_rate, weight_decay=weight_decay)
plot_losses([train_losses, val_losses], title="Losses on compiled dataset")

# set the parameters for the gradual fine tuning (deep copy keeps base_params untouched from here on)
gradual_FT_params = copy.deepcopy(base_params)
gradual_FT_params["experiment_name"] = "gradual_FT"
gradual_FT_params["checkpoint_dir"] = os.path.join(gradual_FT_params["base_checkpoint_dir"], gradual_FT_params["experiment_name"])

gradual_FT_params["max_epochs"] = 10
batch_size = 2**10
learning_rate = 1e-3
weight_decay = 0.0
# make directory for checkpoints
os.makedirs(gradual_FT_params["checkpoint_dir"], exist_ok=True)

# Number of halving steps = floor(log2(federated size / clean size)).
# If the federated set is smaller than the clean set the ratio is 0 and log2 would raise,
# so run zero gradual steps in that case.
size_ratio = len(federated_data) // len(clean_data)
number_of_gradual_steps = int(math.log2(size_ratio)) if size_ratio >= 1 else 0

for step in range(1, number_of_gradual_steps + 1):
    temp_params = copy.deepcopy(gradual_FT_params)
    # calculate percentages for the datasets: halve the federated share each step, keep all clean data
    federated_percentage = 1/(2**step)
    clean_percentage = 1
    percentages = [federated_percentage, clean_percentage]
    # compile the dataset
    # NOTE(review): the sampling loop visible in dataset_compiler only rebinds its loop
    # variable, so `percentages` may not change the returned frame - confirm.
    compiled_dataset = dataset_compiler(F_dataset=federated_data, S0_dataset=clean_data, percentages=percentages)
    compiled_loader = MyDataset(compiled_dataset)
    compiled_loader = DataLoader(compiled_loader, batch_size=batch_size, shuffle=True)

    # train the model using the compiled dataset
    temp_params["experiment_name"] = gradual_FT_params["experiment_name"] + "_step_{}".format(step)
    Grad_FT_model, train_losses, val_losses = train_model(model, compiled_loader, validation_loader, num_epochs=gradual_FT_params["max_epochs"], lr=learning_rate, weight_decay=weight_decay)

    plot_losses([train_losses, val_losses], title="Losses on compiled dataset"+" "+temp_params["experiment_name"])

    # save the model returned by this step, and carry it into the next step
    save_model(Grad_FT_model, params=temp_params)
    model = Grad_FT_model


# final step with the clean data
# fresh copy so the parameters exist even when no gradual step ran
temp_params = copy.deepcopy(gradual_FT_params)
temp_params["experiment_name"] = gradual_FT_params["experiment_name"] + "_step_{}".format(-1)

# lr is fixed at 1e-3 here and weight_decay is left at train_model's default
Grad_FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=gradual_FT_params["max_epochs"], lr=1e-3)

plot_losses([train_losses, val_losses], title="Losses on compiled dataset"+" "+temp_params["experiment_name"])
save_model(Grad_FT_model, params=temp_params)




In [None]:
# Gradual fine-tuning experiment (broad: lr=1e-4, wd=0.1; gradual: lr=1e-3, wd=0.1).
# Stage 1 - broad tuning: train a freshly initialised Classifier on federated_loader.
# Stage 2 - gradual steps: step k requests 1/2**k of the federated data plus all of the
#           clean data from dataset_compiler and trains on the result.
# Stage 3 - final step: train on training_loader only.
# Relies on names from earlier cells: Classifier, train_model, plot_losses, save_model,
# dataset_compiler, MyDataset, DataLoader, base_params, the data frames/loaders, math, os.
import copy

# start from a fresh model; loading the saved broad-tuned checkpoint is disabled below
model = Classifier()
# holds the most recently fine-tuned model once the gradual stage has run
Grad_FT_model = None

# broad tuning on federated dataset (note: this updates the shared base_params in place)
base_params["experiment_name"] = "broad_tuning"
base_params["checkpoint_dir"] = os.path.join(base_params["base_checkpoint_dir"], base_params["experiment_name"])
learning_rate = 1e-4
weight_decay = 0.1

# model = load_model(model, base_params["checkpoint_dir"], latest=True)
model, train_losses, val_losses = train_model(model, federated_loader, validation_loader, num_epochs=20, lr=learning_rate, weight_decay=weight_decay)
plot_losses([train_losses, val_losses], title="Losses on compiled dataset")

# set the parameters for the gradual fine tuning (deep copy keeps base_params untouched from here on)
gradual_FT_params = copy.deepcopy(base_params)
gradual_FT_params["experiment_name"] = "gradual_FT"
gradual_FT_params["checkpoint_dir"] = os.path.join(gradual_FT_params["base_checkpoint_dir"], gradual_FT_params["experiment_name"])

gradual_FT_params["max_epochs"] = 10
batch_size = 2**10
learning_rate = 1e-3
weight_decay = 0.1
# make directory for checkpoints
os.makedirs(gradual_FT_params["checkpoint_dir"], exist_ok=True)

# Number of halving steps = floor(log2(federated size / clean size)).
# If the federated set is smaller than the clean set the ratio is 0 and log2 would raise,
# so run zero gradual steps in that case.
size_ratio = len(federated_data) // len(clean_data)
number_of_gradual_steps = int(math.log2(size_ratio)) if size_ratio >= 1 else 0

for step in range(1, number_of_gradual_steps + 1):
    temp_params = copy.deepcopy(gradual_FT_params)
    # calculate percentages for the datasets: halve the federated share each step, keep all clean data
    federated_percentage = 1/(2**step)
    clean_percentage = 1
    percentages = [federated_percentage, clean_percentage]
    # compile the dataset
    # NOTE(review): the sampling loop visible in dataset_compiler only rebinds its loop
    # variable, so `percentages` may not change the returned frame - confirm.
    compiled_dataset = dataset_compiler(F_dataset=federated_data, S0_dataset=clean_data, percentages=percentages)
    compiled_loader = MyDataset(compiled_dataset)
    compiled_loader = DataLoader(compiled_loader, batch_size=batch_size, shuffle=True)

    # train the model using the compiled dataset
    temp_params["experiment_name"] = gradual_FT_params["experiment_name"] + "_step_{}".format(step)
    Grad_FT_model, train_losses, val_losses = train_model(model, compiled_loader, validation_loader, num_epochs=gradual_FT_params["max_epochs"], lr=learning_rate, weight_decay=weight_decay)

    plot_losses([train_losses, val_losses], title="Losses on compiled dataset"+" "+temp_params["experiment_name"])

    # save the model returned by this step, and carry it into the next step
    save_model(Grad_FT_model, params=temp_params)
    model = Grad_FT_model


# final step with the clean data
# fresh copy so the parameters exist even when no gradual step ran
temp_params = copy.deepcopy(gradual_FT_params)
temp_params["experiment_name"] = gradual_FT_params["experiment_name"] + "_step_{}".format(-1)

# lr is fixed at 1e-3 here and weight_decay is left at train_model's default
Grad_FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=gradual_FT_params["max_epochs"], lr=1e-3)

plot_losses([train_losses, val_losses], title="Losses on compiled dataset"+" "+temp_params["experiment_name"])
save_model(Grad_FT_model, params=temp_params)




In [None]:
# Gradual fine-tuning experiment (broad: lr=1e-4, wd=0.1; gradual: lr=1e-4, wd=0.1).
# Stage 1 - broad tuning: train a freshly initialised Classifier on federated_loader.
# Stage 2 - gradual steps: step k requests 1/2**k of the federated data plus all of the
#           clean data from dataset_compiler and trains on the result.
# Stage 3 - final step: train on training_loader only.
# Relies on names from earlier cells: Classifier, train_model, plot_losses, save_model,
# dataset_compiler, MyDataset, DataLoader, base_params, the data frames/loaders, math, os.
import copy

# start from a fresh model; loading the saved broad-tuned checkpoint is disabled below
model = Classifier()
# holds the most recently fine-tuned model once the gradual stage has run
Grad_FT_model = None

# broad tuning on federated dataset (note: this updates the shared base_params in place)
base_params["experiment_name"] = "broad_tuning"
base_params["checkpoint_dir"] = os.path.join(base_params["base_checkpoint_dir"], base_params["experiment_name"])
learning_rate = 1e-4
weight_decay = 0.1

# model = load_model(model, base_params["checkpoint_dir"], latest=True)
model, train_losses, val_losses = train_model(model, federated_loader, validation_loader, num_epochs=20, lr=learning_rate, weight_decay=weight_decay)
plot_losses([train_losses, val_losses], title="Losses on compiled dataset")

# set the parameters for the gradual fine tuning (deep copy keeps base_params untouched from here on)
gradual_FT_params = copy.deepcopy(base_params)
gradual_FT_params["experiment_name"] = "gradual_FT"
gradual_FT_params["checkpoint_dir"] = os.path.join(gradual_FT_params["base_checkpoint_dir"], gradual_FT_params["experiment_name"])

gradual_FT_params["max_epochs"] = 10
batch_size = 2**10
learning_rate = 1e-4
weight_decay = 0.1
# make directory for checkpoints
os.makedirs(gradual_FT_params["checkpoint_dir"], exist_ok=True)

# Number of halving steps = floor(log2(federated size / clean size)).
# If the federated set is smaller than the clean set the ratio is 0 and log2 would raise,
# so run zero gradual steps in that case.
size_ratio = len(federated_data) // len(clean_data)
number_of_gradual_steps = int(math.log2(size_ratio)) if size_ratio >= 1 else 0

for step in range(1, number_of_gradual_steps + 1):
    temp_params = copy.deepcopy(gradual_FT_params)
    # calculate percentages for the datasets: halve the federated share each step, keep all clean data
    federated_percentage = 1/(2**step)
    clean_percentage = 1
    percentages = [federated_percentage, clean_percentage]
    # compile the dataset
    # NOTE(review): the sampling loop visible in dataset_compiler only rebinds its loop
    # variable, so `percentages` may not change the returned frame - confirm.
    compiled_dataset = dataset_compiler(F_dataset=federated_data, S0_dataset=clean_data, percentages=percentages)
    compiled_loader = MyDataset(compiled_dataset)
    compiled_loader = DataLoader(compiled_loader, batch_size=batch_size, shuffle=True)

    # train the model using the compiled dataset
    temp_params["experiment_name"] = gradual_FT_params["experiment_name"] + "_step_{}".format(step)
    Grad_FT_model, train_losses, val_losses = train_model(model, compiled_loader, validation_loader, num_epochs=gradual_FT_params["max_epochs"], lr=learning_rate, weight_decay=weight_decay)

    plot_losses([train_losses, val_losses], title="Losses on compiled dataset"+" "+temp_params["experiment_name"])

    # save the model returned by this step, and carry it into the next step
    save_model(Grad_FT_model, params=temp_params)
    model = Grad_FT_model


# final step with the clean data
# fresh copy so the parameters exist even when no gradual step ran
temp_params = copy.deepcopy(gradual_FT_params)
temp_params["experiment_name"] = gradual_FT_params["experiment_name"] + "_step_{}".format(-1)

# NOTE(review): lr is hard-coded to 1e-3 here although the gradual steps above use
# learning_rate = 1e-4; weight_decay is left at train_model's default - confirm intended.
Grad_FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=gradual_FT_params["max_epochs"], lr=1e-3)

plot_losses([train_losses, val_losses], title="Losses on compiled dataset"+" "+temp_params["experiment_name"])
save_model(Grad_FT_model, params=temp_params)




In [None]:
# Gradual fine-tuning experiment (broad: lr=1e-5, wd=0.1; gradual: lr=1e-3, wd=0.1).
# Stage 1 - broad tuning: train a freshly initialised Classifier on federated_loader.
# Stage 2 - gradual steps: step k requests 1/2**k of the federated data plus all of the
#           clean data from dataset_compiler and trains on the result.
# Stage 3 - final step: train on training_loader only.
# Relies on names from earlier cells: Classifier, train_model, plot_losses, save_model,
# dataset_compiler, MyDataset, DataLoader, base_params, the data frames/loaders, math, os.
import copy

# start from a fresh model; loading the saved broad-tuned checkpoint is disabled below
model = Classifier()
# holds the most recently fine-tuned model once the gradual stage has run
Grad_FT_model = None

# broad tuning on federated dataset (note: this updates the shared base_params in place)
base_params["experiment_name"] = "broad_tuning"
base_params["checkpoint_dir"] = os.path.join(base_params["base_checkpoint_dir"], base_params["experiment_name"])
learning_rate = 1e-5
weight_decay = 0.1

# model = load_model(model, base_params["checkpoint_dir"], latest=True)
model, train_losses, val_losses = train_model(model, federated_loader, validation_loader, num_epochs=20, lr=learning_rate, weight_decay=weight_decay)
plot_losses([train_losses, val_losses], title="Losses on compiled dataset")

# set the parameters for the gradual fine tuning (deep copy keeps base_params untouched from here on)
gradual_FT_params = copy.deepcopy(base_params)
gradual_FT_params["experiment_name"] = "gradual_FT"
gradual_FT_params["checkpoint_dir"] = os.path.join(gradual_FT_params["base_checkpoint_dir"], gradual_FT_params["experiment_name"])

gradual_FT_params["max_epochs"] = 10
batch_size = 2**10
learning_rate = 1e-3
weight_decay = 0.1
# make directory for checkpoints
os.makedirs(gradual_FT_params["checkpoint_dir"], exist_ok=True)

# Number of halving steps = floor(log2(federated size / clean size)).
# If the federated set is smaller than the clean set the ratio is 0 and log2 would raise,
# so run zero gradual steps in that case.
size_ratio = len(federated_data) // len(clean_data)
number_of_gradual_steps = int(math.log2(size_ratio)) if size_ratio >= 1 else 0

for step in range(1, number_of_gradual_steps + 1):
    temp_params = copy.deepcopy(gradual_FT_params)
    # calculate percentages for the datasets: halve the federated share each step, keep all clean data
    federated_percentage = 1/(2**step)
    clean_percentage = 1
    percentages = [federated_percentage, clean_percentage]
    # compile the dataset
    # NOTE(review): the sampling loop visible in dataset_compiler only rebinds its loop
    # variable, so `percentages` may not change the returned frame - confirm.
    compiled_dataset = dataset_compiler(F_dataset=federated_data, S0_dataset=clean_data, percentages=percentages)
    compiled_loader = MyDataset(compiled_dataset)
    compiled_loader = DataLoader(compiled_loader, batch_size=batch_size, shuffle=True)

    # train the model using the compiled dataset
    temp_params["experiment_name"] = gradual_FT_params["experiment_name"] + "_step_{}".format(step)
    Grad_FT_model, train_losses, val_losses = train_model(model, compiled_loader, validation_loader, num_epochs=gradual_FT_params["max_epochs"], lr=learning_rate, weight_decay=weight_decay)

    plot_losses([train_losses, val_losses], title="Losses on compiled dataset"+" "+temp_params["experiment_name"])

    # save the model returned by this step, and carry it into the next step
    save_model(Grad_FT_model, params=temp_params)
    model = Grad_FT_model


# final step with the clean data
# fresh copy so the parameters exist even when no gradual step ran
temp_params = copy.deepcopy(gradual_FT_params)
temp_params["experiment_name"] = gradual_FT_params["experiment_name"] + "_step_{}".format(-1)

# lr is fixed at 1e-3 here and weight_decay is left at train_model's default
Grad_FT_model, train_losses, val_losses = train_model(model, training_loader, validation_loader, num_epochs=gradual_FT_params["max_epochs"], lr=1e-3)

plot_losses([train_losses, val_losses], title="Losses on compiled dataset"+" "+temp_params["experiment_name"])
save_model(Grad_FT_model, params=temp_params)




In [None]:
# Fine tuning with a Tanimoto-similarity rank made available for the loss calculation.
# Each clean_data fingerprint ("FP" column) is scored against validation_data via
# calculate_target_similarity with mean="mean"; the score is stored in a new rank column.
rank = 'tanimoto_rank'

# Scoring of federated_data is currently disabled:
# federated_data[rank] = federated_data["FP"].apply(calculate_target_similarity,target_set=validation_data, mean="mean")
clean_data[rank] = clean_data["FP"].apply(
    lambda fingerprint: calculate_target_similarity(
        fingerprint, target_set=validation_data, mean="mean"
    )
)



In [None]:
# Print the sum of the rank column of clean_data as a quick check of the computed values.
rank_total = np.sum(clean_data[rank])
print(rank_total)


In [None]:
# Sanity check: compare the first federated fingerprint with itself.
# (Presumably this should give the maximum similarity - confirm against
# calculate_tanimoto_similarity.)
fingerprints = federated_data["FP"].to_numpy()
FP = fingerprints[0]

calculate_tanimoto_similarity(FP, FP)

In [None]:

# Build rank-aware loaders for the federated and clean data, then train on the
# federated loader twice: once starting from the latest "broad_tuning" checkpoint
# and once from a freshly initialised Classifier.

# wrap both frames as datasets first, then as shuffling DataLoaders
federated_loader = MyDataset(federated_data, rank=rank)
training_loader = MyDataset(clean_data, rank=rank)
federated_loader = DataLoader(federated_loader, shuffle=True, batch_size=25000)
training_loader = DataLoader(training_loader, shuffle=True, batch_size=2**10)

# hyperparameters shared by both training runs below
learning_rate = 1e-4
weight_decay = 0.1
train_kwargs = dict(num_epochs=20, lr=learning_rate, weight_decay=weight_decay)

# point base_params at the broad tuning experiment (updates the shared dict in place)
base_params["experiment_name"] = "broad_tuning"
base_params["checkpoint_dir"] = os.path.join(
    base_params["base_checkpoint_dir"],
    base_params["experiment_name"],
)

# run 1: restore the latest broad-tuned checkpoint and retrain on the federated data
model = load_model(Classifier(), base_params["checkpoint_dir"], latest=True)
model, train_losses, val_losses = train_model(model, federated_loader, validation_loader, **train_kwargs)
plot_losses([train_losses, val_losses], title="Losses on compiled dataset")

# run 2: same training, starting from scratch
model = Classifier()
model, train_losses, val_losses = train_model(model, federated_loader, validation_loader, **train_kwargs)
plot_losses([train_losses, val_losses], title="Losses on compiled dataset")


