In [1]:
from torch_geometric.datasets import MoleculeNet
import torch
import numpy as np
import pandas as pd
from sklearn import metrics
import scipy.stats as stats
import models

## Import Dataset

In [2]:
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math

class FingerprintDataset(Dataset):
    """In-memory dataset of fingerprint rows read from a CSV file.

    The file ``./fingerprints/bace_fingerprints_reg_ale_{split}.csv`` is parsed
    as comma-separated ``float32`` values with its first line skipped.
    Column 0 becomes the target ``y`` (kept two-dimensional, one column) and
    every remaining column becomes the feature matrix ``x``.
    """

    def __init__(self, split):
        """Read the CSV belonging to ``split`` and store it as tensors.

        Args:
            split: Suffix inserted into the file name (the notebook uses
                ``'train'`` and ``'test'``).
        """
        # ndmin=2 keeps the array two-dimensional even when the file holds a
        # single data row (or a single column); without it np.loadtxt returns
        # a 1-D array and the column slicing below raises IndexError.
        xy = np.loadtxt(f'./fingerprints/bace_fingerprints_reg_ale_{split}.csv', delimiter=',', dtype=np.float32, skiprows=1, ndmin=2)

        self.x = torch.from_numpy(xy[:, 1:])
        # Indexing with a list ([0]) keeps the target as shape (n_samples, 1).
        self.y = torch.from_numpy(xy[:, [0]])
        self.n_samples = xy.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of rows read from the file."""
        return self.n_samples

In [3]:
# Build the train split first, then the test split, from their CSV files.
train_dataset, test_dataset = (
    FingerprintDataset(split=split_name) for split_name in ('train', 'test')
)

In [4]:
# Specify device: prefer the second GPU (index 1) when it exists, otherwise
# fall back to the first GPU, and to the CPU when CUDA is unavailable.
# Selecting "cuda:1" on a single-GPU machine would only fail later, when the
# model or a batch is moved to the device.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Set Hyperparameters

In [5]:
import best_config

# Number of training epochs, read from the 'no_of_epochs' entry of
# best_config.GLOBALPARAMETERS; it bounds the epoch loop in the training cell.
no_of_epochs = best_config.GLOBALPARAMETERS['no_of_epochs']

In [6]:
# Get model parameters. Plain attribute access replaces eval() on a
# concatenated string: same object, no dynamic code execution.
hyperparameters = best_config.MLP_HYPERPARAMETERS

# `model_params` is the same object as `hyperparameters` (no copy is made), so
# the assignment below also adds 'feature_size' to the mapping held by
# best_config. The value is the number of feature columns in the train split.
model_params = hyperparameters
model_params['feature_size'] = train_dataset.x.shape[1]

## Define Loss and Data Loaders

In [7]:
from torch.utils.data import DataLoader
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# Mean squared error (MSELoss does not take the square root)
loss_fn = torch.nn.MSELoss()

NUM_FINGERPRINTS_PER_BATCH = model_params['batch_size']
# Training batches are reshuffled on every pass over the loader.
train_loader = DataLoader(train_dataset,
                    batch_size=NUM_FINGERPRINTS_PER_BATCH, shuffle=True)
# Evaluation data is read in dataset order so the prediction table produced
# from this loader has a reproducible row order.
test_loader = DataLoader(test_dataset,
                    batch_size=NUM_FINGERPRINTS_PER_BATCH, shuffle=False)

## Training Loop

In [8]:
class Engine:
    """Run training, evaluation and prediction passes for one model.

    The engine owns the model, its optimizer and the target device, and uses
    ``torch.nn.MSELoss`` as the loss in ``train`` and ``evaluate``.
    """

    def __init__(self, model, model_params, optimizer, device):
        """Store the collaborators used by the pass methods.

        Args:
            model: Module called on each batch of fingerprints.
            model_params: Stored on the instance; not read by this class.
            optimizer: Optimizer stepped once per training batch.
            device: Device every batch is moved to before the forward pass.
        """
        self.model = model
        self.model_params = model_params
        self.optimizer = optimizer
        self.device = device
        self.loss_fn = torch.nn.MSELoss()

    def train(self, loader):
        """Run one optimisation pass over ``loader``.

        Returns:
            The sum of the per-batch losses divided by ``len(loader)``.
        """
        self.model.train()
        final_loss = 0
        for fingerprint, labels in loader:
            self.optimizer.zero_grad()
            fingerprint = fingerprint.to(self.device)
            labels = labels.to(self.device)
            # Forward pass
            outputs = self.model(fingerprint)
            loss = self.loss_fn(outputs, labels)
            final_loss += loss.item()
            # Backward and optimize
            loss.backward()
            self.optimizer.step()
        return final_loss / len(loader)

    def evaluate(self, loader):
        """Compute the loss over ``loader`` without updating the model.

        Returns:
            The sum of the per-batch losses divided by ``len(loader)``.
        """
        self.model.eval()
        final_loss = 0
        # No gradients are needed here, so skip building the autograd graph.
        with torch.no_grad():
            for fingerprint, labels in loader:
                fingerprint = fingerprint.to(self.device)
                labels = labels.to(self.device)
                outputs = self.model(fingerprint)
                loss = self.loss_fn(outputs, labels)
                final_loss += loss.item()
        return final_loss / len(loader)

    def test(self, loader):
        """Collect predictions and targets for every sample in ``loader``.

        Uses the engine's own model and device rather than notebook globals,
        so the method works for any ``Engine`` instance.

        Returns:
            A DataFrame with columns ``y_pred`` and ``y_real``, one row per
            sample, each holding the first element of that sample's model
            output / label.
        """
        self.model.eval()
        true_values = []
        predictions = []
        with torch.no_grad():
            for fingerprint, labels in loader:
                fingerprint = fingerprint.to(self.device)
                labels = labels.to(self.device)
                output = self.model(fingerprint)
                # Each per-sample entry is a list; keep its first element.
                true_values += [row[0] for row in labels.tolist()]
                predictions += [row[0] for row in output.tolist()]
        return pd.DataFrame({'y_pred': predictions, 'y_real': true_values})
    
def get_results(df):
    """Summarise regression quality for a table of predictions.

    Args:
        df: DataFrame with the columns ``y_real`` and ``y_pred``.

    Returns:
        A one-row DataFrame with the columns ``Mean Square Error``,
        ``Spearman``, ``Pearson`` and ``R``. The ``R`` column holds the value
        returned by ``metrics.r2_score``.
    """
    observed = df["y_real"]
    predicted = df["y_pred"]

    # First element of the spearmanr result is the correlation coefficient.
    spearman_rho = stats.spearmanr(observed, predicted)[0]
    # Off-diagonal entry of the 2x2 correlation matrix.
    pearson_r = np.corrcoef(observed, predicted)[0, 1]

    row = {
        'Mean Square Error': metrics.mean_squared_error(observed, predicted),
        'Spearman': spearman_rho,
        'Pearson': pearson_r,
        'R': metrics.r2_score(observed, predicted),
    }
    return pd.DataFrame([row])

In [9]:
# Column layout of the summary table. Used as the empty fallback below and
# kept under its original name so later cells can still refer to it.
results = {
            'Mean Square Error':[],
            'Spearman':[],
            'Pearson':[],
            'R':[],
        }

# One metrics frame per training run. DataFrame.append was removed in
# pandas 2.0, so the frames are collected and concatenated once at the end.
run_summaries = []
for i in range(1):
    print(f"Starting training {i}")
    # Direct constructor call instead of eval() on a concatenated string.
    model = models.MLP_REGRESSION(model_params)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=model_params['learning_rate'])
    eng = Engine(model, model_params, optimizer, device)
    for epoch in range(no_of_epochs):
        # The per-epoch training loss is computed but not reported.
        loss = eng.train(train_loader)
    df = eng.test(test_loader)
    run_summaries.append(get_results(df))

# Row indices are left as produced by get_results, matching the old append().
summary_df = pd.concat(run_summaries) if run_summaries else pd.DataFrame(results)

# NOTE(review): absolute, machine-specific path. The prefix has no trailing
# separator, so the file is written as '.../baseline_results/resultsMLP_summary.csv'
# — confirm whether 'results/' was intended.
filepath = '/home/rajeckidoyle/Documents/Classification/BACE_Classification/regressionandunccertainty/baseline_results/results'
summary_df.to_csv(filepath+'MLP_summary.csv')


Starting training 0


In [10]:
# Write a second copy of the summary table into the ground-truth results folder.
# NOTE(review): absolute, machine-specific path.
filepath = '/home/rajeckidoyle/Documents/Classification/BACE_Classification/regressionandunccertainty/results/ground_truth/'
summary_df.to_csv(f'{filepath}MLP_summary.csv')