In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from collections.abc import Iterable
import time
import math 
import sys
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import csv
import os
import gpytorch
import warnings
warnings.filterwarnings("ignore", message="The input matches the stored training data. Did you forget to call model.train()?") 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse, mean_absolute_percentage_error as mape
import gc
from tqdm import trange, tqdm
# Run everything in double precision: tensors created without an explicit
# dtype pick up this default as well.
dataType = torch.float64
torch.set_default_dtype(dataType)

# Use the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
class Exact_GP(gpytorch.models.ExactGP):
    """Exact Gaussian-process regressor with a constant mean and a scaled RBF kernel."""

    def __init__(self, train_x, train_y, likelihood):
        """Register the training data/likelihood and build the mean and kernel modules."""
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        """Return the multivariate normal defined by the mean and kernel evaluated at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

In [2]:
# --- Run configuration -------------------------------------------------------
noise = 30  # ADJUST: level of Gaussian noise added to the outputs
mod_type = 'gpr'
description = mod_type + '_noise-' + str(noise)
filename = '../datasets/fuchs_v3-2_seed-5_points_25000_noise_' + str(noise) + '.csv'  # CHANGE TO DESIRED DATA FILE
df = pd.read_csv(filename)

# --- Column selection --------------------------------------------------------
input_list = ['Intensity_(W_cm2)', 'Target_Thickness (um)', 'Focal_Distance_(um)']  # independent variables
output_list = ['Max_Proton_Energy_(MeV)', 'Total_Proton_Energy_(MeV)', 'Avg_Proton_Energy_(MeV)',
               'Max_Proton_Energy_Exact_(MeV)', 'Total_Proton_Energy_Exact_(MeV)', 'Avg_Proton_Energy_Exact_(MeV)']  # training outputs

# --- Log scaling -------------------------------------------------------------
# Only the intensity (first input column) is log-scaled on the input side;
# every energy column is log-scaled on the output side.
X = df[input_list].copy()
intensity_col = X.columns[0]
X[intensity_col] = np.log(X[intensity_col])
y = np.log(df[output_list])

X = X.to_numpy()
y = y.to_numpy()

# --- Train/test split (ordered, no shuffling) --------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False)

# Keep only `pct` percent of the training split (50% -> 10,000 of the 20,000
# training rows) and only the first three (noisy) target columns.
pct = 50
len_df = int(len(X_train) * (pct / 100))
X_train = X_train[:len_df]
y_train = y_train[:len_df, 0:3]

# --- Standardisation (statistics come from the training subset only) ---------
ss_in = StandardScaler().fit(X_train)
X_train_norm = ss_in.transform(X_train)

ss_out = StandardScaler().fit(y_train)
y_train_norm = ss_out.transform(y_train)

# --- Move to torch -----------------------------------------------------------
X_train_norm = torch.tensor(X_train_norm, dtype=dataType, device=device)
y_train_norm = torch.tensor(y_train_norm, dtype=dataType, device=device)

print('train ds length: ', len(X_train_norm))
print(X_train_norm.dtype)

train ds length:  10000
torch.float64


In [3]:
num_inputs = 3
num_outputs = 3
num_epochs = 30
lr = 2e-1

likelihoods = []
gprs = []
correction_factor = []

# Chained-output regression: output j is predicted from the original inputs
# plus the predictions of outputs 0..j-1.  The growing feature matrix lives in
# a cell-local name so that X_train_norm is never modified; that keeps this
# cell idempotent (it can be re-run without re-running the data cell).
X_chain = X_train_norm[:, :num_inputs]

for j in range(num_outputs):
    y_target = y_train_norm[:, j]

    # Train GPR
    likelihoods.append(gpytorch.likelihoods.GaussianLikelihood().to(device))
    gprs.append(Exact_GP(X_chain, y_target, likelihoods[j]).to(device))

    gprs[j].train()
    likelihoods[j].train()

    optimizer = torch.optim.Adam(gprs[j].parameters(), lr=lr)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihoods[j], gprs[j])

    for _ in range(num_epochs):
        optimizer.zero_grad()

        # Forward pass and negative marginal log-likelihood
        model_output = gprs[j](X_chain)
        loss = -mll(model_output, y_target)

        loss.backward()
        optimizer.step()

        # Release memory held by the previous iteration before the next one
        gc.collect()
        torch.cuda.empty_cache()

    gprs[j].eval()
    likelihoods[j].eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        # Predict on the training inputs (standardised log space)
        pred_dist_train = likelihoods[j](gprs[j](X_chain))
        y_train_predict = pred_dist_train.mean

        # Undo the standardisation of column j, then undo the log scaling.
        # scale_/mean_ are the per-column statistics that inverse_transform
        # applies, so no dummy (n, 3) matrix has to be built.
        y_train_predict_log = y_train_predict.cpu().numpy() * ss_out.scale_[j] + ss_out.mean_[j]
        y_train_predict_unscaled = np.exp(y_train_predict_log)

        # Corrections due to Log Scaling: mean ratio of true to predicted energy
        y_train_true = np.exp(y_train[:, j])
        correction_factor.append(np.mean(y_train_true / y_train_predict_unscaled))
        y_train_predict_corrected = y_train_predict_unscaled * correction_factor[j]
        print(mape(y_train_true, y_train_predict_corrected))

    # Append output to input for next iteration in chained output regression
    X_chain = torch.cat([X_chain, y_train_predict.reshape(-1, 1)], dim=1)

    gc.collect()
    torch.cuda.empty_cache()

0.25488250935377693
0.2551784720498006
0.25473899160757046


In [4]:
def model(X):
    """Predict the three proton energies with the chained GP regressors.

    Parameters
    ----------
    X : array-like, shape (n, num_inputs) or (num_inputs,)
        Raw (un-logged, un-scaled) inputs in the column order used to fit
        ``ss_in``.  The first column is log-scaled here before standardising.
        The argument itself is never modified.

    Returns
    -------
    tuple of numpy.ndarray
        ``(E_max, E_tot, E_avg)``, each of shape ``(n,)``, mapped back out of
        the standardised log space and multiplied by the per-output
        ``correction_factor``.

    Notes
    -----
    Relies on the notebook globals ``ss_in``, ``ss_out``, ``gprs``,
    ``likelihoods``, ``correction_factor``, ``num_inputs``, ``num_outputs``,
    ``dataType`` and ``device``.  The models are not switched to eval mode
    here; the caller is expected to have done that.
    """
    # Always work on a private float copy: accepts lists/integer arrays and
    # avoids truncating the log-intensity when X has an integer dtype.
    X_mod = np.array(X, dtype=np.float64).reshape(-1, num_inputs)
    X_mod[:, 0] = np.log(X_mod[:, 0])
    X_scaled = torch.tensor(ss_in.transform(X_mod), dtype=dataType, device=device)

    Energies = []
    # Pure inference: do not build an autograd graph for every batch.
    with torch.no_grad():
        for j in range(num_outputs):
            y_predict = likelihoods[j](gprs[j](X_scaled)).mean
            # Undo the standardisation of output column j, then the log scaling.
            y_log = y_predict.detach().cpu().numpy() * ss_out.scale_[j] + ss_out.mean_[j]
            Energies.append(np.exp(y_log) * correction_factor[j])
            # Chained regression: this prediction is an input feature of the next model.
            X_scaled = torch.cat([X_scaled, y_predict.reshape(-1, 1)], dim=1)

    E_max, E_tot, E_avg = Energies  # Max Energy, Total Energy, Average Energy
    return (E_max, E_tot, E_avg)

def generate_random_points(bounds, n, seed=0):
    """Draw ``n`` uniformly distributed points inside a box.

    Parameters
    ----------
    bounds : sequence of (low, high)
        One pair per dimension; each coordinate is drawn from
        ``uniform(low, high)``.
    n : int
        Number of points to draw.
    seed : int, optional
        Seed of the private generator.  The default ``0`` yields exactly the
        points the previous fixed-seed implementation produced.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, len(bounds))``.
    """
    # A private legacy RandomState gives the same stream as
    # np.random.seed(seed) + np.random.uniform, but without resetting the
    # global NumPy RNG behind the caller's back.
    rng = np.random.RandomState(seed)
    columns = [rng.uniform(bound[0], bound[1], n) for bound in bounds]
    return np.array(columns).transpose()

def split_points(points, batch_size):
    """Split ``points`` into consecutive batches of at most ``batch_size`` items.

    Parameters
    ----------
    points : sequence or numpy.ndarray
        Anything supporting ``len()`` and slicing; sliced along its first axis.
    batch_size : int
        Maximum number of items per batch; must be positive.

    Returns
    -------
    list
        Slices of ``points`` in order; the last one may be shorter.  An empty
        input yields an empty list.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive (the loop would never terminate).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    # Slicing clamps at the end of the sequence, so the final short batch
    # needs no special case.
    return [points[start:start + batch_size]
            for start in range(0, len(points), batch_size)]

In [5]:
# Make sure every model/likelihood pair is in prediction mode.
for gpr, likelihood in zip(gprs, likelihoods):
    gpr.eval()
    likelihood.eval()

# Sampling box per input column, in the order model() expects:
# intensity is held fixed (low == high); the other two are sampled uniformly.
bounds = [(1e19, 1e19), (0.5, 10.0), (0, 10.0)]
n_points = 100000
points = generate_random_points(bounds, n_points)
points_array = split_points(points, 1000)  # predict in batches of 1000 rows

output_columns = ['Intensity', 'Thickness', 'Offset', 'E Max', 'E Tot', 'E Avg']

# Collect one frame per batch and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on
# every iteration.
batch_frames = []
for batch in points_array:
    Emax, Etot, Eavg = model(batch)
    batch_frames.append(pd.DataFrame({
        'Intensity': batch[:, 0],
        'Thickness': batch[:, 1],
        'Offset': batch[:, 2],
        'E Max': Emax,
        'E Tot': Etot,
        'E Avg': Eavg,
    }, columns=output_columns))

if batch_frames:
    output_df = pd.concat(batch_frames, ignore_index=True)
else:
    # No batches: keep the expected (empty) table layout.
    output_df = pd.DataFrame(columns=output_columns)

output_df.head()

Unnamed: 0,Intensity,Thickness,Offset,E Max,E Tot,E Avg
0,1e+19,5.584942,2.177699,0.373143,111478000.0,0.092978
1,1e+19,9.08842,5.872497,0.134569,30540760.0,0.036712
2,1e+19,5.272767,9.876133,0.17668,38214660.0,0.044813
3,1e+19,1.458265,9.50867,0.549921,135410600.0,0.111632
4,1e+19,5.512029,3.429715,0.350828,101208800.0,0.087357


In [6]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('predictions_dfs', exist_ok=True)
output_df.to_csv('predictions_dfs/{}_noise={}_train_pts={}.csv'.format(mod_type, noise, len_df), index=False)