In [1]:
device = "GPU" # CPU or GPU (must set manually, don't know how to do this automatically in rapids 22.10)
if device == "GPU":
    print("CUDA is available: using GPU")
    import cudf as pd
    import cupy as np
    import cuml
    from cuml.svm import SVR
    from cuml.model_selection import train_test_split
    from cuml.preprocessing import StandardScaler
    from cuml.metrics import mean_squared_error as mse
    def mape(y_true, y_pred):
        """Mean absolute percentage error, matching the sklearn metric used on CPU.

        Args:
            y_true: array of ground-truth values.
            y_pred: array of predictions, broadcastable against ``y_true``.

        Returns:
            Mean of ``|y_true - y_pred| / max(|y_true|, eps)`` as a 0-d array /
            scalar of the active array library (``np``).
        """
        # Divide by the magnitude of the target (not its signed value) so that
        # negative targets do not cancel positive ones, and clamp the
        # denominator so zero targets do not produce NaN/inf.
        eps = np.finfo(np.float64).eps
        return np.mean(np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), eps))
    import numpy
else:
    print("CUDA not available: using CPU")
    import numpy as np
    import pandas as pd
    from sklearn.svm import SVR
    from sklearn.metrics import mean_squared_error as mse, r2_score, mean_absolute_percentage_error as mape
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm, trange
import itertools
import time
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from datetime import datetime
import os
import csv

CUDA is available: using GPU


# Set Noise Level and Import Data

In [2]:
# --- Dataset selection ---
# `noise` is only used here to build the run description and to pick which
# pre-generated CSV is loaded.
noise = 30 # ADJUST level of gaussian noise added to outputs
mod_type = 'svr'
description = mod_type + '_noise-' + str(noise)
filename = '../datasets/fuchs_v3-2_seed-5_points_25000_noise_' + str(noise) + '.csv'  # CHANGE TO DESIRED DATA FILE
df = pd.read_csv(filename)

input_list = ['Intensity_(W_cm2)', 'Target_Thickness (um)', 'Focal_Distance_(um)'] # independent variables
output_list = ['Max_Proton_Energy_(MeV)', 'Total_Proton_Energy_(MeV)', 'Avg_Proton_Energy_(MeV)',
               'Max_Proton_Energy_Exact_(MeV)', 'Total_Proton_Energy_Exact_(MeV)', 'Avg_Proton_Energy_Exact_(MeV)'] # training outputs

# Work on copies so the log transforms below do not touch `df`.
X = df[input_list].copy()
y = df[output_list].copy()
X[X.columns[0]] = np.log(X[X.columns[0]]) # Apply log scaling to intensity
for col in y.columns:
    y[col] = np.log(y[col]) # Apply log scaling to energy

dataType = 'float32'

# Convert the frames to plain arrays of the active backend (cupy on GPU, numpy on CPU).
if device == "GPU":
    X = X.to_cupy().astype(dtype=dataType)
    y = y.to_cupy().astype(dtype=dataType)
else:
    X = X.to_numpy().astype(dtype=dataType)
    y = y.to_numpy().astype(dtype=dataType)

# shuffle=False: the split follows the row order of the file.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle = False)
# Train only on the first three entries of output_list (the columns without the
# `_Exact` suffix); y_test keeps all six columns.
y_train = y_train[:, 0:3]

# Keep only the first pct% of the training split. With pct = 50 that is half of it
# (10,000 rows if the file holds the 25,000 points its name suggests).
pct = 50
len_df = int(len(X_train)*(pct/100))
X_train = X_train[0:len_df]
y_train = y_train[0:len_df]

# Apply standard scaler z-score normalization
# Both scalers are fitted on the reduced training subset only.
ss_in = StandardScaler()
ss_in.fit(X_train)
X_train_norm = ss_in.transform(X_train)

ss_out = StandardScaler()
ss_out.fit(y_train)
y_train_norm = ss_out.transform(y_train)

# Train SVR Model

In [3]:
# SVR hyperparameters shared by every per-output model.
C = 2.5
epsilon = 1e-2
tol = 1e-3

num_inputs = 3   # number of columns in input_list / fitted by ss_in
num_outputs = 3  # number of target columns kept in y_train / fitted by ss_out

# The loop below appends one prediction column per output to X_train_norm.
# Drop any columns appended by a previous run of this cell so that re-running
# it trains on the same features instead of stacking extra columns.
X_train_norm = X_train_norm[:, :num_inputs]

# One SVR per output, chained: model j is trained on the scaled inputs plus the
# (scaled) predictions of models 0..j-1.
svrs = []
correction_factor = []
for j in range(num_outputs):
    svrs.append(SVR(C=C, epsilon=epsilon, tol=tol))
    svrs[j].fit(X_train_norm, y_train_norm[:, j])
    y_train_predict = svrs[j].predict(X_train_norm)
    # ss_out was fitted on num_outputs columns, so tile the single predicted
    # column to that width, invert the scaling, and keep column j.
    y_train_predict_unscaled = np.exp(ss_out.inverse_transform(y_train_predict.reshape(-1, 1).repeat(num_outputs, 1)))[:, j]
    # Mean ratio of true to predicted values in linear (non-log) units; applied
    # as a multiplicative correction to the back-transformed predictions.
    correction_factor.append(np.mean(np.exp(y_train[:, j])/y_train_predict_unscaled))
    y_train_predict_corrected = y_train_predict_unscaled*correction_factor[j]
    print(mape(np.exp(y_train[:, j]), y_train_predict_corrected))
    # Feed this model's scaled prediction to the next model as an extra feature.
    X_train_norm = np.concatenate([X_train_norm, y_train_predict.reshape(-1, 1)], axis=1)
print('correction factors: ', correction_factor)

0.25340605
0.25292885
0.25300977
correction factors:  [array(1.0418509, dtype=float32), array(1.041774, dtype=float32), array(1.0422736, dtype=float32)]


# Define Cost Function for Minimization

In [4]:
def model(X):
    """Predict energies for raw (unscaled, non-log) input points with the chained SVRs.

    Relies on the notebook globals ``num_inputs``, ``num_outputs``, ``ss_in``,
    ``ss_out``, ``svrs`` and ``correction_factor`` produced by the data and
    training cells.

    Args:
        X: array-like reshapeable to ``(-1, num_inputs)``; column 0 is
            log-transformed before scaling, mirroring the training
            preprocessing. The caller's data is not modified.

    Returns:
        Tuple with one 1-D array per output, in training order
        (E_max, E_tot, E_avg for the three outputs used here), already
        back-transformed from log/normalised space and multiplied by the
        per-output correction factor.
    """
    # np.array copies, so the in-place log below never touches the caller's X.
    X_mod = np.array(X).reshape(-1, num_inputs)
    if X_mod.dtype.kind != 'f':
        # Integer input would silently truncate the log-intensity on assignment.
        X_mod = X_mod.astype('float64')
    X_mod[:, 0] = np.log(X_mod[:, 0])
    X_scaled = ss_in.transform(X_mod)

    Energies = []
    for j in range(num_outputs):
        y_predict = svrs[j].predict(X_scaled)
        # ss_out expects num_outputs columns: tile the prediction, invert the
        # scaling, then keep only column j.
        y_unscaled = ss_out.inverse_transform(y_predict.reshape(-1, 1).repeat(num_outputs, 1))[:, j]
        Energies.append(np.exp(y_unscaled)*correction_factor[j])
        # Model j+1 was trained with the scaled predictions of models 0..j appended.
        X_scaled = np.concatenate([X_scaled, y_predict.reshape(-1, 1)], axis=1)
    return tuple(Energies)  # (E_max, E_tot, E_avg) for num_outputs == 3

def generate_random_points(bounds, n, seed=0):
    """Draw ``n`` reproducible uniform random points inside a box.

    Args:
        bounds: sequence of ``(low, high)`` pairs, one per dimension.
        n: number of points to draw.
        seed: seed for the private random generator (default 0, which yields
            the same points as seeding the global NumPy RNG with 0).

    Returns:
        Array of shape ``(n, len(bounds))`` created with the active array
        library (``np``); column ``k`` is sampled from ``bounds[k]``.
    """
    # Imported locally because the CPU branch of the notebook only binds NumPy
    # as `np`, so the bare name `numpy` is otherwise undefined there.
    import numpy

    # A private RandomState gives the same stream as numpy.random.seed(seed)
    # without resetting the global RNG for the rest of the notebook.
    rng = numpy.random.RandomState(seed)
    points = [rng.uniform(low, high, n) for low, high in bounds]
    return np.array(points).transpose()

In [5]:
# Sampling box, one (low, high) pair per model input; the first pair has
# low == high, so the first input is held fixed at 1e19.
bounds = [(1e19, 1e19), (0.5, 10.0), (0, 10.0)]
n_points = 100000

# Draw the sample points and evaluate the trained model on all of them at once.
points = generate_random_points(bounds, n_points)
Emax, Etot, Eavg = model(points)

# Assemble sampled inputs and predicted energies into a single table.
output_df = pd.DataFrame({
    'Intensity': points[:, 0],
    'Thickness': points[:, 1],
    'Offset': points[:, 2],
    'E Max': Emax,
    'E Tot': Etot,
    'E Avg': Eavg,
})
output_df.head()

Unnamed: 0,Intensity,Thickness,Offset,E Max,E Tot,E Avg
0,1e+19,5.584942,2.177699,0.366124,107148280.0,0.09036
1,1e+19,9.08842,5.872497,0.125916,27613222.0,0.034055
2,1e+19,5.272767,9.876133,0.189112,40440328.0,0.047701
3,1e+19,1.458265,9.50867,0.59614,139035440.0,0.120011
4,1e+19,5.512029,3.429715,0.343987,98597472.0,0.085412


In [6]:
# Write the prediction table to predictions_dfs/<model>_noise=<noise>_train_pts=<n>.csv
output_path = 'predictions_dfs/{}_noise={}_train_pts={}.csv'.format(mod_type, noise, len_df)
# Create the target folder on first use instead of failing when it is missing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# cudf frames must be moved to host memory via to_pandas(); in CPU mode
# output_df is already a pandas DataFrame, which has no to_pandas().
output_host_df = output_df.to_pandas() if hasattr(output_df, 'to_pandas') else output_df
output_host_df.to_csv(output_path, index=False)