In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_log_error



In [2]:
# Read the four input tables; the bare file names are resolved relative to
# the notebook's working directory.
df_SOAP, df_MAGPIE, df_Basic, df_Class = (
    pd.read_csv(csv_name)
    for csv_name in ('SOAP.csv', 'MAGPIE.csv', 'Basic_Features.csv', 'Class.csv')
)

# Regression target: the 'tc' column of the basic-features table.
target = df_Basic['tc']

In [3]:
# Repeat the KRR fit on several random train/test splits. Returns one MSLE per run; average the list afterwards.
def runKRR_NTimes(N, features, targets, gam, alp, test_size=0.33, random_state=None):
    """Score an RBF kernel ridge regressor on N random train/test splits.

    Every repetition draws a new split of ``features``/``targets``, fits
    ``KernelRidge(kernel="rbf", gamma=gam, alpha=alp)`` on the training part,
    predicts the held-out part, clamps negative predictions to 0 and records
    the mean squared log error (MSLE) of the clamped predictions.

    Parameters
    ----------
    N : int
        Number of split/fit/score repetitions.
    features : array-like
        Feature matrix, one row per sample.
    targets : array-like
        Target values aligned row-by-row with ``features``.
    gam : float
        ``gamma`` of the RBF kernel.
    alp : float
        ``alpha`` (regularisation strength) of the kernel ridge model.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.33.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for the splits. One generator is shared by all
        repetitions, so the N splits differ from each other but the whole
        sequence is reproducible. ``None`` (default) leaves the splits unseeded.

    Returns
    -------
    list of float
        One MSLE value per repetition (empty when ``N <= 0``); callers
        average it themselves.
    """
    # Build a single generator up front: passing the same int to every split
    # would produce N identical splits, whereas a shared RandomState advances
    # between calls.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    MSLE = []
    for _ in range(N):
        # Split the data and targets into train and test parts.
        X_train, X_test, y_train, y_test = train_test_split(
            features, targets, test_size=test_size, random_state=rng
        )

        # Fit the kernel ridge model and predict the held-out samples.
        KRR = KernelRidge(kernel="rbf", gamma=gam, alpha=alp)
        KRR.fit(X_train, y_train)
        KRR_pred = KRR.predict(X_test)

        # A negative Tc prediction is treated as Tc = 0. np.clip returns a new
        # array, so the raw predictions are left untouched.
        KRR_pred_Pos = np.clip(KRR_pred, 0, None)

        # Mean squared log error (MSLE) of this repetition.
        MSLE.append(mean_squared_log_error(y_test, KRR_pred_Pos))
    return MSLE


In [4]:
# Combine the SOAP and MAGPIE feature tables column-wise into one feature matrix.
# DataFrame.join aligns rows on the index, so this assumes both CSVs list the
# samples in the same row order and share no column names — TODO confirm.
df_M_S = df_SOAP.join(df_MAGPIE)

In [5]:
# res = runKRR_NTimes(10,df_M_S,target,.1,.12)
# sum(res)/len(res)

In [6]:
# Repeat the KRR fit on several random train/test splits, training on arcsinh-transformed Tc.
# Returns one MSLE per run (computed back on the original Tc scale); average the list afterwards.
def runKRR_NTimes_log(N, features, targets, gam, alp, test_size=0.2, random_state=None):
    """Score an RBF kernel ridge regressor trained on arcsinh-scaled targets.

    The targets are transformed with ``np.arcsinh`` before splitting (despite
    the ``_log`` suffix, no logarithm is taken). Every repetition draws a new
    train/test split, fits ``KernelRidge(kernel="rbf", gamma=gam, alpha=alp)``
    on the transformed training targets, predicts the held-out part and clamps
    negative predictions to 0. Both the held-out targets and the clamped
    predictions are mapped back with ``np.sinh`` before the mean squared log
    error (MSLE) is computed, so the score is on the original target scale.

    Parameters
    ----------
    N : int
        Number of split/fit/score repetitions.
    features : array-like
        Feature matrix, one row per sample.
    targets : array-like
        Untransformed target values aligned row-by-row with ``features``.
    gam : float
        ``gamma`` of the RBF kernel.
    alp : float
        ``alpha`` (regularisation strength) of the kernel ridge model.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.2.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for the splits. One generator is shared by all
        repetitions, so the N splits differ from each other but the whole
        sequence is reproducible. ``None`` (default) leaves the splits unseeded.

    Returns
    -------
    list of float
        One MSLE value per repetition (empty when ``N <= 0``); callers
        average it themselves.
    """
    # Build a single generator up front: passing the same int to every split
    # would produce N identical splits, whereas a shared RandomState advances
    # between calls.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Train in arcsinh space; arcsinh(0) == 0 and the map is monotonic, so
    # clamping at 0 there is the same as clamping the back-transformed value.
    targets_L = np.arcsinh(targets)

    MSLE = []
    for _ in range(N):
        # Split the data and transformed targets into train and test parts.
        X_train, X_test, y_train, y_test = train_test_split(
            features, targets_L, test_size=test_size, random_state=rng
        )

        # Fit the kernel ridge model and predict the held-out samples.
        KRR = KernelRidge(kernel="rbf", gamma=gam, alpha=alp)
        KRR.fit(X_train, y_train)
        KRR_pred = KRR.predict(X_test)

        # A negative Tc prediction is treated as Tc = 0. np.clip returns a new
        # array, so the raw predictions are left untouched.
        KRR_pred_Pos = np.clip(KRR_pred, 0, None)

        # Undo the arcsinh transform on both sides, then score with MSLE.
        MSLE.append(mean_squared_log_error(np.sinh(y_test), np.sinh(KRR_pred_Pos)))
    return MSLE


In [14]:
# 50 random splits with gamma=0.1 and alpha=0.12; the cell displays the mean MSLE.
res = runKRR_NTimes_log(N=50, features=df_M_S, targets=target, gam=0.1, alp=0.12)
sum(res) / len(res)

0.6771205740265498

In [8]:
# Result for MAGPIE (7 PCA features) + SOAP (4 PCA features) with gamma=0.1, alpha=0.12: average MSLE = 0.67