In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_log_error



In [7]:
# Read the SOAP, MAGPIE and basic feature tables plus the class table from CSV
# files in the current working directory (read in the order listed).
df_SOAP, df_MAGPIE, df_Basic, df_Class = (
    pd.read_csv(csv_name)
    for csv_name in ('SOAP.csv', 'MAGPIE.csv', 'Basic_Features.csv', 'Class.csv')
)
# The regression target is the 'tc' column of the basic feature table.
target = df_Basic['tc']

In [9]:
# Compress the target's range with the inverse hyperbolic sine. Despite the
# "_log" name this is arcsinh, which (unlike a plain log) is defined at 0;
# np.sinh undoes the transform.
tc_log=np.arcsinh(target)

In [10]:
# Scratch check: value of the arcsinh transform for a raw input of 2.
np.arcsinh(2)

1.4436354751788103

In [11]:
# Scratch check: value of the arcsinh transform for a raw input of 130
# (presumably near the upper end of the target range -- TODO confirm).
np.arcsinh(130)

5.560696423586702

In [12]:
# Display the arcsinh-transformed target series.
tc_log

0       1.698007
1       1.508773
2       1.690899
3       1.015973
4       1.879864
          ...   
5768    0.000000
5769    0.000000
5770    0.771847
5771    0.292479
5772    0.879015
Name: tc, Length: 5773, dtype: float64

In [13]:
# Display the raw target series for comparison with the transformed one.
target

0       2.640000
1       2.150000
2       2.620000
3       1.200000
4       3.200000
          ...   
5768    0.000000
5769    0.000000
5770    0.850800
5771    0.296667
5772    0.996667
Name: tc, Length: 5773, dtype: float64

In [14]:
# Kernel ridge regression on the SOAP features, predicting the raw target.
# Hold out 33% of the rows for testing; the fixed seed keeps this split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df_SOAP, target, test_size=0.33, random_state=21)

# Fit an RBF-kernel ridge regressor on the training split.
KRR = KernelRidge(kernel="rbf", gamma=2.0, alpha=.1)
KRR.fit(X_train, y_train)

# Predict the held-out rows.
KRR_pred = KRR.predict(X_test)

# Clamp negative predictions to 0: mean_squared_log_error rejects negative
# values. np.maximum returns a new array, so KRR_pred itself is left untouched.
KRR_pred_Pos = np.maximum(KRR_pred, 0)

# Mean squared log error (MSLE) on the held-out rows.
mean_squared_log_error(y_test, KRR_pred_Pos)

1.028543940495541

In [18]:
# Kernel ridge regression on the MAGPIE features, predicting the raw target.
# Hold out 33% of the rows for testing; the fixed seed keeps this split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df_MAGPIE, target, test_size=0.33, random_state=21)

# Fit an RBF-kernel ridge regressor on the training split.
KRR = KernelRidge(kernel="rbf", gamma=2.0, alpha=.1)
KRR.fit(X_train, y_train)

# Predict the held-out rows.
KRR_pred = KRR.predict(X_test)

# Clamp negative predictions to 0: mean_squared_log_error rejects negative
# values. np.maximum returns a new array, so KRR_pred itself is left untouched.
KRR_pred_Pos = np.maximum(KRR_pred, 0)

# Mean squared log error (MSLE) on the held-out rows.
mean_squared_log_error(y_test, KRR_pred_Pos)

0.9251336015693028

In [19]:
# Kernel ridge regression on the MAGPIE features, predicting the
# arcsinh-transformed target (tc_log) instead of the raw target.
# Hold out 33% of the rows for testing; the fixed seed keeps this split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df_MAGPIE, tc_log, test_size=0.33, random_state=21)

# Fit an RBF-kernel ridge regressor on the training split.
KRR = KernelRidge(kernel="rbf", gamma=2.0, alpha=.1)
KRR.fit(X_train, y_train)

# Predict the held-out rows (still in arcsinh space).
KRR_pred = KRR.predict(X_test)

# Clamp negative predictions to 0. arcsinh is odd and monotonic, so a negative
# value here corresponds to a negative raw prediction, and sinh(0) == 0.
# np.maximum returns a new array, so KRR_pred itself is left untouched.
KRR_pred_Pos = np.maximum(KRR_pred, 0)

# Undo the transform with sinh so the mean squared log error (MSLE) is measured
# in the original units and is comparable with the raw-target runs.
mean_squared_log_error(np.sinh(y_test), np.sinh(KRR_pred_Pos))

0.9078747322984414

In [20]:
#Function that allows us to run the ML several times. Average the output. 
def runKRR_NTimes(N,features,targets,gam,alp,random_state=None):
    """Fit and score an RBF kernel ridge model on N random train/test splits.

    Parameters
    ----------
    N : int
        Number of independent train/test splits to evaluate.
    features : DataFrame or array-like
        Feature matrix, one row per sample.
    targets : Series or array-like
        Target values aligned with ``features``. They must be non-negative,
        because the score is a mean squared log error.
    gam : float
        ``gamma`` of the RBF kernel.
    alp : float
        Ridge regularisation strength ``alpha``.
    random_state : None, int or numpy.random.RandomState, optional
        Seed for the sequence of splits. ``None`` (the default) keeps the
        previous behaviour of drawing from NumPy's global random state; an int
        makes the whole sequence of N splits reproducible.

    Returns
    -------
    list of float
        One mean squared log error (MSLE) per split, in run order. Average the
        list to get a summary score.
    """
    # Imported locally so the function does not depend on an extra top-level import.
    from sklearn.utils import check_random_state

    # A single generator shared by all iterations: every split is different,
    # yet the sequence as a whole is fixed by `random_state`.
    rng = check_random_state(random_state)

    MSLE = []
    for _ in range(N):
        # Hold out 33% of the rows for testing.
        X_train, X_test, y_train, y_test = train_test_split(
            features, targets, test_size=0.33, random_state=rng)

        # Fit the kernel ridge model on the training split.
        KRR = KernelRidge(kernel="rbf", gamma=gam, alpha=alp)
        KRR.fit(X_train, y_train)

        # Predict the held-out rows.
        KRR_pred = KRR.predict(X_test)

        # Clamp negative predictions to 0: mean_squared_log_error rejects
        # negative values. np.maximum returns a new array.
        KRR_pred_Pos = np.maximum(KRR_pred, 0)

        # Score this split.
        MSLE.append(mean_squared_log_error(y_test, KRR_pred_Pos))
    return MSLE


In [21]:
# Put the SOAP and MAGPIE feature columns side by side in one table
# (DataFrame.join aligns rows on the index).
df_M_S = df_SOAP.join(df_MAGPIE)

In [24]:
#df_M_S.head()


In [41]:
# Average MSLE over 10 random splits on the combined SOAP+MAGPIE features,
# raw target, with gam=.1 and alp=.12. No seed is passed here, so the printed
# value can change from run to run.
res = runKRR_NTimes(10,df_M_S,target,.1,.12)
sum(res)/len(res)

0.7698617724193115

In [42]:
#Function that allows us to run the ML several times. This function scales the Tc
#Average the output of this function after using. 
def runKRR_NTimes_log(N,features,targets,gam,alp,random_state=None):
    """Like a plain repeated KRR run, but trained on arcsinh-scaled targets.

    The targets are transformed with ``np.arcsinh`` before splitting, the model
    is fitted and evaluated in that space, and both the held-out targets and
    the predictions are mapped back with ``np.sinh`` before scoring, so the
    returned errors are in the original target units.

    Parameters
    ----------
    N : int
        Number of independent train/test splits to evaluate.
    features : DataFrame or array-like
        Feature matrix, one row per sample.
    targets : Series or array-like
        Untransformed target values aligned with ``features``. They must be
        non-negative, because the score is a mean squared log error.
    gam : float
        ``gamma`` of the RBF kernel.
    alp : float
        Ridge regularisation strength ``alpha``.
    random_state : None, int or numpy.random.RandomState, optional
        Seed for the sequence of splits. ``None`` (the default) keeps the
        previous behaviour of drawing from NumPy's global random state; an int
        makes the whole sequence of N splits reproducible.

    Returns
    -------
    list of float
        One mean squared log error (MSLE) per split, in run order. Average the
        list to get a summary score.
    """
    # Imported locally so the function does not depend on an extra top-level import.
    from sklearn.utils import check_random_state

    # A single generator shared by all iterations: every split is different,
    # yet the sequence as a whole is fixed by `random_state`.
    rng = check_random_state(random_state)

    MSLE = []
    # arcsinh behaves like a log for large values but is defined at 0.
    targets_L = np.arcsinh(targets)
    for _ in range(N):
        # Hold out 33% of the rows for testing.
        X_train, X_test, y_train, y_test = train_test_split(
            features, targets_L, test_size=0.33, random_state=rng)

        # Fit the kernel ridge model on the training split.
        KRR = KernelRidge(kernel="rbf", gamma=gam, alpha=alp)
        KRR.fit(X_train, y_train)

        # Predict the held-out rows (still in arcsinh space).
        KRR_pred = KRR.predict(X_test)

        # Clamp negative predictions to 0. arcsinh is odd and monotonic, so a
        # negative value here corresponds to a negative raw prediction, and
        # sinh(0) == 0. np.maximum returns a new array.
        KRR_pred_Pos = np.maximum(KRR_pred, 0)

        # Undo the transform before scoring so the MSLE is in original units.
        MSLE.append(mean_squared_log_error(np.sinh(y_test), np.sinh(KRR_pred_Pos)))
    return MSLE


In [45]:
# Average MSLE over 50 random splits on the combined SOAP+MAGPIE features,
# arcsinh-scaled target, with gam=.1 and alp=.12. No seed is passed here, so
# the printed value can change from run to run.
res = runKRR_NTimes_log(50,df_M_S,target,.1,.12)
sum(res)/len(res)

0.7319391151900823

In [46]:
# Same experiment on the MAGPIE features alone: average MSLE over 50 random
# splits, arcsinh-scaled target, gam=.1 and alp=.12. No seed is passed here, so
# the printed value can change from run to run.
res = runKRR_NTimes_log(50,df_MAGPIE,target,.1,.12)
sum(res)/len(res)

0.8461244811123637