In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_log_error



In [2]:
# Read the 3DSC_MP table from the working directory.
# low_memory=False makes pandas parse the file in one pass rather than in chunks.
dataframe = pd.read_csv(
    "3DSC_MP.csv",
    low_memory=False,
)


In [4]:
# Regression target: the Tc values stored in the 'tc' column.
target = dataframe['tc']

# Features: every column whose name begins with 'SOAP' (copied so later
# edits to the feature frame cannot touch the loaded table).
SOAP_cols = [name for name in dataframe.columns if name.startswith('SOAP')]
SOAP_Feat = dataframe.loc[:, SOAP_cols].copy()
SOAP_Feat.head()

Unnamed: 0,SOAP_0,SOAP_1,SOAP_2,SOAP_3,SOAP_4,SOAP_5,SOAP_6,SOAP_7,SOAP_8,SOAP_9,...,SOAP_8705,SOAP_8706,SOAP_8707,SOAP_8708,SOAP_8709,SOAP_8710,SOAP_8711,SOAP_8712,SOAP_8713,SOAP_8714
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Standardise each SOAP column, then rebuild a DataFrame that carries the
# same row index and column labels as the unscaled features.
scaled_features = StandardScaler().fit_transform(SOAP_Feat.to_numpy())
Scaled_SOAP_F = pd.DataFrame(
    scaled_features,
    index=SOAP_Feat.index,
    columns=SOAP_Feat.columns,
)
Scaled_SOAP_F.head()

Unnamed: 0,SOAP_0,SOAP_1,SOAP_2,SOAP_3,SOAP_4,SOAP_5,SOAP_6,SOAP_7,SOAP_8,SOAP_9,...,SOAP_8705,SOAP_8706,SOAP_8707,SOAP_8708,SOAP_8709,SOAP_8710,SOAP_8711,SOAP_8712,SOAP_8713,SOAP_8714
0,-0.063033,-0.062695,-0.062707,-0.062257,0.061828,-0.059263,-0.062348,-0.061045,-0.062041,0.061441,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.063033,-0.062695,-0.062707,-0.062257,0.061828,-0.059263,-0.062348,-0.061045,-0.062041,0.061441,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.063033,-0.062695,-0.062707,-0.062257,0.061828,-0.059263,-0.062348,-0.061045,-0.062041,0.061441,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.063033,-0.062695,-0.062707,-0.062257,0.061828,-0.059263,-0.062348,-0.061045,-0.062041,0.061441,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.063033,-0.062695,-0.062707,-0.062257,0.061828,-0.059263,-0.062348,-0.061045,-0.062041,0.061441,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# A float n_components in (0, 1) asks scikit-learn to keep as many principal
# components as are needed to explain that fraction of the variance (here 0.5).
pca = PCA(n_components=.5)
# Fit on the scaled features and project them into the reduced space.
principalComponents = pca.fit_transform(Scaled_SOAP_F)
# Report how many components were retained.
print(pca.n_components_)
# Wrap the projected array in a DataFrame (default integer row/column labels).
Reduced_df = pd.DataFrame(principalComponents)

63


In [29]:
# Train a regressor on the PCA-reduced features.
# NOTE(review): the StandardScaler and PCA were fitted on every row before
# this split, so test-set statistics leak into the training features.
# Consider fitting them on X_train only (e.g. inside a Pipeline).
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    Reduced_df,
    target,
    test_size=0.33,
    random_state=21,
)
# Kernel ridge regression with an RBF kernel.
KRR = KernelRidge(alpha=0.1, kernel="rbf", gamma=2.0)
KRR.fit(X_train, y_train)

In [30]:
# Run the fitted model on the held-out features.
KRR_pred = KRR.predict(X_test)

In [31]:
# A negative Tc prediction is treated as 0, so clamp the predictions from
# below at 0 (no upper bound).
# np.clip returns a new array, so KRR_pred itself is left unmodified.
KRR_pred_Pos = np.clip(KRR_pred, 0, None)

In [32]:
# Score the clamped predictions against the held-out targets using the
# mean squared log error (MSLE).
mean_squared_log_error(y_true=y_test, y_pred=KRR_pred_Pos)

2.160254453641939