In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score


In [2]:
# Read the four input tables from CSV files (paths are relative to the
# notebook's working directory), in the same order as before.
df_SOAP, df_MAGPIE, df_Basic, df_Class = (
    pd.read_csv(csv_name)
    for csv_name in ('SOAP.csv', 'MAGPIE.csv', 'Basic_Features.csv', 'Class.csv')
)
# The regression target is the `tc` column of the basic-features table.
target = df_Basic['tc']

In [3]:
# Preview the first five rows of the basic-features table.
df_Basic.head(n=5)

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula_2,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,monoclinic,orthorhombic,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight
0,Ag0.02Ge2Pd1.98Sr1,2,0.008,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.64,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
1,Ag0.15Sn0.85Te1,3,0.15,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.15,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.04,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.62,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
3,Ag0.1In0.9Te1,3,0.1,1.0,True,Ag0.1In0.9Te1,In1Te1,1.2,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.2,Other,True,...,0,0,0,0,0,1,0,0,0,1.0


In [4]:
# Select the `efermi_2` and `sc_class` columns (all rows) from the basic table.
featuresC = df_Basic.loc[:, ['efermi_2', 'sc_class']]


In [5]:
# Preview the first five rows of the selected basic features.
featuresC.head(n=5)

Unnamed: 0,efermi_2,sc_class
0,4.015543,Other
1,6.066451,Other
2,4.015543,Other
3,6.31406,Other
4,5.202543,Other


In [6]:
def runKRR_NTimes(N,features,targets,gam,alp,test_size=0.33,random_state=None):
    """Fit and score an RBF kernel ridge regression on N random train/test splits.

    Each repetition draws a fresh train/test split, fits a ``KernelRidge``
    model on the training part, clips negative predictions to 0 and records
    the mean squared log error (MSLE) on the held-out part. Average the
    returned list to get a single score.

    Parameters
    ----------
    N : int
        Number of split/fit/score repetitions.
    features : array-like
        Feature matrix handed to ``train_test_split``.
    targets : array-like
        Target values (Tc). They are scored with MSLE, which works on
        ``log(1 + y)``, so they are expected to be non-negative.
    gam : float
        ``gamma`` of the RBF kernel.
    alp : float
        ``alpha`` (ridge regularisation strength) of ``KernelRidge``.
    test_size : float, optional
        Fraction of the data held out for testing. Defaults to 0.33, the
        value this function always used before the parameter existed.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for reproducible splits. ``None`` (default)
        keeps the previous unseeded behaviour.

    Returns
    -------
    list of float
        One MSLE value per repetition (empty when ``N`` is 0).
    """
    # One generator is shared by all repetitions: every split is different, yet
    # the whole sequence is reproducible. Passing the raw integer seed to each
    # train_test_split call would instead repeat the very same split N times.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        split_rng = random_state
    else:
        split_rng = np.random.RandomState(random_state)

    msle_scores = []
    for _ in range(N):
        # Split the data and targets into train and test parts.
        X_train, X_test, y_train, y_test = train_test_split(
            features, targets, test_size=test_size, random_state=split_rng)

        # Fit the kernel ridge regression on the training part.
        model = KernelRidge(kernel="rbf", gamma=gam, alpha=alp)
        model.fit(X_train, y_train)

        # A negative Tc prediction is treated as 0. np.maximum returns a new
        # array, so the raw prediction array is not modified.
        predictions = np.maximum(model.predict(X_test), 0)

        # Mean squared log error of this repetition.
        msle_scores.append(mean_squared_log_error(y_test, predictions))
    return msle_scores


In [7]:
# Put the SOAP and MAGPIE columns side by side (index-aligned join).
df_M_S = df_SOAP.join(df_MAGPIE)
# Prepend the selected basic features, then discard the `sc_class` column.
df_FMS = featuresC.join(df_M_S).drop(columns='sc_class')

In [8]:
# Preview the first five rows of the combined feature table.
df_FMS.head(n=5)

Unnamed: 0,efermi_2,0_SOAP,1_SOAP,2_SOAP,3_SOAP,0_MAGPIE,1_MAGPIE,2_MAGPIE,3_MAGPIE,4_MAGPIE,5_MAGPIE,6_MAGPIE
0,4.015543,0.087405,2.350832,0.716843,2.457795,0.284246,1.745726,2.041545,-3.365978,-0.839857,-3.981347,0.122126
1,6.066451,-6.277936,-0.735617,0.489058,2.256497,4.095951,4.684014,8.332122,-0.700109,1.91835,-1.209732,2.599394
2,4.015543,0.113213,2.377825,0.791783,2.481894,0.249922,1.754526,2.019528,-3.379637,-0.822045,-4.051984,0.115422
3,6.31406,-6.406671,-0.469702,0.563018,2.287582,3.802709,4.463046,8.011247,-0.672809,2.28318,-1.274138,2.947561
4,5.202543,7.747548,1.573962,-8.671539,0.448773,-2.389001,-1.470349,0.211697,-0.269278,-0.293928,0.586055,-0.910411


In [9]:
# Disabled baseline: KRR on the SOAP+MAGPIE features only, with the untransformed target.
# res = runKRR_NTimes(10,df_M_S,target,.1,.12)
# sum(res)/len(res)

In [10]:
def runKRR_NTimes_log(N,features,targets,gam,alp,test_size=0.2,random_state=None):
    """MSLE of an RBF kernel ridge regression trained on arcsinh-scaled targets.

    The targets are transformed with ``np.arcsinh`` before splitting. Each of
    the N repetitions draws a fresh train/test split, fits ``KernelRidge`` on
    the transformed targets, clips negative predictions to 0, maps both the
    held-out targets and the predictions back with ``np.sinh`` and records the
    mean squared log error (MSLE) in the original units. Average the returned
    list to get a single score.

    Parameters
    ----------
    N : int
        Number of split/fit/score repetitions.
    features : array-like
        Feature matrix handed to ``train_test_split``.
    targets : array-like
        Untransformed target values (Tc). They are scored with MSLE, which
        works on ``log(1 + y)``, so they are expected to be non-negative.
    gam : float
        ``gamma`` of the RBF kernel.
    alp : float
        ``alpha`` (ridge regularisation strength) of ``KernelRidge``.
    test_size : float, optional
        Fraction of the data held out for testing. Defaults to 0.2, the
        value this function always used before the parameter existed.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for reproducible splits. ``None`` (default)
        keeps the previous unseeded behaviour.

    Returns
    -------
    list of float
        One MSLE value per repetition (empty when ``N`` is 0).
    """
    # One generator is shared by all repetitions: every split is different, yet
    # the whole sequence is reproducible. Passing the raw integer seed to each
    # train_test_split call would instead repeat the very same split N times.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        split_rng = random_state
    else:
        split_rng = np.random.RandomState(random_state)

    # Train on arcsinh(Tc); scores below are computed after undoing this.
    targets_L = np.arcsinh(targets)

    msle_scores = []
    for _ in range(N):
        # Split the data and transformed targets into train and test parts.
        X_train, X_test, y_train, y_test = train_test_split(
            features, targets_L, test_size=test_size, random_state=split_rng)

        # Fit the kernel ridge regression on the training part.
        model = KernelRidge(kernel="rbf", gamma=gam, alpha=alp)
        model.fit(X_train, y_train)

        # A negative Tc prediction is treated as 0. sinh is increasing and
        # sinh(0) == 0, so clipping at 0 in arcsinh space is the same as
        # clipping the back-transformed Tc at 0. np.maximum returns a new
        # array, so the raw prediction array is not modified.
        predictions = np.maximum(model.predict(X_test), 0)

        # Mean squared log error, evaluated on the original Tc scale.
        msle_scores.append(
            mean_squared_log_error(np.sinh(y_test), np.sinh(predictions)))
    return msle_scores


In [17]:
# Average the per-split MSLE over 10 random splits of the combined features.
res = runKRR_NTimes_log(N=10, features=df_FMS, targets=target, gam=0.1, alp=0.12)
sum(res) / len(res)

0.6927468087738539

In [12]:
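# With 7 MAGPIE PCA features and 4 SOAP PCA features, and hyperparameters gamma=0.1, alpha=0.12: average MSLE ~ 0.67 (the exact value changes between runs because the train/test splits are random).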

In [13]:
def runKRR_log_R2(N,features,targets,gam,alp,test_size=0.2,random_state=None):
    """R^2 of an RBF kernel ridge regression trained on arcsinh-scaled targets.

    The targets are transformed with ``np.arcsinh`` before splitting. Each of
    the N repetitions draws a fresh train/test split, fits ``KernelRidge`` on
    the transformed targets, clips negative predictions to 0, maps both the
    held-out targets and the predictions back with ``np.sinh`` and records the
    coefficient of determination (R^2) in the original units. Average the
    returned list to get a single score.

    Parameters
    ----------
    N : int
        Number of split/fit/score repetitions.
    features : array-like
        Feature matrix handed to ``train_test_split``.
    targets : array-like
        Untransformed target values (Tc).
    gam : float
        ``gamma`` of the RBF kernel.
    alp : float
        ``alpha`` (ridge regularisation strength) of ``KernelRidge``.
    test_size : float, optional
        Fraction of the data held out for testing. Defaults to 0.2, the
        value this function always used before the parameter existed.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for reproducible splits. ``None`` (default)
        keeps the previous unseeded behaviour.

    Returns
    -------
    list of float
        One R^2 value per repetition (empty when ``N`` is 0).
    """
    # One generator is shared by all repetitions: every split is different, yet
    # the whole sequence is reproducible. Passing the raw integer seed to each
    # train_test_split call would instead repeat the very same split N times.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        split_rng = random_state
    else:
        split_rng = np.random.RandomState(random_state)

    # Train on arcsinh(Tc); scores below are computed after undoing this.
    targets_L = np.arcsinh(targets)

    r2_scores = []
    for _ in range(N):
        # Split the data and transformed targets into train and test parts.
        X_train, X_test, y_train, y_test = train_test_split(
            features, targets_L, test_size=test_size, random_state=split_rng)

        # Fit the kernel ridge regression on the training part.
        model = KernelRidge(kernel="rbf", gamma=gam, alpha=alp)
        model.fit(X_train, y_train)

        # A negative Tc prediction is treated as 0. sinh is increasing and
        # sinh(0) == 0, so clipping at 0 in arcsinh space is the same as
        # clipping the back-transformed Tc at 0. np.maximum returns a new
        # array, so the raw prediction array is not modified.
        predictions = np.maximum(model.predict(X_test), 0)

        # Coefficient of determination (R^2), evaluated on the original Tc scale.
        r2_scores.append(r2_score(np.sinh(y_test), np.sinh(predictions)))
    return r2_scores


In [18]:
# Average the per-split R^2 over 10 random splits of the combined features.
res = runKRR_log_R2(N=10, features=df_FMS, targets=target, gam=0.1, alp=0.12)
sum(res) / len(res)

0.6438364622353557