In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import datasets
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_log_error



In [2]:
# Read the full 3DSC_MP table (path is relative to the working directory).
# low_memory=False parses the file in one pass instead of in chunks, which
# avoids mixed-dtype inference within a column.
dataframe = pd.read_csv(
    '3DSC_MP.csv',
    low_memory=False,
)


In [3]:
# Feature matrix: every column whose name begins with 'MAGPIE', copied so
# later edits cannot touch the source frame.
MAG_cols = list(filter(lambda col: col.startswith('MAGPIE'), dataframe.columns))
MAG_Feat = dataframe[MAG_cols].copy()

# Regression target: the Tc values stored in the 'tc' column.
target = dataframe['tc']

In [4]:
MAG_Feat.head()

Unnamed: 0,MAGPIE_frac_sValence,MAGPIE_frac_pValence,MAGPIE_frac_dValence,MAGPIE_frac_fValence,MAGPIE_mean_Number,MAGPIE_maxdiff_Number,MAGPIE_dev_Number,MAGPIE_max_Number,MAGPIE_min_Number,MAGPIE_most_Number,...,MAGPIE_most_SpaceGroupNumber,MAGPIE_NComp,MAGPIE_Comp_L2Norm,MAGPIE_Comp_L3Norm,MAGPIE_Comp_L5Norm,MAGPIE_Comp_L7Norm,MAGPIE_Comp_L10Norm,MAGPIE_CanFormIonic,MAGPIE_MaxIonicChar,MAGPIE_MeanIonicChar
0,0.120352,0.079968,0.79968,0.0,38.804,15,5.7648,47,32,32.0,...,225.0,4,0.597354,0.511849,0.45866,0.439714,0.426636,0,0.323366,0.093655
1,0.130288,0.192893,0.676819,0.0,50.775,5,1.225,52,47,52.0,...,152.0,3,0.660492,0.586928,0.538109,0.520262,0.509067,0,0.007199,0.002632
2,0.121756,0.07984,0.798403,0.0,38.82,15,5.784,47,32,32.0,...,225.0,4,0.587197,0.502494,0.45015,0.431723,0.419241,0,0.323366,0.093074
3,0.135417,0.170139,0.694444,0.0,50.4,5,1.6,52,47,52.0,...,152.0,3,0.674537,0.600231,0.548626,0.528715,0.515182,0,0.025275,0.011986
4,0.508772,0.315789,0.175439,0.0,30.2,42,19.44,56,14,14.0,...,227.0,3,0.689605,0.632748,0.606222,0.60139,0.600168,0,0.236926,0.100589


In [5]:
# Standardise each MAGPIE column, then wrap the scaled array back into a
# DataFrame that keeps the original row index and column names.
# NOTE(review): the scaler is fitted on every row, including the rows that
# the later train_test_split calls hold out for testing.
scaled_features = StandardScaler().fit_transform(MAG_Feat.values)
Scaled_MAG_F = pd.DataFrame(
    data=scaled_features,
    columns=MAG_Feat.columns,
    index=MAG_Feat.index,
)
Scaled_MAG_F.head()

Unnamed: 0,MAGPIE_frac_sValence,MAGPIE_frac_pValence,MAGPIE_frac_dValence,MAGPIE_frac_fValence,MAGPIE_mean_Number,MAGPIE_maxdiff_Number,MAGPIE_dev_Number,MAGPIE_max_Number,MAGPIE_min_Number,MAGPIE_most_Number,...,MAGPIE_most_SpaceGroupNumber,MAGPIE_NComp,MAGPIE_Comp_L2Norm,MAGPIE_Comp_L3Norm,MAGPIE_Comp_L5Norm,MAGPIE_Comp_L7Norm,MAGPIE_Comp_L10Norm,MAGPIE_CanFormIonic,MAGPIE_MaxIonicChar,MAGPIE_MeanIonicChar
0,-0.915114,-0.600693,1.577351,-0.709081,0.249505,-1.084175,-0.907095,-0.621032,0.735295,0.083946,...,0.906642,0.738453,-0.827383,-0.924438,-1.006522,-1.041712,-1.068493,-0.355583,0.028104,-0.16329
1,-0.848642,0.253493,1.027706,-0.709081,1.009473,-1.533211,-1.455575,-0.359974,1.63566,1.018616,...,0.026825,-0.157154,-0.190726,-0.29615,-0.415819,-0.468763,-0.499715,-0.355583,-1.064692,-0.950926
2,-0.905717,-0.601659,1.571639,-0.709081,0.250521,-1.084175,-0.904776,-0.621032,0.735295,0.083946,...,0.906642,0.738453,-0.929806,-1.002725,-1.0698,-1.098555,-1.119517,-0.355583,0.028104,-0.168315
3,-0.814328,0.081374,1.106557,-0.709081,0.985667,-1.533211,-1.410269,-0.359974,1.63566,1.018616,...,0.026825,-0.157154,-0.049106,-0.184821,-0.337622,-0.408634,-0.457519,-0.355583,-1.002214,-0.869981
4,1.683475,1.183098,-1.215326,-0.709081,-0.296712,0.128224,0.745086,-0.151127,-0.345142,-0.757256,...,0.930747,-0.157154,0.102837,0.087291,0.090604,0.108318,0.128887,-0.355583,-0.270667,-0.103285


In [6]:
Scaled_MAG_F.shape

(5773, 145)

In [7]:
# A fractional n_components asks PCA to keep just enough components for the
# explained variance to exceed that fraction (60 % here).
# NOTE(review): the PCA is fitted on all rows, before any train/test split.
pca = PCA(n_components=0.6)
principalComponents = pca.fit_transform(Scaled_MAG_F)

# Store the projected data as a DataFrame (default integer column labels).
df_MAGPIE_PCA = pd.DataFrame(principalComponents)

# Report how many components were retained.
print(pca.n_components_)

6


In [8]:
# Label the components '0_MAGPIE', '1_MAGPIE', ... from their position.
# Building the names from the column count (rather than appending to the
# current labels) keeps the cell safe to re-run: the suffix is never stacked.
df_MAGPIE_PCA.columns = [str(k) + '_MAGPIE' for k in range(df_MAGPIE_PCA.shape[1])]

In [9]:
df_MAGPIE_PCA.head()

Unnamed: 0,0_MAGPIE,1_MAGPIE,2_MAGPIE,3_MAGPIE,4_MAGPIE,5_MAGPIE
0,0.284246,1.745726,2.041545,-3.365977,-0.839862,-3.981353
1,4.095951,4.684014,8.332123,-0.700112,1.918346,-1.20971
2,0.249922,1.754526,2.019528,-3.379636,-0.822051,-4.051988
3,3.802709,4.463046,8.011247,-0.672811,2.283179,-1.27412
4,-2.389001,-1.470349,0.211697,-0.26928,-0.29392,0.586061


In [10]:
## Now get the dataframe with the class of each material.

In [11]:
# Distinct values of 'sc_class', in order of first appearance in the data.
class_names = dataframe['sc_class'].unique().tolist()

In [12]:
class_names

['Other',
 'Heavy_fermion',
 'Chevrel',
 'Oxide',
 'Cuprate',
 'Ferrite',
 'Carbon',
 'OxideHeavy_fermion',
 'Heavy_fermionChevrel']

In [13]:
# Start from the Fermi-energy and class-label columns (same row index as
# `dataframe`) and append one zero-filled indicator column per class name.
# The indicators are switched to 1 for the matching rows in the next cell.
featuresC = dataframe[['efermi_2', 'sc_class']]
featuresCC = featuresC.assign(**dict.fromkeys(class_names, 0))

In [14]:
# One-hot encode the class label: each indicator column becomes 1 on the
# rows whose 'sc_class' equals that column's name and 0 everywhere else.
# A whole-column comparison replaces the former per-row .loc loop, so the
# result no longer depends on the index labels being exactly 0..n-1 and no
# Python-level iteration over rows is needed.
for name in class_names:
    featuresCC[name] = (featuresCC['sc_class'] == name).astype(int)

# The text label is now redundant, so drop it.
featuresCC = featuresCC.drop('sc_class', axis=1)

In [15]:
featuresCC.head()

Unnamed: 0,efermi_2,Other,Heavy_fermion,Chevrel,Oxide,Cuprate,Ferrite,Carbon,OxideHeavy_fermion,Heavy_fermionChevrel
0,4.015543,1,0,0,0,0,0,0,0,0
1,6.066451,1,0,0,0,0,0,0,0,0
2,4.015543,1,0,0,0,0,0,0,0,0
3,6.31406,1,0,0,0,0,0,0,0,0
4,5.202543,1,0,0,0,0,0,0,0,0


In [16]:
# Class indicators alone: the same frame minus the Fermi-energy column.
features_Class_Only = featuresCC.drop(columns='efermi_2')

In [17]:
features_Class_Only.head()

Unnamed: 0,Other,Heavy_fermion,Chevrel,Oxide,Cuprate,Ferrite,Carbon,OxideHeavy_fermion,Heavy_fermionChevrel
0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0


In [18]:
# Class indicators + PCA-reduced MAGPIE components, aligned on the row index
# (left join, i.e. every row of the class frame is kept).
df_M_C = features_Class_Only.join(df_MAGPIE_PCA, how='left')

In [19]:
# Fermi energy + class indicators + PCA-reduced MAGPIE components, aligned
# on the row index (left join, i.e. every row of featuresCC is kept).
df_M_C_F = featuresCC.join(df_MAGPIE_PCA, how='left')

In [20]:
##### ML on MAGPIE ####

In [21]:
# Kernel ridge regression on the PCA-reduced MAGPIE features only.
# Hold out 33 % of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_MAGPIE_PCA, target, test_size=0.33, random_state=21
)

# Fit an RBF-kernel ridge model on the training rows (fit returns the model).
KRR = KernelRidge(kernel="rbf", gamma=2.0, alpha=.1).fit(X_train, y_train)

# Predict Tc for the held-out rows.
KRR_pred = KRR.predict(X_test)

# Negative Tc predictions are replaced by 0. np.where builds a new array,
# so KRR_pred itself keeps the raw model output.
KRR_pred_Pos = np.where(KRR_pred < 0, 0, KRR_pred)

# Mean squared log error (MSLE) on the test rows; shown as the cell output.
mean_squared_log_error(y_test, KRR_pred_Pos)

0.9251336015692898

In [22]:
##### ML on MAGPIE and Classes ####

In [23]:
# Kernel ridge regression on PCA-reduced MAGPIE features plus class indicators.
# Hold out 33 % of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_M_C, target, test_size=0.33, random_state=21
)

# Fit an RBF-kernel ridge model on the training rows (fit returns the model).
KRR = KernelRidge(kernel="rbf", gamma=2.0, alpha=.1).fit(X_train, y_train)

# Predict Tc for the held-out rows.
KRR_pred = KRR.predict(X_test)

# Negative Tc predictions are replaced by 0. np.where builds a new array,
# so KRR_pred itself keeps the raw model output.
KRR_pred_Pos = np.where(KRR_pred < 0, 0, KRR_pred)

# Mean squared log error (MSLE) on the test rows; shown as the cell output.
mean_squared_log_error(y_test, KRR_pred_Pos)

0.9247060482428417

In [24]:
##### ML on MAGPIE , Classes , Fermi Energy ####

In [25]:
# Kernel ridge regression on PCA-reduced MAGPIE features, class indicators
# and the Fermi energy.
# Hold out 33 % of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_M_C_F, target, test_size=0.33, random_state=21
)

# Fit an RBF-kernel ridge model on the training rows (fit returns the model).
KRR = KernelRidge(kernel="rbf", gamma=2.0, alpha=.1).fit(X_train, y_train)

# Predict Tc for the held-out rows.
KRR_pred = KRR.predict(X_test)

# Negative Tc predictions are replaced by 0. np.where builds a new array,
# so KRR_pred itself keeps the raw model output.
KRR_pred_Pos = np.where(KRR_pred < 0, 0, KRR_pred)

# Mean squared log error (MSLE) on the test rows; shown as the cell output.
mean_squared_log_error(y_test, KRR_pred_Pos)

0.9200622900860884

In [26]:
### Also, bring in SOAP features  ####

In [27]:
# Features here are the SOAP columns: every column whose name begins with
# 'SOAP', copied so later edits cannot touch the source frame.
SOAP_cols = list(filter(lambda col: col.startswith('SOAP'), dataframe.columns))
SOAP_Feat = dataframe[SOAP_cols].copy()

In [28]:
# Standardise each SOAP column, then wrap the scaled array back into a
# DataFrame that keeps the original row index and column names.
# NOTE(review): the scaler is fitted on every row, including the rows that
# the later train_test_split call holds out for testing.
scaled_features = StandardScaler().fit_transform(SOAP_Feat.values)
Scaled_SOAP_F = pd.DataFrame(
    data=scaled_features,
    columns=SOAP_Feat.columns,
    index=SOAP_Feat.index,
)

In [None]:
# A fractional n_components asks PCA to keep just enough components for the
# explained variance to exceed that fraction (60 % here).
# NOTE(review): the PCA is fitted on all rows, before any train/test split.
pca = PCA(n_components=0.6)

# Fit on the scaled SOAP features and project them into the reduced space.
principalComponents = pca.fit_transform(Scaled_SOAP_F)

# Store the projected data as a DataFrame (default integer column labels).
df_SOAP_PCA = pd.DataFrame(principalComponents)

# Report how many components were retained.
print(pca.n_components_)

In [None]:
# Label the components '0_SOAP', '1_SOAP', ... from their position.
# Building the names from the column count (rather than appending to the
# current labels) keeps the cell safe to re-run: the suffix is never stacked.
df_SOAP_PCA.columns = [str(k) + '_SOAP' for k in range(df_SOAP_PCA.shape[1])]

In [None]:
# PCA-reduced MAGPIE components + PCA-reduced SOAP components, aligned on
# the row index (left join, i.e. every MAGPIE row is kept).
df_M_S = df_MAGPIE_PCA.join(df_SOAP_PCA, how='left')

In [None]:
##### ML on MAGPIE and SOAP ####

In [None]:
# Kernel ridge regression on the PCA-reduced MAGPIE and SOAP features.
# Hold out 33 % of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_M_S, target, test_size=0.33, random_state=21
)

# Fit an RBF-kernel ridge model on the training rows (fit returns the model).
KRR = KernelRidge(kernel="rbf", gamma=2.0, alpha=.1).fit(X_train, y_train)

# Predict Tc for the held-out rows.
KRR_pred = KRR.predict(X_test)

# Negative Tc predictions are replaced by 0. np.where builds a new array,
# so KRR_pred itself keeps the raw model output.
KRR_pred_Pos = np.where(KRR_pred < 0, 0, KRR_pred)

# Mean squared log error (MSLE) on the test rows; shown as the cell output.
mean_squared_log_error(y_test, KRR_pred_Pos)

In [None]:
def runKRR_NTimes(N,features,targets,gam,alp,test_size=0.33,random_state=None):
    """Fit and score an RBF kernel ridge model on N random train/test splits.

    Parameters
    ----------
    N : int
        Number of independent train/test splits to evaluate.
    features : array-like
        Feature rows; passed unchanged to train_test_split.
    targets : array-like
        Target values, one per feature row.
    gam : float
        ``gamma`` of the KernelRidge RBF kernel.
    alp : float
        ``alpha`` (regularisation strength) of KernelRidge.
    test_size : float, default 0.33
        Fraction of rows held out for testing in every split.
    random_state : int or None, default None
        Seed for the sequence of splits. None keeps the original unseeded
        behaviour, so successive calls give different results.

    Returns
    -------
    list of float
        Mean squared log error of each split, in run order.
    """
    # A single generator is shared by all splits: a seeded call is repeatable,
    # yet each iteration still draws a different split from it.
    rng = None if random_state is None else np.random.RandomState(random_state)

    MSLE = []
    for _ in range(N):
        # Split the data and targets into train and test parts.
        X_train, X_test, y_train, y_test = train_test_split(
            features, targets, test_size=test_size, random_state=rng
        )

        # Fit the kernel ridge model and predict the held-out rows.
        model = KernelRidge(kernel="rbf", gamma=gam, alpha=alp)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Negative Tc predictions are replaced by 0 (vectorised; the previous
        # element loop reused the outer loop's counter name).
        predictions = np.where(predictions < 0, 0, predictions)

        MSLE.append(mean_squared_log_error(y_test, predictions))
    return MSLE
