In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Read the complete 3DSC_MP table into memory.
# low_memory=False turns off pandas' chunked parsing of the file.
dataframe = pd.read_csv(
    '3DSC_MP.csv',
    low_memory=False,
)


In [6]:
# Display (n_rows, n_columns) of the loaded table as a quick sanity check.
dataframe.shape

(5773, 8952)

In [7]:
# Preview the first rows of the loaded table.
dataframe.head()

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula_2,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,monoclinic,orthorhombic,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight
0,Ag0.02Ge2Pd1.98Sr1,2,0.008,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.64,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
1,Ag0.15Sn0.85Te1,3,0.15,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.15,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.04,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.62,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
3,Ag0.1In0.9Te1,3,0.1,1.0,True,Ag0.1In0.9Te1,In1Te1,1.2,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.2,Other,True,...,0,0,0,0,0,1,0,0,0,1.0


In [3]:
####  Create the MAGPIE df    #######

# Number of principal components kept for the MAGPIE feature group.
N_MAGPIE_COMPONENTS = 7
# Fixed seed: PCA may pick a stochastic solver (randomized / arpack), so pin
# the random state to get the same components on every Restart & Run All.
PCA_RANDOM_STATE = 0

# Every column whose name starts with 'MAGPIE' belongs to this feature group.
MAG_cols = [col for col in dataframe if col.startswith('MAGPIE')]
print(len(MAG_cols))
MAG_Feat = dataframe[MAG_cols].copy()

#Scaling the MAGPIE features for PCA (StandardScaler: per-column zero mean, unit variance)
scaled_features = StandardScaler().fit_transform(MAG_Feat.values)
Scaled_MAG_F = pd.DataFrame(scaled_features, index=MAG_Feat.index, columns=MAG_Feat.columns)

#Perform PCA
pca = PCA(n_components=N_MAGPIE_COMPONENTS, random_state=PCA_RANDOM_STATE)
principalComponents = pca.fit_transform(Scaled_MAG_F)
print(pca.n_components_)
# Reuse the source row index so the components stay aligned with `dataframe`
# rows even if its index is not the default 0..n-1.
df_MAGPIE_PCA = pd.DataFrame(data=principalComponents, index=MAG_Feat.index)

#Rename the columns to indicate MAGPIE ('0_MAGPIE', '1_MAGPIE', ...)
df_MAGPIE_PCA.columns = df_MAGPIE_PCA.columns.astype(str)+'_MAGPIE'



145
7


In [64]:
# Write the MAGPIE principal components to disk, omitting the row index.
df_MAGPIE_PCA.to_csv(path_or_buf='MAGPIE.csv', index=False)


In [65]:
# Fraction of the total variance captured by each component of the PCA currently bound to `pca`.
# NOTE(review): the SOAP cell further down rebinds `pca`, so this reports the MAGPIE fit
# only when it is run directly after the MAGPIE cell.
pca.explained_variance_ratio_

array([0.25148027, 0.10584605, 0.08681497, 0.06712283, 0.05919039,
       0.05104559, 0.03863514])

In [4]:
####  Create the SOAP df    #######

# Number of principal components kept for the SOAP feature group.
N_SOAP_COMPONENTS = 4
# Fixed seed: PCA may pick a stochastic solver (randomized / arpack), so pin
# the random state to get the same components on every Restart & Run All.
PCA_RANDOM_STATE = 0

# Every column whose name starts with 'SOAP' belongs to this feature group.
SOAP_cols = [col for col in dataframe if col.startswith('SOAP')]
print(len(SOAP_cols))
SOAP_Feat = dataframe[SOAP_cols].copy()

#Scale the features (StandardScaler: per-column zero mean, unit variance)
scaled_features = StandardScaler().fit_transform(SOAP_Feat.values)
Scaled_SOAP_F = pd.DataFrame(scaled_features, index=SOAP_Feat.index, columns=SOAP_Feat.columns)

#Define the PCA
pca = PCA(n_components=N_SOAP_COMPONENTS, random_state=PCA_RANDOM_STATE)
#Perform the PCA fit and transform the features into that reduced dimensional space
principalComponents = pca.fit_transform(Scaled_SOAP_F)
print(pca.n_components_)
#Put it into a pandas df, reusing the source row index so the components stay
#aligned with `dataframe` rows even if its index is not the default 0..n-1.
df_SOAP_PCA = pd.DataFrame(data=principalComponents, index=SOAP_Feat.index)

#Rename the columns to indicate SOAP ('0_SOAP', '1_SOAP', ...)
df_SOAP_PCA.columns = df_SOAP_PCA.columns.astype(str)+'_SOAP'


8715
4


In [41]:
# Write the SOAP principal components to disk, omitting the row index.
df_SOAP_PCA.to_csv(path_or_buf='SOAP.csv', index=False)

In [42]:
# Fraction of the total variance captured by each component of the PCA currently bound to `pca`
# (the SOAP fit when the cells are run top to bottom).
# NOTE(review): the recorded ratios sum to roughly 0.06 — confirm that four components
# retain enough of the SOAP information for downstream use.
pca.explained_variance_ratio_

array([0.01886993, 0.01487697, 0.01354006, 0.01217929])

In [7]:
####  Build a frame holding every column that is neither SOAP nor MAGPIE    #######
Non_M_S_cols = [
    column_name
    for column_name in dataframe
    if not column_name.startswith(('SOAP', 'MAGPIE'))
]
# Copy so later edits to Basic_Feat never write back into `dataframe`.
Basic_Feat = dataframe[Non_M_S_cols].copy()


In [67]:
# Binary flag per row: 1 where tc > 0, otherwise 0; the Series is named 'IS_SC'.
IS_SC = (
    Basic_Feat['tc']
    .map(lambda tc_value: int(tc_value > 0))
    .rename('IS_SC')
)

In [71]:
# Attach the IS_SC flag as a column of Basic_Feat.
# `assign` replaces an existing 'IS_SC' column instead of appending another one,
# so re-running this cell no longer produces duplicate 'IS_SC' columns.
Basic_Feat = Basic_Feat.assign(IS_SC=IS_SC)

In [80]:
#Basic_Feat['IS_SC'].value_counts()

In [81]:
#IS_SC

In [82]:
# Write the non-SOAP / non-MAGPIE columns (plus any added flags) to disk, omitting the row index.
Basic_Feat.to_csv(path_or_buf='Basic_Features.csv', index=False)

In [5]:
## Now get the dataframe with the class of each material.
# Distinct class labels in order of first appearance; this fixes the column order below.
class_names = list(dataframe['sc_class'].unique())
print(len(class_names))

# Fermi energy and class label with the original row index.
# Only 'sc_class' feeds the indicator columns; the name is kept because it was
# defined here before and other cells may still reference it.
featuresC = dataframe[['efermi_2','sc_class']]

# One 0/1 indicator column per class, built with vectorised comparisons.
# This replaces the row-by-row `.loc[i]` loop, which was slow and only worked
# when the index was exactly 0..n-1.
# A missing label is matched with isna() because NaN never compares equal to itself.
class_labels = featuresC['sc_class']
featuresCC = pd.DataFrame(
    {
        name: (class_labels.isna() if pd.isna(name) else class_labels.eq(name)).astype('int64')
        for name in class_names
    },
    index=featuresC.index,
)


9


In [13]:
# Write the per-class indicator columns to disk, omitting the row index.
featuresCC.to_csv(path_or_buf='Class.csv', index=False)