In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
file_path = '../dataset/2021Yan-SP-HEA'
data_file = '2021Yan_feature_engineered.csv'
df = pd.read_csv(f'{file_path}/{data_file}')
df.sample(5, random_state=827)

Unnamed: 0,Alloys,Class,alloy_sep,normalizer,k,vm,tm,vac,delta_s_mix,delta_chi,delta,delta_h_mix
1244,Fe0.5Ru0.5,1,"[('Fe', 0.5), ('Ru', 0.5)]",1.0,195.0,7.7,2208.65,8.0,5.762824,0.185,3.703704,-7.2
703,Hf1V1,0,"[('Hf', 1.0), ('V', 1.0)]",2.0,133.0,10.975,2344.65,11.5,5.762824,0.165,6.896552,-3.5
1148,Co0.7Ir0.3,1,"[('Co', 0.7), ('Ir', 0.3)]",1.0,221.6,7.252,2053.45,13.2,5.078724,0.146642,0.0,-4.788
346,Al1.45Co1Cr1Fe1Ni1,0,"[('Al', 1.45), ('Co', 1.0), ('Cr', 1.0), ('Fe'...",5.45,146.275229,7.730275,1622.225963,6.853211,13.275451,0.124394,4.415421,-32.902954
696,Pd0.4Si0.6,0,"[('Pd', 0.4), ('Si', 0.6)]",1.0,131.4,10.82,1743.47,6.4,5.595417,0.146969,12.046671,-71.712


In [3]:
regex = fr'([A-Z][a-z]*)(\d*\.*\d*?(?=\D|$))'
df['alloy_sep'] = df['Alloys'].str.findall(regex)
df['alloy_sep'] = df['alloy_sep'].apply(lambda m: [(x, y) if y else (x, 1) for x, y in m])
df['no_elements'] = df['alloy_sep'].apply(lambda m: len(m))
df.sample(5)

Unnamed: 0,Alloys,Class,alloy_sep,normalizer,k,vm,tm,vac,delta_s_mix,delta_chi,delta,delta_h_mix,no_elements
946,Ca1Lu1,0,"[(Ca, 1), (Lu, 1)]",2.0,31.5,23.85,1525.65,9.5,5.762824,0.0,1.408451,19.9,2
580,Co1Fe1Mn1Ti1V0.7Zr1,0,"[(Co, 1), (Fe, 1), (Mn, 1), (Ti, 1), (V, 0.7),...",5.7,136.368421,9.076316,1876.658772,6.22807,14.834401,0.190535,4.785985,-29.317328,6
132,Al1Co2Cr1Cu0.5Fe1Ni1,0,"[(Al, 1), (Co, 2), (Cr, 1), (Cu, 0.5), (Fe, 1)...",6.5,155.692308,7.366154,1672.016154,7.769231,14.232275,0.115318,3.558403,-19.332544,6
163,Al1Co1Cr1Cu0.5Fe1Ni1.5,0,"[(Al, 1), (Co, 1), (Cr, 1), (Cu, 0.5), (Fe, 1)...",6.0,153.75,7.413333,1660.671667,7.75,14.534159,0.119823,3.703704,-20.655556,6
1077,Co1Cr1Fe1Mn1Ni1,1,"[(Co, 1), (Cr, 1), (Fe, 1), (Mn, 1), (Ni, 1)]",5.0,161.0,7.004,1801.35,8.0,13.380862,0.138362,1.774993,-6.544,5


In [4]:
num_ftrs = ['k','vm','tm','vac','delta','delta_chi', 'delta_s_mix', 'delta_h_mix']
target_ftrs = ['Class']

# one-hot encoder
# ohe = OneHotEncoder(sparse=False)
# cat_values = ohe.fit_transform(df[cat_ftrs])
# cat_ftr_names = ohe.get_feature_names()
# df_cat = pd.DataFrame(data=cat_values,columns = cat_ftr_names)

# minmax scaler
minmax = MinMaxScaler()
num_values = minmax.fit_transform(df[num_ftrs])
df_num = pd.DataFrame(data=num_values,columns = num_ftrs) 

# label encoder
le = LabelEncoder()
target_values = le.fit_transform(df[target_ftrs])
df_target = pd.DataFrame(data=target_values, columns=target_ftrs)

# concatenation of all features
df_preprocessed = pd.concat([df[['Alloys', 'alloy_sep', 'no_elements']], df_num, df_target], axis=1)
df_preprocessed.to_csv(f"{file_path}/preprocessed.csv", index=False)
df_preprocessed.drop(columns=['alloy_sep', 'no_elements'], inplace=True)
df_preprocessed.sample(5)

  y = column_or_1d(y, warn=True)


Unnamed: 0,Alloys,k,vm,tm,vac,delta,delta_chi,delta_s_mix,delta_h_mix,Class
1131,Al0.375Co1Cr1Fe1Ni1,0.430795,0.019065,0.442443,0.254206,0.093188,0.139574,0.663147,0.389827,1
375,Al0.67Co1Cr1Cu1Fe1Mn1Ni1Ti1V1,0.383936,0.029718,0.428196,0.229991,0.090351,0.186986,0.949301,0.374344,0
338,Co1.5Cr1Fe1Ni1.5Ti0.5Mo1.8,0.48162,0.028446,0.537716,0.245807,0.085579,0.235527,0.737453,0.400147,0
897,As1Te1,0.110144,0.178314,0.179488,0.542056,0.294118,0.050955,0.269946,0.443243,0
1607,MoNbTaTi0.75W,0.550163,0.069595,0.790869,0.379734,0.086066,0.309543,0.68292,0.417891,1


In [6]:
import pickle
with open('../pkl-files/minmax_scaler.pkl', 'wb') as pf:
    pickle.dump(minmax, pf)