In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score

In [2]:
# Relative location of the preprocessed data set.
file_path = '../dataset/2021Yan-SP-HEA'
data_file = 'preprocessed.csv'
df = pd.read_csv(f'{file_path}/{data_file}')

# Numeric descriptor columns used as model inputs.
num_ftrs = ['k','vm','tm','vac','delta','delta_chi', 'delta_s_mix', 'delta_h_mix']
# Alternative feature set without 'k':
# num_ftrs = ['vm','tm','vac','delta','delta_chi', 'delta_s_mix', 'delta_h_mix']
target_ftrs = ['Class']

X = df[num_ftrs]
# y stays a one-column DataFrame; the modelling cells flatten it with
# y.values.ravel() before fitting.
y = df[target_ftrs]

# Seed the preview so the same rows are shown on every Restart & Run All.
df.sample(5, random_state=0)

Unnamed: 0,Alloys,alloy_sep,no_elements,k,vm,tm,vac,delta,delta_chi,delta_s_mix,delta_h_mix,Class
1638,Ta0.75W0.25,"[('Ta', '0.75'), ('W', '0.25')]",2,0.600185,0.075056,0.917197,0.682243,0.091161,0.110322,0.210609,0.417965,1
1135,Fe0.75Mn0.25,"[('Fe', '0.75'), ('Mn', '0.25')]",2,0.411555,0.019064,0.426603,0.252336,0.0,0.15445,0.210609,0.443243,1
299,Co1.5Cr1Fe1Ni1.5Ti0.5Mo1.5,"[('Co', '1.5'), ('Cr', '1'), ('Fe', '1'), ('Ni...",6,0.476285,0.027269,0.527746,0.248331,0.083419,0.229684,0.742432,0.398877,0
1239,Al0.8CrFe1.5MnNi0.5,"[('Al', '0.8'), ('Cr', 1), ('Fe', '1.5'), ('Mn...",5,0.374071,0.026445,0.407029,0.214953,0.121899,0.162055,0.659212,0.371389,1
1357,Pt0.15Ru0.85,"[('Pt', '0.15'), ('Ru', '0.85')]",2,0.585939,0.039699,0.659319,0.351402,0.040964,0.0,0.147273,0.440649,1


In [3]:
def ML_pipeline_GridSearchCV_kfold(X, y, seed, n_folds,
                                 clf, param_grid, scoring=None,
                                 test_size=0.20):
    """Tune a classifier with k-fold grid search and score it on a held-out set.

    The data are split once into a tuning part and a test part. A grid search
    with ``n_folds``-fold cross-validation is run on the tuning part, and the
    refitted best model is then scored on the untouched test part.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : target values aligned with ``X``.
    seed : random state of the train/test split.
    n_folds : number of cross-validation folds.
    clf : the classifier; it is wrapped in a one-step pipeline named ``'clf'``,
        so grid keys must use the ``clf__`` prefix.
    param_grid : hyperparameters to be tuned.
    scoring : scorer used both for model selection and for the test score.
        ``None`` (default) means ``make_scorer(accuracy_score)``; pass e.g.
        ``make_scorer(precision_score)`` to select on precision instead.
    test_size : fraction of the data held out for testing (default 0.20).

    Returns
    -------
    (grid, test_score) : the fitted ``GridSearchCV`` object and the score of
        its best estimator on the held-out test part.
    """
    if scoring is None:
        scoring = make_scorer(accuracy_score)

    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed)

    # train_test_split already shuffled the rows, so the folds are not
    # shuffled again.
    kf = KFold(n_splits=n_folds)
    pipe = Pipeline(steps=[('clf', clf)])
    grid = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1,
                        cv=kf, scoring=scoring)
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)

## Logistic regression with L$_1$ penalty 

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
# Search the regularization parameter C over 11 log-spaced values in [1e-4, 1e2].
param_grid = {'clf__C': np.logspace(-4, 2, 11)}
best_scores = []
best_params = []
for i in range(10):
    # The saga solver shuffles the data internally, so it needs its own seed
    # for the scores to be reproducible; 42*i mirrors the seeding used for
    # the random forest and gradient boosting runs.
    lr = LogisticRegression(penalty='l1', max_iter=10000, solver='saga',
                            tol=0.01, random_state=42*i)
    # A different split seed per run gives ten independent train/test splits.
    grid, test_score = ML_pipeline_GridSearchCV_kfold(X, y.values.ravel(), 
          seed=827*i, n_folds=5, clf=lr, param_grid=param_grid)
    best_scores.append(test_score)
    best_params.append(grid.best_params_)

In [6]:
# Mean and (population) standard deviation of the held-out scores over all runs.
print("Mean of the best score is %.3f." % (np.mean(best_scores),))
print("std of the best score is %.3f." % (np.std(best_scores),))

Mean of the best score is 0.786.
std of the best score is 0.014.


In [6]:
# Show the individual held-out test scores collected in best_scores, one per run.
best_scores

[0.7707182320441989,
 0.7762430939226519,
 0.7900552486187845,
 0.7817679558011049,
 0.7928176795580111,
 0.8176795580110497,
 0.7817679558011049,
 0.7679558011049724,
 0.7955801104972375,
 0.7817679558011049]

In [7]:
# Show the hyperparameters selected by the grid search in each run.
best_params

[{'clf__C': 6.30957344480193},
 {'clf__C': 25.11886431509577},
 {'clf__C': 6.30957344480193},
 {'clf__C': 6.30957344480193},
 {'clf__C': 100.0},
 {'clf__C': 6.30957344480193},
 {'clf__C': 100.0},
 {'clf__C': 25.11886431509577},
 {'clf__C': 6.30957344480193},
 {'clf__C': 25.11886431509577}]

## RandomForest classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
# Candidate tree depths: five evenly spaced values in [2, 20], truncated to ints.
max_depths = list(map(int, np.linspace(2, 20, num=5)))
# Candidate minimum split sizes: 2, 4, 6, 8, 10.
min_samples_splits = range(2, 12, 2)
# Keys carry the 'clf__' prefix because the estimator is the pipeline step 'clf'.
param_grid = dict(clf__max_depth=max_depths,
                  clf__min_samples_split=min_samples_splits)
max_depths

[2, 6, 11, 15, 20]

In [10]:
# Ten independent repetitions: each uses its own train/test split seed (827*i)
# and its own forest seed (42*i).
best_scores, best_params = [], []
for i in range(10):
    clf = RandomForestClassifier(n_estimators=100, random_state=42 * i)
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y.values.ravel(),
        seed=827 * i, n_folds=5,
        clf=clf, param_grid=param_grid,
    )
    # Record the held-out score and the winning hyperparameters of this run.
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [11]:
# Mean and (population) standard deviation of the held-out scores over all runs,
# followed by the hyperparameters chosen in each run.
print("Mean of the best score is %.3f." % (np.mean(best_scores),))
print("std of the best score is %.3f." % (np.std(best_scores),))
best_params

Mean of the best score is 0.887.
std of the best score is 0.011.


[{'clf__max_depth': 20, 'clf__min_samples_split': 6},
 {'clf__max_depth': 11, 'clf__min_samples_split': 6},
 {'clf__max_depth': 11, 'clf__min_samples_split': 2},
 {'clf__max_depth': 11, 'clf__min_samples_split': 8},
 {'clf__max_depth': 11, 'clf__min_samples_split': 4},
 {'clf__max_depth': 11, 'clf__min_samples_split': 8},
 {'clf__max_depth': 15, 'clf__min_samples_split': 2},
 {'clf__max_depth': 20, 'clf__min_samples_split': 2},
 {'clf__max_depth': 15, 'clf__min_samples_split': 2},
 {'clf__max_depth': 11, 'clf__min_samples_split': 2}]

In [12]:
# Show the individual held-out test scores collected in best_scores, one per run.
best_scores

[0.8756906077348067,
 0.8729281767955801,
 0.8812154696132597,
 0.8729281767955801,
 0.8812154696132597,
 0.9033149171270718,
 0.9005524861878453,
 0.9005524861878453,
 0.8867403314917127,
 0.8922651933701657]

## Neural network classifier

In [13]:
from sklearn.neural_network import MLPClassifier

In [14]:
# Base network shared by all runs. The hidden_layer_sizes given here is only a
# placeholder: the grid below supplies the layer layouts actually evaluated.
nnc = MLPClassifier(
    hidden_layer_sizes=[10],
    activation='tanh',
    solver='sgd',
    alpha=0.0001,
    batch_size=100,
    learning_rate='adaptive',
    max_iter=1000,
    random_state=827,
)
# Candidate alpha values, currently not part of the search:
# alphas = [0.0001, 0.001]
# Two-hidden-layer layouts to compare.
hls = [[5, 5], [3, 3]]
param_grid = dict(clf__hidden_layer_sizes=hls)

In [15]:
# Ten repetitions over different train/test splits (seed 827*i); the same
# nnc template is passed to every run.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y.values.ravel(),
        seed=827 * i, n_folds=5,
        clf=nnc, param_grid=param_grid,
    )
    # Record the held-out score and the winning layer layout of this run.
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [16]:
# Mean and (population) standard deviation of the held-out scores over all runs,
# followed by the hyperparameters chosen in each run.
print("Mean of the best score is %.3f." % (np.mean(best_scores),))
print("std of the best score is %.3f." % (np.std(best_scores),))
best_params

Mean of the best score is 0.785.
std of the best score is 0.015.


[{'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [3, 3]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [3, 3]},
 {'clf__hidden_layer_sizes': [5, 5]}]

## Gradient boosting classifier

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

In [18]:
# Candidate tree depths: 10, 20, 30.
# NOTE(review): every recorded run selects max_depth=10, the smallest value
# offered here, so shallower trees may be worth adding to the grid.
max_depths = list(map(int, np.linspace(10, 30, num=3)))
# Candidate minimum split sizes: 2, 4.
min_samples_splits = range(2, 6, 2)
# Keys carry the 'clf__' prefix because the estimator is the pipeline step 'clf'.
param_grid = dict(clf__max_depth=max_depths,
                  clf__min_samples_split=min_samples_splits)


In [19]:
# Ten independent repetitions: each uses its own train/test split seed (827*i)
# and its own booster seed (42*i).
best_scores, best_params = [], []
for i in range(10):
    clf = GradientBoostingClassifier(n_estimators=100, random_state=42 * i)
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y.values.ravel(),
        seed=827 * i, n_folds=5,
        clf=clf, param_grid=param_grid,
    )
    # Record the held-out score and the winning hyperparameters of this run.
    best_scores += [test_score]
    best_params += [grid.best_params_]

In [20]:
# Mean and (population) standard deviation of the held-out scores over all runs,
# followed by the hyperparameters chosen in each run.
print("Mean of the best score is %.3f." % (np.mean(best_scores),))
print("std of the best score is %.3f." % (np.std(best_scores),))
best_params

Mean of the best score is 0.887.
std of the best score is 0.014.


[{'clf__max_depth': 10, 'clf__min_samples_split': 2},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 2},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 2},
 {'clf__max_depth': 10, 'clf__min_samples_split': 2}]