In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score

In [2]:
# Read the preprocessed alloy table from the dataset directory.
file_path = '../dataset/2021Yan-SP-HEA'
data_file = 'preprocessed.csv'
df = pd.read_csv(f'{file_path}/{data_file}')

# To reproduce the results in Yan 2021, use the same procedure and hyperparameters.
# Alternative feature sets that can be swapped in:
# num_ftrs = ['k','vm','tm','vac','delta','delta_chi', 'delta_s_mix', 'delta_h_mix']
# num_ftrs = ['vm','tm','vac','delta','delta_chi', 'delta_s_mix', 'delta_h_mix']
num_ftrs = ['tm', 'delta', 'delta_chi', 'delta_s_mix', 'delta_h_mix']
target_ftrs = ['Class']

# Feature matrix and (single-column) target frame.
X, y = df[num_ftrs], df[target_ftrs]

# Peek at a few random rows as a sanity check.
df.sample(5)

Unnamed: 0,Alloys,alloy_sep,no_elements,k,vm,tm,vac,delta,delta_chi,delta_s_mix,delta_h_mix,Class
1687,Mo0.3NbTiV0.3Zr,"[('Mo', '0.3'), ('Nb', 1), ('Ti', 1), ('V', '0...",5,0.35674,0.088004,0.598808,0.13188,0.134755,0.272183,0.627593,0.442713,1
969,K1Mo1,"[('K', '1'), ('Mo', '1')]",2,0.305369,0.352824,0.390282,0.093458,0.616438,0.853503,0.269946,0.962798,0
1218,Al0.7Co0.3CrFeNi,"[('Al', '0.7'), ('Co', '0.3'), ('Cr', 1), ('Fe...",5,0.404894,0.024233,0.422943,0.231776,0.119904,0.151827,0.653636,0.361412,1
477,Al0.5Cr1Fe1Ni1Ti1V1,"[('Al', '0.5'), ('Cr', '1'), ('Fe', '1'), ('Ni...",6,0.38979,0.035398,0.467261,0.197111,0.096631,0.169206,0.7574,0.337689,0
1672,Nb0.5Ta0.5,"[('Nb', '0.5'), ('Ta', '0.5')]",2,0.490041,0.079894,0.807491,0.411215,0.0,0.063694,0.269946,0.443243,1


In [3]:
def ML_pipeline_GridSearchCV_kfold(X, y, seed, n_folds,
                                   clf, param_grid,
                                   scoring=None, test_size=0.20, n_jobs=-1):
    """Tune ``clf`` by K-fold grid search and score it on a held-out split.

    The data is split once into a development set and a test set. The grid
    search (with K-fold cross-validation) runs on the development set only,
    and the refitted best model is then scored on the untouched test set.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / the estimator.
    y : target values aligned with ``X``.
    seed : random state used for the train/test split.
    n_folds : number of cross-validation folds.
    clf : the classifier; it is wrapped in a one-step pipeline named
        ``'clf'``, so grid keys must be prefixed with ``clf__``.
    param_grid : hyperparameters to be tuned, as accepted by ``GridSearchCV``.
    scoring : scorer passed to ``GridSearchCV``. ``None`` (default) keeps
        the original behaviour, ``make_scorer(accuracy_score)``. Pass e.g.
        ``make_scorer(precision_score)`` to select on precision instead.
    test_size : fraction of the data held out for the test set (default 0.20).
    n_jobs : parallelism passed to ``GridSearchCV`` (default -1).

    Returns
    -------
    (grid, test_score) : the fitted ``GridSearchCV`` object and the score of
        its best estimator on the held-out test set, using the same scorer.
    """
    if scoring is None:
        scoring = make_scorer(accuracy_score)

    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed)

    # The split above already used `seed`, so the folds are not reshuffled.
    kf = KFold(n_splits=n_folds)
    pipe = Pipeline(steps=[('clf', clf)])
    grid = GridSearchCV(pipe, param_grid=param_grid, n_jobs=n_jobs,
                        cv=kf, scoring=scoring)
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)

## RandomForest classifier

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Search space for the 'clf' pipeline step.
# Eight evenly spaced depths between 4 and 20, truncated to whole numbers.
max_depths = np.linspace(4, 20, num=8).astype(int).tolist()
min_samples_splits = range(2, 12, 2)
# Optional extra axis (currently disabled):
# n_estimators = [20, 40, 60, 80, 100, 120]
param_grid = {
    "clf__max_depth": max_depths,
    "clf__min_samples_split": min_samples_splits,
    # "clf__n_estimators": n_estimators,
}
max_depths

[4, 6, 8, 10, 13, 15, 17, 20]

In [6]:
# Repeat the tune-and-evaluate procedure on 10 different train/test splits,
# keeping each run's held-out score and winning hyperparameters.
best_scores, best_params = [], []
for i in range(10):
    # Both the forest seed and the split seed change with the run index.
    clf = RandomForestClassifier(n_estimators=100, random_state=42 * i)
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X,
        y.values.ravel(),  # flatten the target frame to a 1-D array
        seed=827 * i,
        n_folds=10,
        clf=clf,
        param_grid=param_grid,
    )
    best_scores.append(test_score)
    best_params.append(grid.best_params_)

In [7]:
# Summarise the held-out test scores across the repeated runs,
# then display the hyperparameters selected in each run.
print("Mean of the best score is %.3f." % np.asarray(best_scores).mean())
print("std of the best score is %.3f." % np.asarray(best_scores).std())
best_params

Mean of the best score is 0.875.
std of the best score is 0.013.


[{'clf__max_depth': 10, 'clf__min_samples_split': 10},
 {'clf__max_depth': 17, 'clf__min_samples_split': 2},
 {'clf__max_depth': 13, 'clf__min_samples_split': 10},
 {'clf__max_depth': 20, 'clf__min_samples_split': 4},
 {'clf__max_depth': 20, 'clf__min_samples_split': 2},
 {'clf__max_depth': 17, 'clf__min_samples_split': 6},
 {'clf__max_depth': 13, 'clf__min_samples_split': 8},
 {'clf__max_depth': 13, 'clf__min_samples_split': 6},
 {'clf__max_depth': 20, 'clf__min_samples_split': 2},
 {'clf__max_depth': 15, 'clf__min_samples_split': 8}]

In [8]:
# Held-out test score of each repeated run, in run order.
best_scores

[0.8453038674033149,
 0.8701657458563536,
 0.8729281767955801,
 0.8756906077348067,
 0.8839779005524862,
 0.8812154696132597,
 0.8646408839779005,
 0.8977900552486188,
 0.8812154696132597,
 0.8729281767955801]