In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score

In [2]:
# Location of the preprocessed alloy table, relative to this notebook.
file_path = '../dataset/2021Yan-SP-HEA'
data_file = 'preprocessed.csv'
df = pd.read_csv(file_path + '/' + data_file)

# Numeric descriptor columns fed to every classifier below.
num_ftrs = [
    'k', 'vm', 'tm', 'vac',
    'delta', 'delta_chi', 'delta_s_mix', 'delta_h_mix',
]
# Alternative input set that leaves out 'k':
# num_ftrs = ['vm','tm','vac','delta','delta_chi', 'delta_s_mix', 'delta_h_mix']
target_ftrs = ['Class']

# y is selected with a list, so it stays a one-column DataFrame; the
# modelling cells flatten it with `.values.ravel()` before fitting.
X, y = df[num_ftrs], df[target_ftrs]

# Peek at five random rows as a sanity check (unseeded, so the rows shown
# differ from run to run).
df.sample(5)

Unnamed: 0,Alloys,alloy_sep,no_elements,k,vm,tm,vac,delta,delta_chi,delta_s_mix,delta_h_mix,Class
624,Rh0.45Ta0.55,"[('Rh', '0.45'), ('Ta', '0.55')]",2,0.742118,0.061368,0.746793,0.504673,0.106226,0.494325,0.267674,0.231707,0
806,Cr1Te1,"[('Cr', '1'), ('Te', '1')]",2,0.293497,0.129766,0.34141,0.373832,0.0,0.280255,0.269946,0.443243,0
16,Mg54Cu26.5Ag8.5Gd11,"[('Mg', '54'), ('Cu', '26.5'), ('Ag', '8.5'), ...",4,0.194262,0.107286,0.248223,0.188037,0.258229,0.376739,0.471282,0.404302,0
33,Co1Cr2Fe1Ni1,"[('Co', '1'), ('Cr', '2'), ('Fe', '1'), ('Ni',...",4,0.446247,0.015747,0.484658,0.254206,0.05325,0.13706,0.559816,0.421723,0
1506,Mo0.9Re0.1,"[('Mo', '0.9'), ('Re', '0.1')]",2,0.643055,0.055,0.78708,0.242991,0.0625,0.099363,0.102989,0.432025,1


In [3]:
def ML_pipeline_GridSearchCV_kfold(X, y, seed, n_folds, 
                                 clf, param_grid,
                                 test_size=0.20, scoring=None):
    """Tune ``clf`` by K-fold grid search and score it on a held-out test set.

    The data is first split into a training ("other") part and a test part.
    The grid search is cross-validated on the training part only; the best
    parameter combination is refit on the whole training part (GridSearchCV's
    default) and then evaluated once on the untouched test part.

    Parameters
    ----------
    X : feature table accepted by scikit-learn (e.g. a DataFrame).
    y : target labels, one per row of ``X``.
    seed : int
        ``random_state`` of the train/test split.
    n_folds : int
        Number of cross-validation folds.
    clf : estimator
        The classifier to tune. It is wrapped in a one-step pipeline named
        ``'clf'``, so keys in ``param_grid`` must carry the ``clf__`` prefix.
    param_grid : dict
        Hyperparameter grid passed to ``GridSearchCV``.
    test_size : float, default 0.20
        Fraction of the data held out for the final test score.
    scoring : scorer, optional
        Scorer used for both model selection and the returned test score.
        Defaults to accuracy; pass e.g. ``make_scorer(precision_score)``
        to select on another metric.

    Returns
    -------
    (grid, test_score)
        The fitted ``GridSearchCV`` object and its score on the test split.
    """
    if scoring is None:
        scoring = make_scorer(accuracy_score)

    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed)

    # train_test_split already shuffled the rows, so the folds are taken in
    # order without shuffling again.
    kf = KFold(n_splits=n_folds)
    pipe = Pipeline(steps=[('clf', clf)])
    grid = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1,
                        cv=kf, scoring=scoring)
    grid.fit(X_other, y_other)

    # grid.score applies the same scorer that drove the search.
    return grid, grid.score(X_test, y_test)

## Logistic regression with $L_1$ penalty

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
# Candidate values of C: 11 log-spaced points from 1e-4 to 1e2.
param_grid = {'clf__C': np.logspace(-4, 2, 11)}

# Ten independent split/tune/evaluate rounds, each with its own split seed.
# Keep the held-out score and the selected C of every round.
best_scores, best_params = [], []
for i in range(10):
    lr = LogisticRegression(
        penalty='l1', solver='saga', tol=0.01, max_iter=10000,
    )
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y.values.ravel(),
        seed=827 * i, n_folds=5,
        clf=lr, param_grid=param_grid,
    )
    best_scores.append(test_score)
    best_params.append(grid.best_params_)

In [6]:
# Mean and (population, ddof=0) standard deviation of the held-out scores
# collected over the repeated runs.
print("Mean of the best score is %.3f." % np.asarray(best_scores).mean())
print("std of the best score is %.3f." % np.asarray(best_scores).std())

Mean of the best score is 0.786.
std of the best score is 0.014.


In [7]:
# Hyperparameters selected by the grid search in each repeated run.
best_params

[{'clf__C': 6.30957344480193},
 {'clf__C': 25.11886431509577},
 {'clf__C': 6.30957344480193},
 {'clf__C': 25.11886431509577},
 {'clf__C': 25.11886431509577},
 {'clf__C': 6.30957344480193},
 {'clf__C': 25.11886431509577},
 {'clf__C': 25.11886431509577},
 {'clf__C': 25.11886431509577},
 {'clf__C': 6.30957344480193}]

## Random forest classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
# Five tree depths spread evenly over [2, 20]; fractional steps are
# truncated to integers.
max_depths = list(map(int, np.linspace(2, 20, num=5)))
# Even split thresholds 2, 4, ..., 10.
min_samples_splits = range(2, 12, 2)
param_grid = {
    "clf__max_depth": max_depths,
    "clf__min_samples_split": min_samples_splits,
}
max_depths

[2, 6, 11, 15, 20]

In [10]:
# Ten independent rounds: the forest's own seed and the train/test split
# seed both change with the round index.
best_scores, best_params = [], []
for i in range(10):
    clf = RandomForestClassifier(n_estimators=100, random_state=42 * i)
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y.values.ravel(),
        seed=827 * i, n_folds=5,
        clf=clf, param_grid=param_grid,
    )
    best_scores.append(test_score)
    best_params.append(grid.best_params_)

In [11]:
# Mean and (population, ddof=0) standard deviation of the held-out scores,
# followed by the hyperparameters chosen in each run.
print("Mean of the best score is %.3f." % np.asarray(best_scores).mean())
print("std of the best score is %.3f." % np.asarray(best_scores).std())
best_params

Mean of the best score is 0.887.
std of the best score is 0.011.


[{'clf__max_depth': 20, 'clf__min_samples_split': 6},
 {'clf__max_depth': 11, 'clf__min_samples_split': 6},
 {'clf__max_depth': 11, 'clf__min_samples_split': 2},
 {'clf__max_depth': 11, 'clf__min_samples_split': 8},
 {'clf__max_depth': 11, 'clf__min_samples_split': 4},
 {'clf__max_depth': 11, 'clf__min_samples_split': 8},
 {'clf__max_depth': 15, 'clf__min_samples_split': 2},
 {'clf__max_depth': 20, 'clf__min_samples_split': 2},
 {'clf__max_depth': 15, 'clf__min_samples_split': 2},
 {'clf__max_depth': 11, 'clf__min_samples_split': 2}]

## Neural network classifier

In [12]:
from sklearn.neural_network import MLPClassifier

In [13]:
# Base network configuration. The single 10-unit hidden layer is only a
# placeholder: the grid below overrides hidden_layer_sizes during the search.
nnc = MLPClassifier(
    hidden_layer_sizes=[10],
    activation='tanh',
    solver='sgd',
    learning_rate='adaptive',
    batch_size=100,
    alpha=0.0001,
    max_iter=1000,
    random_state=827,
)
# Candidate architectures: two hidden layers of 5 or of 3 units each.
hls = [[5, 5], [3, 3]]
param_grid = {"clf__hidden_layer_sizes": hls}
# Not searched at the moment:
# alphas = [0.0001, 0.001]

In [14]:
# Ten independent rounds. The same `nnc` template (fixed random_state) is
# passed every time; only the train/test split seed changes.
best_scores, best_params = [], []
for i in range(10):
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y.values.ravel(),
        seed=827 * i, n_folds=5,
        clf=nnc, param_grid=param_grid,
    )
    best_scores.append(test_score)
    best_params.append(grid.best_params_)

In [15]:
# Mean and (population, ddof=0) standard deviation of the held-out scores,
# followed by the architecture chosen in each run.
print("Mean of the best score is %.3f." % np.asarray(best_scores).mean())
print("std of the best score is %.3f." % np.asarray(best_scores).std())
best_params

Mean of the best score is 0.785.
std of the best score is 0.015.


[{'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [3, 3]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [5, 5]},
 {'clf__hidden_layer_sizes': [3, 3]},
 {'clf__hidden_layer_sizes': [5, 5]}]

## Gradient boosting classifier

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

In [17]:
# Tree depths 10, 20 and 30; split thresholds 2 and 4.
# NOTE(review): in the recorded run every repeat selected max_depth=10, the
# smallest candidate -- consider extending the grid to shallower trees.
max_depths = list(map(int, np.linspace(10, 30, num=3)))
min_samples_splits = range(2, 6, 2)
param_grid = {
    "clf__max_depth": max_depths,
    "clf__min_samples_split": min_samples_splits,
}


In [18]:
# Ten independent rounds: the booster's own seed and the train/test split
# seed both change with the round index.
best_scores, best_params = [], []
for i in range(10):
    clf = GradientBoostingClassifier(n_estimators=100, random_state=42 * i)
    grid, test_score = ML_pipeline_GridSearchCV_kfold(
        X, y.values.ravel(),
        seed=827 * i, n_folds=5,
        clf=clf, param_grid=param_grid,
    )
    best_scores.append(test_score)
    best_params.append(grid.best_params_)

In [19]:
# Mean and (population, ddof=0) standard deviation of the held-out scores,
# followed by the hyperparameters chosen in each run.
print("Mean of the best score is %.3f." % np.asarray(best_scores).mean())
print("std of the best score is %.3f." % np.asarray(best_scores).std())
best_params

Mean of the best score is 0.887.
std of the best score is 0.014.


[{'clf__max_depth': 10, 'clf__min_samples_split': 2},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 2},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 4},
 {'clf__max_depth': 10, 'clf__min_samples_split': 2},
 {'clf__max_depth': 10, 'clf__min_samples_split': 2}]