In [1]:
import pandas as pd
import numpy
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, roc_auc_score, classification_report, accuracy_score, roc_curve, confusion_matrix, average_precision_score, precision_recall_curve, r2_score
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
# Suppress every warning for the rest of the notebook so cell output stays compact.
# NOTE(review): this is a blanket filter, so it also hides any warnings raised by the
# libraries used in the cells below.
warnings.filterwarnings("ignore")

This notebook trains machine learning models that predict the shape and size of nanomaterials from synthesis parameters. Two models were used: Random Forest and Extreme Gradient Boosting (XGBoost). Their results on the held-out test sample are presented below.

Reproducibility is ensured by fixing the random seeds (`random_state`) of the scikit-learn train/test split and of the Random Forest model.

A description of this section is provided in the article under "Shape and size prediction".

In [2]:
# Read the labelled dataset (path is relative to the notebook's working directory).
dataset_path = '../Datasets/dataset_labeled.xlsx'
df = pd.read_excel(dataset_path)

In [3]:
# Min-max scale the numeric columns found among the first ten columns that remain
# once 'Image_id' is set aside; every other column is left untouched.
# NOTE(review): the scaler is fitted on the whole table, i.e. before the train/test
# split performed later in clf_pipeline.
leading_columns = df.drop(columns=['Image_id']).iloc[:, :10]
num_cols = leading_columns.select_dtypes(include="number").columns
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [4]:
# For every particle shape, add '<shape>_avg' as the row-wise mean of its
# '<shape>_min' and '<shape>_max' columns.
for shape_name in ('Cube', 'Stick', 'Sphere', 'Flat', 'Amorphous'):
    size_bounds = df.loc[:, [f'{shape_name}_min', f'{shape_name}_max']]
    df[f'{shape_name}_avg'] = size_bounds.mean(axis=1)

In [5]:
# Remove the image identifier, the raw min/max size columns (replaced by the
# '<shape>_avg' columns) and the synthesis columns that are not used as features.
columns_to_drop = [
    'Image_id',
    'Cube_min', 'Cube_max',
    'Stick_min', 'Stick_max',
    'Sphere_min', 'Sphere_max',
    'Flat_min', 'Flat_max',
    'Amorphous_min', 'Amorphous_max',
    'Stirring, rpm',
    'Ca ion, mM',
    'CO3 ion, mM',
    'Hexadecyltrimethylammonium bromide',
    'Triton X-100',
    '1-Hexanol',
    'Methyl alcohol',
]
df = df.drop(columns=columns_to_drop)

In [6]:
# Feature matrix: every column from 'HCO3 ion, mM' through 'PVP' (label-based slice,
# both ends inclusive).
first_feature, last_feature = 'HCO3 ion, mM', 'PVP'
X = df.loc[:, first_feature:last_feature]

In [7]:
def classify_size(shape, min_size, max_size, data=None):
    """Add binary small/medium/large size-class columns for one particle shape.

    Three 0/1 columns are written in place to the target frame:

    * ``<shape>_s`` -- ``<shape>_avg`` is strictly below ``min_size``
    * ``<shape>_m`` -- ``<shape>_avg`` lies in ``[min_size, max_size]`` (both inclusive)
    * ``<shape>_l`` -- ``<shape>_avg`` is strictly above ``max_size``

    A row can only be flagged when its ``<shape>`` column equals 1. Rows whose
    ``<shape>_avg`` is NaN fail every comparison and therefore get 0 in all three columns.

    Args:
        shape: Column name of the shape indicator, e.g. ``"Cube"``.
        min_size: Lower bound of the medium class.
        max_size: Upper bound of the medium class.
        data: Frame to annotate. Defaults to the notebook-level ``df`` so existing
            calls keep working; pass a frame explicitly to avoid relying on global state.

    Returns:
        None. The target frame is modified in place.
    """
    frame = df if data is None else data
    # Hoist the two lookups that every size class shares.
    is_present = frame[f"{shape}"] == 1
    avg_size = frame[f"{shape}_avg"]

    frame[f"{shape}_s"] = ((avg_size < min_size) & is_present).astype(int)
    frame[f"{shape}_m"] = (
        (avg_size <= max_size) & (avg_size >= min_size) & is_present
    ).astype(int)
    frame[f"{shape}_l"] = ((avg_size > max_size) & is_present).astype(int)

In [8]:
def _thresholded_scores(model, X_eval, y_true, threshold):
    """Score a fitted binary classifier at a custom decision threshold.

    Column 1 of ``model.predict_proba`` is compared against ``threshold``
    (``>=``) to obtain hard 0/1 predictions.

    Returns:
        tuple[str, str]: accuracy and macro-averaged F1, each formatted with
        two decimals.
    """
    y_pred = (model.predict_proba(X_eval)[:, 1] >= threshold).astype(int)
    accuracy = "%.2f" % accuracy_score(y_true, y_pred)
    macro_f1 = "%.2f" % f1_score(y_true, y_pred, average="macro")
    return accuracy, macro_f1


def _save_feature_importance(model, feature_names, xlabel, path):
    """Save a horizontal bar chart of ``model.feature_importances_`` (ascending) to ``path``."""
    order = model.feature_importances_.argsort()
    fig, ax = plt.subplots()
    ax.barh(feature_names[order], model.feature_importances_[order])
    ax.set_xlabel(xlabel)
    fig.savefig(path)
    # Close explicitly so repeated calls in a loop do not accumulate open figures.
    plt.close(fig)


def clf_pipeline(particle, threshold, rs=42, save_plots=False):
    """Train and evaluate XGBoost, Random Forest and a dummy baseline for one target.

    The notebook-level feature frame ``X`` and the label column ``df[particle]``
    are split 67/33 into train and test parts. Each fitted model is scored on the
    test part after thresholding its positive-class probability at ``threshold``;
    the dummy baseline (``DummyClassifier`` with default settings) is thresholded
    in the same way.

    Args:
        particle: Name of the binary target column in ``df``.
        threshold: Probability cut-off; a sample is predicted positive when its
            positive-class probability is ``>= threshold``.
        rs: ``random_state`` for the train/test split.
        save_plots: When True, write feature-importance bar charts to
            ``fi_xgb_new_<particle>.png`` and ``fi_rfr_new_<particle>.png`` in the
            working directory. Defaults to False, in which case no figure is
            rendered (previously the figures were drawn and immediately closed
            without being saved or shown).

    Returns:
        tuple: ``(metrics, rfr, gbr)`` where ``metrics`` is the list
        ``[xgb_acc, rf_acc, xgb_f1, rf_f1, dummy_acc, dummy_f1]`` of two-decimal
        strings, ``rfr`` is the fitted ``RandomForestClassifier`` and ``gbr`` is
        the fitted ``XGBClassifier``.
    """
    y = df[particle].to_list()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=rs, shuffle=True
    )

    gbr = xgb.XGBClassifier(objective="binary:logistic")
    gbr.fit(X_train, y_train)
    rfr = RandomForestClassifier(max_depth=6, random_state=0)
    rfr.fit(X_train, y_train)
    dummy = DummyClassifier()
    dummy.fit(X_train, y_train)

    # The global seaborn theme is still applied on every call, exactly as before,
    # so any later plots in the notebook keep the same style and figure size.
    sns.set_style("darkgrid")
    sns.set(rc={"figure.figsize": (25, 15)})
    if save_plots:
        _save_feature_importance(
            gbr,
            X.columns,
            "Xgboost Feature Importance for {} for new dataset".format(particle),
            'fi_xgb_new_{}.png'.format(particle),
        )
        _save_feature_importance(
            rfr,
            X.columns,
            "Random Forest Feature Importance for {} for new dataset".format(particle),
            'fi_rfr_new_{}.png'.format(particle),
        )

    gbr_acc, gbr_f1 = _thresholded_scores(gbr, X_test, y_test, threshold)
    rfr_acc, rfr_f1 = _thresholded_scores(rfr, X_test, y_test, threshold)
    dummy_acc, dummy_f1 = _thresholded_scores(dummy, X_test, y_test, threshold)
    return [gbr_acc, rfr_acc, gbr_f1, rfr_f1, dummy_acc, dummy_f1], rfr, gbr


In [9]:
# Evaluate every shape target and every size-class target, then gather the
# test metrics of both models and the dummy baseline into one summary table.
result_columns = [
    "Particle",
    "Count",
    "XGB_Acc",
    "RFR_Acc",
    "XGB_F1",
    "RFR_F1",
    "Dummy_Acc",
    "Dummy_F1",
]

# Size-class targets ('<shape>_s', '<shape>_m', '<shape>_l') must exist in df
# before they can be used as labels below.
classify_size("Cube", 15, 20)
classify_size("Sphere", 10, 14)
classify_size("Stick", 35, 45)
classify_size("Flat", 23, 30)
classify_size("Amorphous", 12, 20)

particles = [
    "Cube",
    "Sphere",
    "Stick",
    "Flat",
    "Amorphous",
    "Cube_s",
    "Cube_m",
    "Cube_l",
    "Sphere_s",
    "Sphere_m",
    "Sphere_l",
    "Stick_s",
    "Stick_m",
    "Stick_l",
    "Flat_s",
    "Flat_m",
    "Flat_l",
    "Amorphous_s",
    "Amorphous_m",
    "Amorphous_l",
]
# Decision threshold per target, in the same order as `particles`.
threshholds = [
    0.6,  # Cube
    0.4,  # Sphere
    0.5,  # Stick
    0.5,  # Flat
    0.4,  # Amorphous
    0.6,  # Cube_s
    0.3,  # Cube_m
    0.4,  # Cube_l
    0.1,  # Sphere_s
    0.3,  # Sphere_m
    0.2,  # Sphere_l
    0.4,  # Stick_s
    0.4,  # Stick_m
    0.4,  # Stick_l
    0.2,  # Flat_s
    0.025,  # Flat_m
    0.025,  # Flat_l
    0.05,  # Amorphous_s
    0.05,  # Amorphous_m
    0.05,  # Amorphous_l
]
# The two lists are paired by position; fail loudly if they drift apart.
if len(particles) != len(threshholds):
    raise ValueError(
        "Expected one threshold per particle target, got {} targets and {} thresholds".format(
            len(particles), len(threshholds)
        )
    )

# Accumulate plain rows and build the DataFrame once instead of enlarging it
# row by row. Values stay strings (as before), so the table is unchanged.
result_rows = []
for particle, threshold in zip(particles, threshholds):
    metrics, rfr, gbr = clf_pipeline(particle, threshold, rs=41)
    result_rows.append([particle, str(df[particle].sum()), *metrics])

df_results = pd.DataFrame(result_rows, columns=result_columns)

In [10]:
# Display the summary table: one row per target with its positive-sample count and
# the accuracy / macro-F1 of XGBoost, Random Forest and the dummy baseline.
df_results

Unnamed: 0,Particle,Count,XGB_Acc,RFR_Acc,XGB_F1,RFR_F1,Dummy_Acc,Dummy_F1
0,Cube,140,0.68,0.8,0.63,0.77,0.67,0.4
1,Sphere,40,0.72,0.8,0.59,0.67,0.8,0.44
2,Stick,84,0.81,0.81,0.8,0.8,0.57,0.36
3,Flat,16,0.87,0.87,0.47,0.47,0.87,0.47
4,Amorphous,34,0.8,0.83,0.55,0.45,0.88,0.47
5,Cube_s,25,0.93,0.9,0.7,0.47,0.9,0.47
6,Cube_m,49,0.72,0.67,0.5,0.56,0.8,0.44
7,Cube_l,66,0.67,0.7,0.63,0.67,0.64,0.39
8,Sphere_s,11,0.83,0.88,0.52,0.72,0.93,0.48
9,Sphere_m,19,0.86,0.86,0.6,0.54,0.91,0.48


In [11]:
# Example: train the shape models for the cubic-nanoparticle target and classify the
# first synthesis condition in the dataset with the Random Forest at threshold 0.6.
cube_threshold = 0.6
metrics, rfr, gbr = clf_pipeline('Cube', cube_threshold, rs=41)
# Take the row from X, the exact feature frame the model was fitted on, instead of
# slicing df by a hard-coded column count. Apply the threshold explicitly:
# rfr.predict() takes no threshold argument and would use its default decision rule.
first_condition = X.iloc[[0]]
(rfr.predict_proba(first_condition)[:, 1] >= cube_threshold).astype(int)

array([1])