This notebook is used to create two ML models. 

The first model predicts missing (None) values in the 'Absorbance max (nm)' column.

The second model predicts missing (None) values in the 'PL max (nm)' column.

Three algorithms are compared for each model: Bagging, Decision Trees, and Random Forests.

In [2]:
import numpy as np
import pandas as pd
import os
import joblib
import sklearn         
from sklearn import linear_model, datasets
from sklearn.utils import resample
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from tqdm import tqdm

# 1. Model for Absorbance 

In [6]:
# Load the training table for the absorbance model from the working directory.
df_ab = pd.read_csv(filepath_or_buffer='abs_filler.csv')

In [7]:
# This dataset excludes all rows that have 'None' in the 'Absorbance max (nm)' column.
# It is used to train and create the model for predicting absorbance.
# 'PL max (nm)' can still be empty in this table (see the last rows of the
# preview below); it is not one of the model inputs.
df_ab

Unnamed: 0,Growth Temp (Celsius),Metal_mmol (mmol),Chalcogen_mmol (mmol),CA_mmol (mmol),Amines_mmol (mmol),Phosphines_mmol (mmol),S_I_amount (g),S_II_amount (g),Time_min (min),x0_cadmium acetate,...,x4_liquid parafin,x4_octadecene,x4_phenyl ether,x4_trioctylphosphine oxide,x5_None,x5_phosphinic acid,x5_trioctylphosphine oxide,Diameter_nm,Absorbance max (nm),PL max (nm)
0,1.105103,-0.437486,-0.243779,-0.525437,-0.416112,-0.165455,-0.123360,-0.302087,-0.226077,0,...,0,1,0,0,1,0,0,3.41,566,575
1,0.404517,-0.371858,-0.550992,0.129713,-0.498129,-0.264629,-0.681811,-0.302087,-0.236151,0,...,0,0,0,0,1,0,0,2.60,526,556
2,0.404517,-0.371858,-0.550992,0.129713,-0.498129,-0.264629,-0.681811,-0.302087,-0.235796,0,...,0,0,0,0,1,0,0,3.20,559,580
3,0.404517,-0.371858,-0.550992,0.129713,-0.498129,-0.264629,-0.681811,-0.302087,-0.235087,0,...,0,0,0,0,1,0,0,3.60,574,598
4,0.404517,-0.371858,-0.550992,0.129713,-0.498129,-0.264629,-0.681811,-0.302087,-0.234420,0,...,0,0,0,0,1,0,0,3.90,582,601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,0.482360,-0.465119,-0.500554,-0.386536,-0.498129,-0.370585,-0.439276,5.981667,-0.236234,0,...,0,1,0,0,0,0,1,1.70,450,
191,0.482360,-0.465119,-0.500554,-0.386536,-0.498129,-0.370585,-0.439276,5.981667,-0.233377,0,...,0,1,0,0,0,0,1,2.50,530,
192,0.482360,-0.465119,-0.500554,-0.386536,-0.498129,-0.370585,-0.439276,5.981667,-0.206534,0,...,0,1,0,0,0,0,1,3.50,585,
193,0.482360,-0.465119,-0.500554,-0.386536,-0.498129,-0.370585,-0.439276,5.981667,-0.180818,0,...,0,1,0,0,0,0,1,3.70,590,


In [9]:
# Feature columns fed to both models, listed one per line.
# The order of this list is the feature order seen by the estimators, so it
# must not be rearranged.
# Note: the particle diameter is included as an input when predicting Abs max.
input_col = [
    # Numeric synthesis-condition columns
    'Growth Temp (Celsius)',
    'Metal_mmol (mmol)',
    'Chalcogen_mmol (mmol)',
    'Amines_mmol (mmol)',
    'CA_mmol (mmol)',
    'Phosphines_mmol (mmol)',
    'S_I_amount (g)',
    'S_II_amount (g)',
    'Time_min (min)',
    # x0_* columns (shown as 0/1 values in the data preview)
    'x0_cadmium acetate',
    # NOTE(review): 'dihtdrate' looks like a misspelling of 'dihydrate'; it is
    # kept verbatim because it presumably matches a CSV header -- confirm.
    'x0_cadmium acetate dihtdrate',
    'x0_cadmium acetate dihydrate',
    'x0_cadmium oxide',
    'x0_cadmium stearate',
    'x0_dimethylcadmium',
    # x1_* columns
    'x1_None',
    'x1_benzoic acid',
    'x1_dodecylphosphonic acid',
    'x1_ethylphosphonic acid',
    'x1_lauric acid',
    'x1_myrstic acid',
    'x1_oleic acid',
    'x1_stearic acid',
    # x2_* columns
    'x2_2-6-dimethylpyridine',
    'x2_None',
    'x2_aniline',
    'x2_benzylamine',
    'x2_dioctylamine/hexadecylamine',
    'x2_dodecylamine',
    'x2_heptylamine',
    'x2_hexadecylamine',
    'x2_octadecylamine',
    'x2_octylamine',
    'x2_oleylamine',
    'x2_pyridine',
    'x2_trioctylamine',
    # x3_* columns
    'x3_None',
    'x3_diphenylphosphine',
    'x3_tributylphosphine',
    'x3_trioctylphosphine',
    'x3_triphenylphosphine',
    # x4_* columns
    'x4_None',
    'x4_liquid parafin',
    'x4_octadecene',
    'x4_phenyl ether',
    'x4_trioctylphosphine oxide',
    # x5_* columns
    'x5_None',
    'x5_phosphinic acid',
    'x5_trioctylphosphine oxide',
    # Particle diameter
    'Diameter_nm',
]

# Target column of the absorbance model (kept as a list so Y_ab is a DataFrame).
output_col_ab = ['Absorbance max (nm)']

# Feature matrix and target for the absorbance model.
X_ab, Y_ab = df_ab[input_col], df_ab[output_col_ab]

In [12]:
# Hold out 15% of the absorbance rows as a test set; the fixed random_state
# makes the shuffled split reproducible.
X_train_ab, X_test_ab, Y_train_ab, Y_test_ab = train_test_split(
    X_ab,
    Y_ab,
    test_size=0.15,
    random_state=45,
    shuffle=True,
)

## 1a. Bagging

In [13]:
# Exhaustive grid search over three BaggingRegressor settings:
#   i -> max_depth of the DecisionTreeRegressor base learner (1..20)
#   j -> n_estimators, the number of bagged trees (1..20)
#   k -> random_state (5, 10, ..., 75)
# Every combination is fitted on the training split and scored by mean squared
# error on the test split; the combination with the smallest MSE is printed.
# NOTE(review): the winner is chosen on the same test split that is used to
# report the error, and random_state is a seed rather than a hyper-parameter,
# so the printed minimum MSE is probably optimistic -- consider scoring on a
# validation split or with cross-validation instead.

# Start from +inf so the first evaluated model always becomes the incumbent,
# whatever the scale of the MSE (a fixed 99999 would silently select nothing
# if every MSE were larger than that).
min_mse = np.inf

min_i, min_j, min_k = 0, 0, 0

# The flattened training target does not change between iterations.
y_train_ab_1d = np.ravel(Y_train_ab)

for i in tqdm(range(1, 21)):
    for j in range(1, 21):
        for k in range(5, 80, 5):

            # The base learner is passed positionally: its keyword was renamed
            # from `base_estimator` to `estimator` in scikit-learn 1.2 and the
            # old name was removed in 1.4, while the first positional slot is
            # the base learner in every release.
            B_regr_ab = BaggingRegressor(DecisionTreeRegressor(max_depth=i),
                                         n_estimators=j,
                                         random_state=k)

            B_regr_ab.fit(X_train_ab, y_train_ab_1d)

            B_Y_pred_ab = B_regr_ab.predict(X_test_ab)

            mse = mean_squared_error(Y_test_ab, B_Y_pred_ab)

            # Keep the first combination that reaches a strictly lower MSE.
            if mse < min_mse:
                min_mse = mse
                min_i = i
                min_j = j
                min_k = k

# After the loops B_regr_ab holds the LAST model fitted, not the best one.
print(min_mse, min_i, min_j, min_k)

100%|██████████| 20/20 [02:21<00:00,  7.08s/it]

126.50982360670251 5 9 45





## 1b. Decision Trees

In [14]:
# Exhaustive grid search over three DecisionTreeRegressor settings:
#   i -> max_depth (1..20)
#   j -> max_features, given as an integer number of features (1..20)
#   k -> random_state (5, 10, ..., 75)
# Every combination is fitted on the training split and scored by mean squared
# error on the test split; the combination with the smallest MSE is printed.
# NOTE(review): the winner is chosen on the same test split that is used to
# report the error, and random_state is a seed rather than a hyper-parameter,
# so the printed minimum MSE is probably optimistic -- consider scoring on a
# validation split or with cross-validation instead.

# Start from +inf so the first evaluated model always becomes the incumbent,
# whatever the scale of the MSE (a fixed 99999 would silently select nothing
# if every MSE were larger than that).
min_mse = np.inf

min_i, min_j, min_k = 0, 0, 0

for i in tqdm(range(1, 21)):
    for j in range(1, 21):
        for k in range(5, 80, 5):

            DT_regr_ab = DecisionTreeRegressor(max_depth=i,
                                               max_features=j,
                                               random_state=k)

            DT_regr_ab.fit(X_train_ab, Y_train_ab)

            DT_Y_pred_ab = DT_regr_ab.predict(X_test_ab)

            mse = mean_squared_error(Y_test_ab, DT_Y_pred_ab)

            # Keep the first combination that reaches a strictly lower MSE.
            if mse < min_mse:
                min_mse = mse
                min_i = i
                min_j = j
                min_k = k

# After the loops DT_regr_ab holds the LAST model fitted, not the best one.
print(min_mse, min_i, min_j, min_k)

100%|██████████| 20/20 [00:37<00:00,  1.88s/it]

182.47119592820937 6 20 65





## 1c. Random Forests

In [15]:
# Exhaustive grid search over three RandomForestRegressor settings:
#   i -> max_depth (1..20)
#   j -> n_estimators, the number of trees (1..20)
#   k -> max_features, given as an integer number of features (2, 4, ..., 48)
# random_state is fixed to 45 for every candidate.
# Every combination is fitted on the training split and scored by mean squared
# error on the test split; the combination with the smallest MSE is printed.
# NOTE(review): the winner is chosen on the same test split that is used to
# report the error, so the printed minimum MSE is probably optimistic --
# consider scoring on a validation split or with cross-validation instead.

# Start from +inf so the first evaluated model always becomes the incumbent,
# whatever the scale of the MSE (a fixed 99999 would silently select nothing
# if every MSE were larger than that).
min_mse = np.inf
min_i, min_j, min_k = 0, 0, 0

# The flattened training target does not change between iterations.
y_train_ab_1d = np.ravel(Y_train_ab)

for i in tqdm(range(1, 21)):
    for j in range(1, 21):
        for k in range(2, 50, 2):
            RF_regr_ab = RandomForestRegressor(max_depth=i,
                                               n_estimators=j,
                                               max_features=k,
                                               random_state=45)
            RF_regr_ab.fit(X_train_ab, y_train_ab_1d)
            RF_Y_pred_ab = RF_regr_ab.predict(X_test_ab)

            mse = mean_squared_error(Y_test_ab, RF_Y_pred_ab)
            # Keep the first combination that reaches a strictly lower MSE.
            if mse < min_mse:
                min_mse = mse
                min_i = i
                min_j = j
                min_k = k

# After the loops RF_regr_ab holds the LAST model fitted, not the best one.
print(min_mse, min_i, min_j, min_k)

100%|██████████| 20/20 [03:03<00:00,  9.17s/it]

91.7186666666666 14 5 28





## Conclusion:

Random Forest gives the lowest test-set MSE (≈ 91.72) of the three algorithms, with the parameter combination: max_depth = **14**, n_estimators = **5**, max_features = **28**, and random_state = **45**.

## Saving model

In [37]:
# Rebuild the absorbance Random Forest with the hyper-parameters stated in the
# conclusion, fit it on the training split, and predict the test split.
best_rf_params_ab = {
    "max_depth": 14,
    "n_estimators": 5,
    "max_features": 28,
    "random_state": 45,
}
RF_regr_ab = RandomForestRegressor(**best_rf_params_ab)
# np.ravel turns the single-column target DataFrame into a 1-D array.
RF_regr_ab.fit(X_train_ab, np.ravel(Y_train_ab))
RF_Y_pred_ab = RF_regr_ab.predict(X_test_ab)

In [38]:
# Persist the fitted absorbance model next to the notebook.
joblib.dump(value=RF_regr_ab, filename="./random_forest_ab.joblib")

['./random_forest_ab.joblib']

# 2. Model for Photoluminescence  (PL)

In [19]:
# Load the training table for the photoluminescence (PL) model from the working directory.
df_pl = pd.read_csv(filepath_or_buffer='pl_filler.csv')

In [20]:
# This dataset excludes all rows that have 'None' in the 'PL max (nm)' column.
# It is used to train and create the model for predicting PL max.
# 'Absorbance max (nm)' can still be empty in this table (see the preview
# below); it is not one of the model inputs.
df_pl

Unnamed: 0,Growth Temp (Celsius),Metal_mmol (mmol),Chalcogen_mmol (mmol),CA_mmol (mmol),Amines_mmol (mmol),Phosphines_mmol (mmol),S_I_amount (g),S_II_amount (g),Time_min (min),x0_cadmium acetate,...,x4_liquid parafin,x4_octadecene,x4_phenyl ether,x4_trioctylphosphine oxide,x5_None,x5_phosphinic acid,x5_trioctylphosphine oxide,Diameter_nm,Absorbance max (nm),PL max (nm)
0,1.105103,-0.437486,-0.243779,-0.525437,-0.416112,-0.165455,-0.123360,-0.302087,-0.226077,0,...,0,1,0,0,1,0,0,3.41,566,575
1,-0.062540,-0.472027,-0.436361,-0.305510,-0.498129,-0.375671,-0.241193,-0.302087,-0.235463,0,...,0,1,0,0,1,0,0,2.50,474,617
2,0.326674,-0.472027,-0.587674,-0.525437,-0.498129,-0.451959,-0.527427,-0.302087,-0.235463,0,...,0,1,0,0,1,0,0,1.99,,497
3,0.326674,-0.472027,-0.587674,-0.525437,-0.498129,-0.445602,-0.527427,-0.302087,-0.235463,0,...,0,1,0,0,1,0,0,2.13,,510
4,0.326674,-0.472027,-0.587674,-0.525437,-0.498129,-0.445602,-0.527427,-0.302087,-0.234420,0,...,0,1,0,0,1,0,0,2.27,,517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,0.793731,-0.476863,-0.326314,-0.465246,-0.295431,0.023570,-0.464015,-0.302087,-0.226077,0,...,0,1,0,0,1,0,0,4.10,,580
153,0.793731,-0.476863,-0.326314,-0.465246,0.017402,0.023570,-0.464015,-0.302087,-0.236401,0,...,0,1,0,0,1,0,0,3.70,,535
154,0.793731,-0.476863,-0.326314,-0.465246,0.079501,0.023570,-0.464015,-0.302087,-0.236401,0,...,0,1,0,0,1,0,0,4.10,,575
155,0.793731,-0.476863,-0.326314,-0.465246,-0.093905,0.023570,-0.464015,-0.302087,-0.236401,0,...,0,1,0,0,1,0,0,3.60,,530


In [26]:
# Target column of the PL model (kept as a list so Y_pl is a DataFrame).
output_col_pl = ['PL max (nm)']

# The PL model reuses the feature list `input_col` defined for the absorbance model.
X_pl, Y_pl = df_pl[input_col], df_pl[output_col_pl]

In [27]:
# Hold out 15% of the PL rows as a test set; the fixed random_state makes the
# shuffled split reproducible.
X_train_pl, X_test_pl, Y_train_pl, Y_test_pl = train_test_split(
    X_pl,
    Y_pl,
    test_size=0.15,
    random_state=45,
    shuffle=True,
)

## 2a. Bagging


In [31]:
# Exhaustive grid search over three BaggingRegressor settings:
#   i -> max_depth of the DecisionTreeRegressor base learner (1..20)
#   j -> n_estimators, the number of bagged trees (1..20)
#   k -> random_state (5, 10, ..., 75)
# Every combination is fitted on the PL training split and scored by mean
# squared error on the PL test split; the combination with the smallest MSE is
# printed.
# NOTE(review): the winner is chosen on the same test split that is used to
# report the error, and random_state is a seed rather than a hyper-parameter,
# so the printed minimum MSE is probably optimistic -- consider scoring on a
# validation split or with cross-validation instead.

# Start from +inf so the first evaluated model always becomes the incumbent,
# whatever the scale of the MSE (a fixed 99999 would silently select nothing
# if every MSE were larger than that).
min_mse = np.inf

min_i, min_j, min_k = 0, 0, 0

# The flattened training target does not change between iterations.
y_train_pl_1d = np.ravel(Y_train_pl)

for i in tqdm(range(1, 21)):
    for j in range(1, 21):
        for k in range(5, 80, 5):

            # The base learner is passed positionally: its keyword was renamed
            # from `base_estimator` to `estimator` in scikit-learn 1.2 and the
            # old name was removed in 1.4, while the first positional slot is
            # the base learner in every release.
            B_regr_pl = BaggingRegressor(DecisionTreeRegressor(max_depth=i),
                                         n_estimators=j,
                                         random_state=k)

            B_regr_pl.fit(X_train_pl, y_train_pl_1d)

            B_Y_pred_pl = B_regr_pl.predict(X_test_pl)

            mse = mean_squared_error(Y_test_pl, B_Y_pred_pl)

            # Keep the first combination that reaches a strictly lower MSE.
            if mse < min_mse:
                min_mse = mse
                min_i = i
                min_j = j
                min_k = k

# After the loops B_regr_pl holds the LAST model fitted, not the best one.
print(min_mse, min_i, min_j, min_k)

100%|██████████| 20/20 [02:14<00:00,  6.73s/it]

178.91666666666666 9 1 20





## 2b. Decision Trees

In [32]:
# Exhaustive grid search over three DecisionTreeRegressor settings:
#   i -> max_depth (1..20)
#   j -> max_features, given as an integer number of features (1..20)
#   k -> random_state (5, 10, ..., 75)
# Every combination is fitted on the PL training split and scored by mean
# squared error on the PL test split; the combination with the smallest MSE is
# printed.
# NOTE(review): the winner is chosen on the same test split that is used to
# report the error, and random_state is a seed rather than a hyper-parameter,
# so the printed minimum MSE is probably optimistic -- consider scoring on a
# validation split or with cross-validation instead.

# Start from +inf so the first evaluated model always becomes the incumbent,
# whatever the scale of the MSE (a fixed 99999 would silently select nothing
# if every MSE were larger than that).
min_mse = np.inf

min_i, min_j, min_k = 0, 0, 0

for i in tqdm(range(1, 21)):
    for j in range(1, 21):
        for k in range(5, 80, 5):

            DT_regr_pl = DecisionTreeRegressor(max_depth=i,
                                               max_features=j,
                                               random_state=k)

            DT_regr_pl.fit(X_train_pl, Y_train_pl)

            DT_Y_pred_pl = DT_regr_pl.predict(X_test_pl)

            mse = mean_squared_error(Y_test_pl, DT_Y_pred_pl)

            # Keep the first combination that reaches a strictly lower MSE.
            if mse < min_mse:
                min_mse = mse
                min_i = i
                min_j = j
                min_k = k

# After the loops DT_regr_pl holds the LAST model fitted, not the best one.
print(min_mse, min_i, min_j, min_k)

100%|██████████| 20/20 [00:35<00:00,  1.77s/it]

208.33333333333334 12 13 45





## 2c. Random Forests

In [33]:
# Exhaustive grid search over three RandomForestRegressor settings:
#   i -> max_depth (1..20)
#   j -> n_estimators, the number of trees (1..20)
#   k -> max_features, given as an integer number of features (2, 4, ..., 48)
# random_state is fixed to 45 for every candidate.
# Every combination is fitted on the PL training split and scored by mean
# squared error on the PL test split; the combination with the smallest MSE is
# printed.
# NOTE(review): the winner is chosen on the same test split that is used to
# report the error, so the printed minimum MSE is probably optimistic --
# consider scoring on a validation split or with cross-validation instead.

# Start from +inf so the first evaluated model always becomes the incumbent,
# whatever the scale of the MSE (a fixed 99999 would silently select nothing
# if every MSE were larger than that).
min_mse = np.inf
min_i, min_j, min_k = 0, 0, 0

# The flattened training target does not change between iterations.
y_train_pl_1d = np.ravel(Y_train_pl)

for i in tqdm(range(1, 21)):
    for j in range(1, 21):
        for k in range(2, 50, 2):
            RF_regr_pl = RandomForestRegressor(max_depth=i,
                                               n_estimators=j,
                                               max_features=k,
                                               random_state=45)
            RF_regr_pl.fit(X_train_pl, y_train_pl_1d)
            RF_Y_pred_pl = RF_regr_pl.predict(X_test_pl)

            mse = mean_squared_error(Y_test_pl, RF_Y_pred_pl)
            # Keep the first combination that reaches a strictly lower MSE.
            if mse < min_mse:
                min_mse = mse
                min_i = i
                min_j = j
                min_k = k

# After the loops RF_regr_pl holds the LAST model fitted, not the best one.
print(min_mse, min_i, min_j, min_k)

100%|██████████| 20/20 [03:04<00:00,  9.24s/it]

176.8154761904759 12 7 8





## Conclusion 

Random Forest gives the lowest test-set MSE (≈ 176.82) of the three algorithms, with the parameter combination: max_depth = **12**, n_estimators = **7**, max_features = **8**, and random_state = **45**.

## Saving model

In [39]:
# Rebuild the PL Random Forest with the hyper-parameters stated in the
# conclusion, fit it on the training split, and predict the test split.
best_rf_params_pl = {
    "max_depth": 12,
    "n_estimators": 7,
    "max_features": 8,
    "random_state": 45,
}
RF_regr_pl = RandomForestRegressor(**best_rf_params_pl)
# np.ravel turns the single-column target DataFrame into a 1-D array.
RF_regr_pl.fit(X_train_pl, np.ravel(Y_train_pl))
RF_Y_pred_pl = RF_regr_pl.predict(X_test_pl)

In [40]:
# Persist the fitted PL model next to the notebook.
joblib.dump(value=RF_regr_pl, filename="./random_forest_pl.joblib")

['./random_forest_pl.joblib']