In [1]:
# import libraries
import pandas as pd

In [2]:
# import the function class for creating models
from scripts.model_creation import MLPipeline

In [3]:
# Read the two descriptor tables from the local data/ directory.
# The CCDC file is parsed as semicolon-separated with decimal commas;
# the ChEMBL file is read with pandas' default CSV settings.
df1 = pd.read_csv(
    'data/CCDC_descriptors.csv',
    sep=';',
    decimal=',',
)
df2 = pd.read_csv('data/ChEMBL_descriptors.csv')

In [4]:
# Build the MLPipeline instance whose methods are called by every later cell,
# passing df1 as the co-crystal table and df2 as the ChEMBL molecule table.
pipeline = MLPipeline(
    df_cocrystals=df1,
    df_ChEMBL_molecules=df2,
)

In [5]:
# Process the dataset through the pipeline and keep the returned object as
# df_descriptors.
# NOTE(review): the processing steps live in MLPipeline.create_dataset — see
# scripts/model_creation for details.
df_descriptors = pipeline.create_dataset()

#### **Part 1**: Creating a model to predict the mechanical parameter 'Unobstructed'

In [6]:
# Feature selection for the 'Unobstructed' target (correlation and
# feature-importance checks). The result feeds the model comparison,
# hyperparameter search and final evaluation cells of Part 1.
df_unobstructed = pipeline.select_features(
    target_property='Unobstructed',
)

In [7]:
# Compare 8 different ML models before and after data processing for the
# 'Unobstructed' target.
pipeline.test_models_plots(
    'Unobstructed',
    df_unobstructed,
)

In [8]:
# Randomised hyperparameter search for the 'Unobstructed' model; the best
# score and parameter set are printed below the cell.
pipeline.optimization_random_search(
    'Unobstructed',
    df_unobstructed,
)

Best Score of Random Search: 0.7418623231360016
Best Hyperparameters of Random Search: {'learning_rate': 0.07727009010465356, 'max_depth': 2, 'n_estimators': 233, 'subsample': 0.986757445598954}


In [8]:
# Candidate values for the grid search, chosen around the random-search optimum
# reported above (learning_rate ~0.077, max_depth 2, n_estimators 233,
# subsample ~0.987).
list_learning_rate = [0.06, 0.07, 0.08]
list_max_depth = [2, 3, 4]
list_n_estimators = [200, 225, 250]
list_subsample = [0.8, 0.9, 1.0]

In [9]:
# Exhaustive search over the candidate lists defined in the previous cell;
# the returned value is kept for the final evaluation step.
optimal_hyperparameters = pipeline.optimization_grid_search(
    'Unobstructed',
    df_unobstructed,
    list_learning_rate,
    list_n_estimators,
    list_subsample,
    list_max_depth,
)

Best Score of Grid Search: 0.7439331460529368
Best Hyperparameters of Grid Search: {'learning_rate': 0.06, 'max_depth': 3, 'n_estimators': 250, 'subsample': 0.9}


In [10]:
# Report the final accuracy / F1 metrics for 'Unobstructed' using the
# grid-search result, and save the model checkpoint.
pipeline.see_model_scores_and_save(
    optimal_hyperparameters,
    'Unobstructed',
    df_unobstructed,
)

Final accuracy score for Unobstructed:  0.7313432835820896
Final F1 score Unobstructed:  0.7743732590529249


#### **Part 2**: Creating a model to predict the mechanical parameter 'Orthogonal planes'

In [6]:
# Feature selection for the 'Orthogonal planes' target (correlation and
# feature-importance checks). The result feeds the remaining cells of Part 2.
df_orthogonal_planes = pipeline.select_features(
    target_property='Orthogonal planes',
)

In [7]:
# Compare 8 different ML models before and after data processing for the
# 'Orthogonal planes' target.
# NOTE(review): 'Y' / 'N' are passed as threshold / threshold_old; their exact
# meaning is defined inside MLPipeline — confirm there.
pipeline.test_models_plots(
    'Orthogonal planes',
    df_orthogonal_planes,
    threshold='Y',
    threshold_old='N',
)

In [8]:
# Randomised hyperparameter search for the 'Orthogonal planes' model; the best
# score and parameter set are printed below the cell.
pipeline.optimization_random_search(
    'Orthogonal planes',
    df_orthogonal_planes,
)

Best Score of Random Search: 0.8021971083219503
Best Hyperparameters of Random Search: {'learning_rate': 0.01144468203232474, 'max_depth': 7, 'n_estimators': 204, 'subsample': 0.32295789055104174}


In [8]:
# Candidate values for the grid search, chosen around the random-search optimum
# reported above (learning_rate ~0.011, max_depth 7, n_estimators 204,
# subsample ~0.323).
list_learning_rate = [0.01, 0.02, 0.03]
list_max_depth = [6, 7, 8]
list_n_estimators = [175, 200, 225]
list_subsample = [0.3, 0.4, 0.5]

In [9]:
# Exhaustive search over the candidate lists defined in the previous cell;
# the returned value is kept for the final evaluation step.
optimal_hyperparameters = pipeline.optimization_grid_search(
    'Orthogonal planes',
    df_orthogonal_planes,
    list_learning_rate,
    list_n_estimators,
    list_subsample,
    list_max_depth,
)

Best Score of Grid Search: 0.8034416352384961
Best Hyperparameters of Grid Search: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 225, 'subsample': 0.5}


In [13]:
# Report the final accuracy / F1 metrics for 'Orthogonal planes' using the
# grid-search result, and save the model checkpoint. 'Y' is passed as threshold.
pipeline.see_model_scores_and_save(
    optimal_hyperparameters,
    'Orthogonal planes',
    df_orthogonal_planes,
    threshold='Y',
)

Final accuracy score for Orthogonal planes:  0.785240464344942
Final F1 score Orthogonal planes:  0.589540412044374


#### **Part 3**: Creating a model to predict the mechanical parameter 'H-bonds bridging'

In [6]:
# Feature selection for the 'H-bonds bridging' target (correlation and
# feature-importance checks). The result feeds the remaining cells of Part 3.
df_h_bond_bridging = pipeline.select_features(
    target_property='H-bonds bridging',
)

In [7]:
# Compare 8 different ML models before and after data processing for the
# 'H-bonds bridging' target; the return value is kept in df_results.
df_results = pipeline.test_models_plots(
    'H-bonds bridging',
    df_h_bond_bridging,
)

In [8]:
# Randomised hyperparameter search for the 'H-bonds bridging' model; the best
# score and parameter set are printed below the cell.
pipeline.optimization_random_search(
    'H-bonds bridging',
    df_h_bond_bridging,
)

Best Score of Random Search: 0.6969449527360407
Best Hyperparameters of Random Search: {'learning_rate': 0.07727009010465356, 'max_depth': 2, 'n_estimators': 233, 'subsample': 0.986757445598954}


In [9]:
# Candidate values for the grid search, chosen around the random-search optimum
# reported above (learning_rate ~0.077, max_depth 2, n_estimators 233,
# subsample ~0.987).
# The subsample grid runs up to 1.0 so that it brackets the random-search value,
# matching the grid used for 'Unobstructed'. The earlier [0.7, 0.8, 0.9] grid
# stopped short of it, and the grid search then scored below the random search.
# NOTE: the saved outputs of the following cells were produced with the earlier
# grid; re-run them to refresh the reported scores.
list_learning_rate = [0.07, 0.08, 0.09]
list_n_estimators = [200, 225, 250]
list_subsample = [0.8, 0.9, 1.0]
list_max_depth = [2, 3, 4]

In [10]:
# Exhaustive search over the candidate lists defined in the previous cell;
# the returned value is kept for the final evaluation step.
optimal_hyperparameters = pipeline.optimization_grid_search(
    'H-bonds bridging',
    df_h_bond_bridging,
    list_learning_rate,
    list_n_estimators,
    list_subsample,
    list_max_depth,
)

Best Score of Grid Search: 0.6967366247719848
Best Hyperparameters of Grid Search: {'learning_rate': 0.07, 'max_depth': 2, 'n_estimators': 250, 'subsample': 0.9}


In [11]:
# Report the final accuracy / F1 metrics for 'H-bonds bridging' using the
# grid-search result, and save the model checkpoint.
pipeline.see_model_scores_and_save(
    optimal_hyperparameters,
    'H-bonds bridging',
    df_h_bond_bridging,
)

Final accuracy score for H-bonds bridging:  0.7338308457711443
Final F1 score H-bonds bridging:  0.7595505617977527
