In [1]:
import json
import numpy as np
import os
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt

In [2]:
df = np.load('../qm9_filtered.npy', allow_pickle = True)

In [3]:
df_X = []
df_y = []

for line in df:
    num = len(line['chiral_centers'])
    if num != 1:
        continue
    
    df_X.append(line['xyz'].flatten())
    if line['chiral_centers'][0][1] == "R":
        df_y.append(0)
    else:
        df_y.append(1)
    
    
df_X = np.array(df_X)
df_y = np.array(df_y)

In [4]:
print(len(df_X), len(df_y))

22973 22973


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.2)

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

min_samples_leaf = [int(x) for x in np.linspace(start = 2, stop = 10, num = 5)]

# number of features at every split
max_features = ['auto', 'sqrt']

# max depth
max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
max_depth.append(None)

# create random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf
 }

# Random search of parameters
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the model
rfc_random.fit(X_train, y_train)

# print results
print(rfc_random.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=6, n_estimators=1800; total time= 1.9min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=6, n_estimators=1800; total time= 2.0min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=6, n_estimators=1800; total time= 1.9min
[CV] END max_depth=420, max_features=sqrt, min_samples_leaf=4, n_estimators=1200; total time= 1.1min
[CV] END max_depth=420, max_features=sqrt, min_samples_leaf=4, n_estimators=1200; total time= 1.2min
[CV] END max_depth=420, max_features=sqrt, min_samples_leaf=4, n_estimators=1200; total time= 1.2min
[CV] END max_depth=140, max_features=auto, min_samples_leaf=2, n_estimators=400; total time=  27.0s
[CV] END max_depth=140, max_features=auto, min_samples_leaf=2, n_estimators=400; total time=  26.4s
[CV] END max_depth=140, max_features=auto, min_samples_leaf=2, n_estimators=400; total time=  26.0s
[CV] END max_depth=260, max_fe

In [7]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators = 1800, min_samples_leaf = 2, max_features = 'sqrt', max_depth = 100)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

y_train_pred = clf.predict(X_train)

print("Train acc: ", accuracy_score(y_train, y_train_pred))
print("test acc: ", accuracy_score(y_test, y_pred))
print("f1: ", f1_score(y_test, y_pred))

Train acc:  0.9998911742300577
test acc:  0.738411316648531
f1:  0.7469473684210526


In [9]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[1619,  624],
       [ 578, 1774]])

In [None]:
torch.save(net, 'torch_RS.mdl')