In [13]:
import json
import numpy as np
import pandas as pd
import os
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt

In [2]:
# Load the dataset (presumably filtered QM9 molecule records, judging by the file name).
# NOTE: despite the name `df`, np.load returns a NumPy array here, not a pandas DataFrame.
# SECURITY: allow_pickle=True makes NumPy unpickle the stored objects, which can execute
# arbitrary code -- only load this file if it comes from a trusted source.
df = np.load('../qm9_filtered.npy', allow_pickle = True)

In [16]:
# Build the feature rows (flattened 'xyz' arrays) and binary labels
# (sign of the rotation value) for records with exactly one chiral centre.
df_X, df_y = [], []

for line in df:
    # Keep only records that list a single chiral centre.
    if len(line['chiral_centers']) != 1:
        continue

    deg = line['rotation'][1]
    # Records whose rotation is neither positive nor negative (zero, or NaN)
    # have no usable label and are dropped.
    if not (deg > 0 or deg < 0):
        continue
    sign = 1 if deg > 0 else 0  # 1 = positive rotation, 0 = negative rotation

    df_X.append(line['xyz'].flatten())
    df_y.append(sign)

# Convert the accumulated lists to arrays for scikit-learn.
df_X = np.array(df_X)
df_y = np.array(df_y)

In [17]:
# Sanity check: the feature array and the label array must have the same number of rows.
print(df_X.shape[0], df_y.shape[0])

22923 22923


In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.2,
    random_state=120,
)

In [41]:
# NOTE(review): RandomizedSearchCV and ParameterSampler are not used in this cell;
# the imports are kept in case other cells rely on them.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterSampler


# Tree depths to compare; every other hyperparameter is held fixed.
max_depth_values = [5, 50, 100, 280, 420]

# One record per depth, so the scores can be compared after the sweep
# (e.g. with pd.DataFrame(results)). Previously this list was never filled
# and the computed metrics were discarded.
results = []

# Fit one random forest per depth and evaluate it on the held-out split.
for depth in max_depth_values:
    print(depth)
    clf = RandomForestClassifier(max_depth=depth, n_estimators = 1200, min_samples_leaf = 2, max_features = 'sqrt', random_state=40)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    y_train_pred = clf.predict(X_train)

    # Train accuracy is stored next to the test scores so that the gap
    # between them (over-fitting) is visible for each depth.
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)

    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    results.append({
        'max_depth': depth,
        'train_acc': train_acc,
        'test_acc': test_acc,
        'test_f1': test_f1,
        'confusion_matrix': cm,
    })

5
[[1938  469]
 [1425  753]]
50
[[1807  600]
 [ 711 1467]]
100
[[1808  599]
 [ 708 1470]]
280
[[1808  599]
 [ 708 1470]]
420
[[1808  599]
 [ 708 1470]]
