In [10]:
import json
import numpy as np
import pandas as pd
import os
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt

In [11]:
# Load the object array stored in '../qm9_filtered.npy'; later cells index each
# element by string keys ('chiral_centers', 'xyz').
# NOTE(review): allow_pickle=True makes np.load unpickle arbitrary Python objects,
# which can execute code on load -- only use this with a trusted file.
df = np.load('../qm9_filtered.npy', allow_pickle = True)

In [12]:
# Keep only the records that have exactly one entry in 'chiral_centers'.
single_center = [mol for mol in df if len(mol['chiral_centers']) == 1]

# Features: each record's 'xyz' array flattened to 1-D.
df_X = np.array([mol['xyz'].flatten() for mol in single_center])

# Labels: 0 when the second field of the sole chiral-centre entry is "R",
# 1 for any other value.
df_y = np.array([
    0 if mol['chiral_centers'][0][1] == "R" else 1
    for mol in single_center
])

In [13]:
# Sanity check: the feature and label arrays should hold the same number of samples.
print(len(df_X), len(df_y))

22973 22973


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): no stratify= argument is passed, so the class proportions may
# differ between the train and test parts -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.2, random_state=40)

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterSampler, RandomizedSearchCV

# Base estimator handed to the search object below.
rfc = RandomForestClassifier(random_state=60)

# Candidate values per hyper-parameter. Only max_depth has more than one
# candidate; the other three are pinned to a single value.
# Disabled alternatives kept for reference:
#   n_estimators     -> [int(x) for x in np.linspace(start=1000, stop=2000, num=6)]
#   min_samples_leaf -> [int(x) for x in np.linspace(start=2, stop=10, num=5)]
#   max_features     -> ['auto', 'sqrt']
n_estimators = [1800]          # number of trees in the random forest
min_samples_leaf = [2]
max_features = ['sqrt']        # number of features at every split
max_depth = [5, 50, 100, 180]  # max depth

# Search space, keyed by RandomForestClassifier parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
)

# Configure (but do not fit) the randomized search: 4 draws, 3-fold CV,
# accuracy scoring, all cores. rfc_random.fit(X_train, y_train) is not called
# in this cell, so best_params_ is not available from it.
rfc_random = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=random_grid,
    scoring='accuracy',
    return_train_score=True,
    n_iter=4,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [31]:
# Train one random forest per sampled hyper-parameter combination and record
# its accuracy and weighted F1 on the held-out split.
#
# NOTE(review): every candidate is scored directly on X_test / y_test. Picking
# the best row of this table therefore uses the test set for model selection,
# and the winning score is an optimistic estimate. Consider selecting on
# cross-validated training data and scoring only the final model on the test set.
sampled_params = ParameterSampler(
    rfc_random.param_distributions,
    n_iter=rfc_random.n_iter,
    random_state=rfc_random.random_state,
)

results = []
for params in sampled_params:
    print(params)
    # Train model with the current parameters. n_jobs is taken from the search
    # configuration so tree building runs in parallel instead of on one core;
    # the trees themselves are determined by random_state, not by n_jobs.
    model = RandomForestClassifier(**params, random_state=42, n_jobs=rfc_random.n_jobs)
    model.fit(X_train, y_train)

    # Evaluate on test data
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # The labels are binary (0/1); 'weighted' averages the per-class F1 scores
    # weighted by each class's support.
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store the parameters together with their test metrics
    results.append({**params, 'test_accuracy': acc, 'test_f1_score': f1})

# One row per parameter combination
df_results = pd.DataFrame(results)

df_results

{'n_estimators': 1800, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 5}
{'n_estimators': 1800, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50}
{'n_estimators': 1800, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 100}
{'n_estimators': 1800, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 180}


Unnamed: 0,n_estimators,min_samples_leaf,max_features,max_depth,test_accuracy,test_f1_score
0,1800,2,sqrt,5,0.631556,0.627371
1,1800,2,sqrt,50,0.754951,0.754808
2,1800,2,sqrt,100,0.755169,0.755023
3,1800,2,sqrt,180,0.755169,0.755023


In [32]:
# Write the per-configuration test metrics to CSV; index=False leaves the
# DataFrame's row index out of the file.
df_results.to_csv('RF_RS_class_result.csv', index=False)