In [26]:
import time
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier as Rfc
from sklearn.svm import SVC
from xgboost import XGBClassifier
import sklearn.model_selection
from sklearn.metrics import roc_auc_score, confusion_matrix
import wandb
import os

In [49]:
data_path = f'./data/encoded_data/GRUv3_gelu/mu_d2_epoch_200.parquet'
data = pd.read_parquet(data_path)
model_name = data_path.split('/')[-2].split('_')[-1]
model_epoch = data_path.split('/')[-1].split('_')[-1].split('.')[0]
train, test = sklearn.model_selection.train_test_split(data, test_size=0.1, random_state=42)

RF_params = {'n_estimators': 200, 
             'max_depth': None, 
             'min_samples_split': 2, 
             'min_samples_leaf': 1, 
             'min_weight_fraction_leaf': 0.0, 
             'max_features': 'sqrt',
             'n_jobs': -1
             }
SV_params = {'C': 1.0, 
             'kernel': 'rbf', 
             'degree': 3, 
             'gamma': 'scale', 
             'shrinking': True, 
             'probability': True, 
             'max_iter': -1}
XGB_params = {'tree_method':'gpu_hist'}

RF = Rfc(**RF_params)
SV = SVC(**SV_params)
XGB = XGBClassifier(**XGB_params)

In [50]:
train_X = train.drop(['label', 'smiles'], axis=1)
train_y = train['label']
test_X = test.drop(['label', 'smiles'], axis=1)
test_y = test['label']

In [51]:
#-------------------------#
# model can be RF, XGB, SV
model = SV
#-------------------------#

if model == RF:
    model_params = RF_params
elif model == XGB:
    model_params = XGB_params
elif model == SV:
    model_params = SV_params
else:
    raise ValueError('model not supported')

model.fit(train_X, train_y)

In [52]:
name = model.__str__().split('(')[0]
timestamp = (str(time.localtime()[3]) + '-' +
             str(time.localtime()[4]) + '-' +
             str(time.localtime()[5])
             )
name_extended = (name + '_' 
                 + model_name + '_'
                 + model_epoch + '_'
                 + timestamp)

In [53]:
def evaluate(model, test_X, test_y):  
    predictions = model.predict_proba(test_X)[:, 1]
    df = pd.DataFrame()
    df['pred'] = predictions
    df['label'] = test_y.values
    df['pred'] = df['pred'].apply(lambda x: 1 if x>0.5 else 0)
    accuracy = df[df['pred']==df['label']].shape[0]/df.shape[0]
    roc_auc = roc_auc_score(df['label'], df['pred'])
    tn, fp, fn, tp = confusion_matrix(df['label'], df['pred']).ravel()
    metrics = {
        'accuracy': round(accuracy, 4),
        'roc_auc': round(roc_auc, 4),
        'true_positive': round(tp / df.shape[0], 4),
        'true_negative': round(tn / df.shape[0], 4),
        'false_positive': round(fp / df.shape[0], 4),
        'false_negative': round(fn / df.shape[0], 4)
    }
    return metrics

In [54]:
# save model

if not os.path.exists(f'models/{name_extended}'):
    os.mkdir(f'models/{name_extended}')
with open(f'./models/{name_extended}/model.pkl', 'wb') as file:
    pickle.dump(model, file)

metrics = evaluate(model, test_X, test_y)
# wandb

wandb.init(
    project='sklearn-clf',
    config=model_params,
    name=name_extended
)
wandb.log(metrics)
wandb.finish()

metrics_df = pd.DataFrame(metrics, index=[0])
metrics_df.to_csv(f'models/{name_extended}/metrics.csv', index=False)

0,1
accuracy,▁
false_negative,▁
false_positive,▁
roc_auc,▁
true_negative,▁
true_positive,▁

0,1
accuracy,0.8358
false_negative,0.0964
false_positive,0.0678
roc_auc,0.8066
true_negative,0.5929
true_positive,0.2429
