In [1]:
import time
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier as Rfc
from sklearn.svm import SVC
from xgboost import XGBClassifier
import sklearn.model_selection
from sklearn.metrics import roc_auc_score, confusion_matrix
import wandb
import os

In [27]:
data_path = f'./data/encoded_data/GRUv3_std_tails/mu_d2_epoch_140.parquet'
data = pd.read_parquet(data_path)
model_name = data_path.split('/')[-2].split('_')[-1]
model_epoch = data_path.split('/')[-1].split('_')[-1].split('.')[0]
train, test = sklearn.model_selection.train_test_split(data, test_size=0.1, random_state=42)

RF_params = {'n_estimators': 200, 
             'max_depth': None, 
             'min_samples_split': 2, 
             'min_samples_leaf': 1, 
             'min_weight_fraction_leaf': 0.0, 
             'max_features': 'sqrt',
             'n_jobs': -1
             }
SV_params = {'C': 50, 
             'kernel': 'rbf', 
             'degree': 3, 
             'gamma': 'scale', 
             'shrinking': True, 
             'probability': True, 
             'max_iter': -1}
XGB_params = {'tree_method':'gpu_hist'}

RF = Rfc(**RF_params)
SV = SVC(**SV_params)
XGB = XGBClassifier(**XGB_params)

In [28]:
train_X = train.drop(['label', 'smiles'], axis=1)
train_y = train['label']
test_X = test.drop(['label', 'smiles'], axis=1)
test_y = test['label']

In [29]:
#-------------------------#
# model can be RF, XGB, SV
model = SV
#-------------------------#

if model == RF:
    model_params = RF_params
elif model == XGB:
    model_params = XGB_params
elif model == SV:
    model_params = SV_params
else:
    raise ValueError('model not supported')

model.fit(train_X, train_y)

In [30]:
name = model.__str__().split('(')[0]
timestamp = (str(time.localtime()[3]) + '-' +
             str(time.localtime()[4]) + '-' +
             str(time.localtime()[5])
             )
name_extended = (name + '_' 
                 + model_name + '_'
                 + model_epoch + '_'
                 + timestamp)

In [31]:
def evaluate(model, test_X, test_y):  
    predictions = model.predict_proba(test_X)[:, 1]
    df = pd.DataFrame()
    df['pred'] = predictions
    df['label'] = test_y.values
    df['pred'] = df['pred'].apply(lambda x: 1 if x>0.5 else 0)
    accuracy = df[df['pred']==df['label']].shape[0]/df.shape[0]
    roc_auc = roc_auc_score(df['label'], df['pred'])
    tn, fp, fn, tp = confusion_matrix(df['label'], df['pred']).ravel()
    metrics = {
        'accuracy': round(accuracy, 4),
        'roc_auc': round(roc_auc, 4),
        'true_positive': round(tp / df.shape[0], 4),
        'true_negative': round(tn / df.shape[0], 4),
        'false_positive': round(fp / df.shape[0], 4),
        'false_negative': round(fn / df.shape[0], 4)
    }
    return metrics

In [32]:
# save model

if not os.path.exists(f'models/{name_extended}'):
    os.mkdir(f'models/{name_extended}')
with open(f'./models/{name_extended}/model.pkl', 'wb') as file:
    pickle.dump(model, file)

metrics = evaluate(model, test_X, test_y)
# wandb

wandb.init(
    project='sklearn-clf',
    config=model_params,
    name=name_extended
)
wandb.log(metrics)
wandb.finish()

metrics_df = pd.DataFrame(metrics, index=[0])
metrics_df.to_csv(f'models/{name_extended}/metrics.csv', index=False)

0,1
accuracy,▁
false_negative,▁
false_positive,▁
roc_auc,▁
true_negative,▁
true_positive,▁

0,1
accuracy,0.8436
false_negative,0.0915
false_positive,0.0649
roc_auc,0.8224
true_negative,0.5745
true_positive,0.2691


In [38]:
df1 = pd.read_parquet('data/encoded_data/GRUv3_std_tails/mu_d2_epoch_140.parquet')

In [39]:
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,label,smiles
0,-0.138566,0.617441,1.290718,1.703221,-1.111823,2.402695,0.16558,-0.134978,0.496483,-1.410663,...,0.539092,-1.202745,0.217991,-1.530631,-0.079655,-0.450289,-1.52747,0.862628,0,CC1Cc2ccccc2N1C(=O)CN1CCN(CCc2ccc(Cl)cc2)CC1
1,-0.823956,0.422209,-0.068458,-0.443445,-0.308531,1.511106,0.863407,-0.581193,-0.039079,0.724934,...,0.709161,0.111472,-0.108498,-0.921411,-1.142715,-1.543491,0.604726,-1.098411,0,Cc1cccn2ncc(CN3CCN(c4ccc(Cl)cc4)CC3)c12
2,1.725901,0.218435,1.128266,-1.707688,-0.399019,0.722445,0.212358,-0.258817,2.234128,1.188225,...,2.507658,-0.217076,-0.829548,-2.212125,-1.726268,-0.464046,0.865684,-0.067523,1,O=C1c2ccccc2C(=O)N1CCCCN1CC=C(c2c[nH]c3ccc(F)c...
3,-1.804558,-0.694444,0.026563,0.07885,1.026008,1.243023,0.653969,-0.652287,-2.110506,1.54377,...,1.203577,1.038115,-1.467167,-2.079267,-1.073771,1.047264,0.008989,-0.955094,0,OC(CCCN1CCN(c2ccccn2)CC1)c1ccc(F)cc1
4,-1.098961,0.892145,-0.300179,-0.39006,-2.805569,0.463258,-0.140106,-0.316375,-0.199842,1.342545,...,0.921991,-1.141605,0.283501,-1.154359,-2.020115,1.05517,0.953195,1.256225,0,COc1ccccc1C1CC1CNCCC1CCC(NC(=O)c2ccccc2)CC1


In [43]:
import numpy as np
fps = np.array(df1)

In [53]:
norm = np.linalg.norm(fps, axis=1)
print(f'Norm: {norm.mean()} +- {norm.std()}')

Norm: 6.465952396392822 +- 1.224114179611206


In [54]:
df2 = pd.read_csv('results/SVC_tails_140_12-50-54_14-23-21/latent_vectors.csv')

In [58]:
mean_score = df2['score'].mean()
std_score = df2['score'].std()
mean_norm = df2['norm'].mean()
std_norm = df2['norm'].std()
print(f'Score: {mean_score} +- {std_score}')
print(f'Norm: {mean_norm} +- {std_norm}')

Score: 0.9494201575324068 +- 0.03319092361817382
Norm: 11.456172135882905 +- 1.1029274760723307
