In [1]:
import os
import pickle

import numpy as np
import pandas as pd

In [23]:
class VectorDiscriminator:
    """Classifier fitted on a parquet table of pre-computed feature vectors.

    Constructing an instance reads the table at ``data_path``, removes the
    ``target_col`` and ``smiles_col`` columns to obtain the feature matrix,
    and fits a freshly built ``model(**model_params)`` on it.

    Attributes:
        data: The full table exactly as read from ``data_path``.
        X: ``data`` without the target and SMILES columns.
        y: The ``target_col`` column of ``data``.
        model: The object returned by ``model(**model_params).fit(X, y)``.
    """

    def __init__(self, data_path: str, model, model_params: dict,
                 target_col: str = 'label', smiles_col: str = 'smiles'):
        """Load the data and fit the model.

        Args:
            data_path: Location of the parquet file to read.
            model: Callable (typically an estimator class) that accepts
                ``model_params`` as keyword arguments and returns an object
                exposing ``fit(X, y)``.
            model_params: Keyword arguments for ``model``; ``None`` is treated
                as "no arguments".
            target_col: Name of the column holding the training target.
            smiles_col: Name of the non-feature column to drop alongside the
                target.

        Raises:
            KeyError: If ``target_col`` or ``smiles_col`` is not a column of
                the loaded table (raised by ``DataFrame.drop``).
        """
        self.data = pd.read_parquet(data_path)
        self.X = self.data.drop(columns=[target_col, smiles_col])
        self.y = self.data[target_col]
        self.model = model(**(model_params or {})).fit(self.X, self.y)

    def save_model(self, path):
        """Pickle the fitted model to ``path``.

        Missing parent directories are created first, so saving into a
        not-yet-existing folder such as ``./models/`` no longer fails with
        ``FileNotFoundError``.

        Args:
            path: Destination file (``str``, ``bytes`` or ``os.PathLike``).
        """
        # Only path-like targets have a parent directory to prepare; anything
        # else (e.g. an already-open file descriptor) goes straight to open().
        if isinstance(path, (str, bytes, os.PathLike)):
            parent = os.path.dirname(os.fspath(path))
            if parent:
                os.makedirs(parent, exist_ok=True)
        with open(path, 'wb') as file:
            pickle.dump(self.model, file)

    def predict(self, X_vector):
        """Return column 1 of ``self.model.predict_proba(X_vector)``.

        Args:
            X_vector: Feature vectors in whatever form the fitted model's
                ``predict_proba`` accepts.

        Returns:
            One value per input row, taken from the second column of the
            ``predict_proba`` output.
            NOTE(review): for a binary classifier this is presumably the
            probability of the positive class - confirm the class ordering.
        """
        preds = self.model.predict_proba(X_vector)[:, 1]
        return preds

    def __call__(self):
        """Return the underlying fitted model object."""
        return self.model
        
        

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
# Encoded-vector table: numbered feature columns plus 'label' and 'smiles'.
data_path = './data/encoded_data/GRUv3_tatra/mu_d2_epoch_80.parquet'
data = pd.read_parquet(path=data_path)
data.head(5)  # preview the first five rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,label,smiles
0,2.103291,0.843166,0.661608,-0.493326,-1.515839,-2.089014,0.606348,-0.469643,0.877941,-0.208975,...,-0.873071,-0.145492,1.004696,-1.31856,-0.145407,0.28508,-0.300703,0.882296,0,CCOc1ccccc1N1CCN(Cc2csc3ccccc23)CC1
1,0.084732,0.248637,-0.556829,0.009932,1.08134,-0.282894,-0.088232,-0.499152,-1.568471,1.201898,...,0.337907,0.026669,-0.503294,-1.266375,-0.828005,-1.141059,-1.180983,0.555649,1,Clc1cccc(N2CCCN(CCCOc3ccc4CCC(=O)Nc4c3)CC2)c1Cl
2,-0.397815,0.61084,-0.476965,0.347521,0.481587,-0.262544,1.504975,-0.690981,-2.350592,-0.537407,...,-0.901967,-0.225223,-2.738622,0.54964,-1.079602,-1.573063,-1.271712,1.253689,0,Clc1cccc(N2CCN(C\\C=C\\CNC(=O)c3ccc-4c(Cc5cccc...
3,0.73605,0.191576,-0.276621,0.154883,1.483294,-1.731227,0.902199,-1.820714,1.99434,-0.670644,...,-0.545237,-0.073662,-0.094587,-0.723571,-0.014981,-0.762199,0.931147,-0.096931,1,CCCCn1cc(COc2ccc(CN3CCN(CC3)c3ccccc3OC)cc2OC)nn1
4,0.324591,1.113374,0.858957,0.610884,-0.696307,-0.364246,-0.487522,1.782806,-1.641602,-1.595864,...,0.343896,0.047728,0.137577,-1.361735,-1.940879,0.435595,-1.717773,1.121779,1,O=C1N(CCCCCCN2CCN(CC2)c2cccc3ccccc23)S(=O)(=O)...


In [29]:
# Keyword arguments forwarded verbatim to the XGBClassifier constructor.
model_params = dict(n_estimators=100, max_depth=7, tree_method='auto')

# Building the discriminator reads the parquet table and fits the model;
# the fitted estimator is then pickled to disk.
XG = VectorDiscriminator(
    data_path=data_path,
    model=XGBClassifier,
    model_params=model_params,
)
XG.save_model('./models/XGBC.pkl')

In [3]:
import numpy as np
import pandas as pd
import pickle

In [4]:
# Location of the encoded-vector parquet table used throughout the notebook.
data_path = (
    './data/encoded_data/GRUv3_tatra/'
    'mu_d2_epoch_80.parquet'
)

In [7]:
from sklearn.ensemble import RandomForestClassifier as RFC

In [6]:
# Random-forest hyperparameters, unpacked into the estimator constructor below.
RF_params = dict(max_depth=7, n_estimators=200)

In [8]:
# Build the (still unfitted) random forest from the RF_params dictionary.
RF = RFC(**RF_params)

In [9]:
# Print the estimator's repr to confirm the configured hyperparameters.
print(RF)

RandomForestClassifier(max_depth=7, n_estimators=200)


In [None]:
# Print the path of every entry directly inside ./folder.
# os.scandir is used as a context manager so the directory iterator is
# closed deterministically, even if the loop body raises.
with os.scandir('./folder') as entries:
    for entry in entries:
        print(entry.path)

In [13]:
# Re-read the encoded-vector table and preview its first five rows.
data = pd.read_parquet(path=data_path)
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,label,smiles
0,2.103291,0.843166,0.661608,-0.493326,-1.515839,-2.089014,0.606348,-0.469643,0.877941,-0.208975,...,-0.873071,-0.145492,1.004696,-1.31856,-0.145407,0.28508,-0.300703,0.882296,0,CCOc1ccccc1N1CCN(Cc2csc3ccccc23)CC1
1,0.084732,0.248637,-0.556829,0.009932,1.08134,-0.282894,-0.088232,-0.499152,-1.568471,1.201898,...,0.337907,0.026669,-0.503294,-1.266375,-0.828005,-1.141059,-1.180983,0.555649,1,Clc1cccc(N2CCCN(CCCOc3ccc4CCC(=O)Nc4c3)CC2)c1Cl
2,-0.397815,0.61084,-0.476965,0.347521,0.481587,-0.262544,1.504975,-0.690981,-2.350592,-0.537407,...,-0.901967,-0.225223,-2.738622,0.54964,-1.079602,-1.573063,-1.271712,1.253689,0,Clc1cccc(N2CCN(C\\C=C\\CNC(=O)c3ccc-4c(Cc5cccc...
3,0.73605,0.191576,-0.276621,0.154883,1.483294,-1.731227,0.902199,-1.820714,1.99434,-0.670644,...,-0.545237,-0.073662,-0.094587,-0.723571,-0.014981,-0.762199,0.931147,-0.096931,1,CCCCn1cc(COc2ccc(CN3CCN(CC3)c3ccccc3OC)cc2OC)nn1
4,0.324591,1.113374,0.858957,0.610884,-0.696307,-0.364246,-0.487522,1.782806,-1.641602,-1.595864,...,0.343896,0.047728,0.137577,-1.361735,-1.940879,0.435595,-1.717773,1.121779,1,O=C1N(CCCCCCN2CCN(CC2)c2cccc3ccccc23)S(=O)(=O)...


In [15]:
# Target is the 'label' column; features are every column except the
# target and the SMILES string.
y_data = data['label']
X_data = data.drop(columns=['label', 'smiles'])

In [16]:
# Fit the forest on the whole table (X_data / y_data come from the full
# frame; no train/test split is made before this call).
RF.fit(X_data, y_data)

In [18]:
# Persist the fitted random forest so it can be reloaded without retraining.
with open('./models/RF.pkl', mode='wb') as model_file:
    pickle.dump(RF, model_file)

In [19]:
# Reload the pickled forest from disk.
# Unpickling can execute arbitrary code, so only load files you created.
with open('./models/RF.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

In [20]:
# Print the reloaded estimator's repr to check it matches the one that was saved.
print(model)

RandomForestClassifier(max_depth=7, n_estimators=200)


In [33]:
# Second predict_proba column for every row, wrapped in a named Series.
# X_data is the same table the forest was fitted on, so these are
# in-sample predictions.
predictions = pd.Series(
    data=model.predict_proba(X_data)[:, 1],
    name='Prob',
)

In [32]:
# Show the first few values of the 'Prob' series.
print(predictions.head())

0    0.754400
1    0.246716
2    0.655308
3    0.564946
4    0.363109
Name: Prob, dtype: float64


In [34]:
from xgboost import XGBClassifier

In [35]:
# Rebinds `model` (until now the unpickled random forest) to a new, unfitted
# XGBoost classifier.
# NOTE(review): 'gpu_hist' presumably needs a GPU-enabled XGBoost build, and
# newer XGBoost releases reportedly replace it with tree_method='hist' plus
# device='cuda' - confirm against the installed version.
model = XGBClassifier(tree_method='gpu_hist')

In [38]:
from sklearn.svm import SVC
# Unfitted support-vector classifier; probability=True is the scikit-learn
# option that makes predict_proba available on an SVC.
SV = SVC(probability=True)
