In [1]:
import rdkit
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools

In [2]:
import numpy as np
import pandas as pd

In [27]:
df = pd.read_csv('D:\\chemoinformatics\\serotonin1a.csv')

In [51]:
df['Ki (nM)'] = pd.to_numeric(df['Ki (nM)'],errors = 'coerce')

In [54]:
df.dropna(subset = ['Ki (nM)'], inplace = True)
df.reset_index(inplace = True)
df.drop('index',axis = 1)

In [122]:
smiles = df['Ligand SMILES']

In [123]:
Ki = df['Ki (nM)']

In [124]:
Ki.max()

436515832.2

In [125]:
Ki.min()

0.000178

In [126]:
len(Ki)

7292

In [302]:
Morgan_matrix = np.zeros((1,2048))
l = len(smiles)
for i in range(l):
    
    compound = Chem.MolFromSmiles(smiles[i])
    fp = Chem.AllChem.GetMorganFingerprintAsBitVect(compound, 2, nBits = 2048)
    fp = fp.ToBitString()
    matrix_row = np.array([int(x) for x in list(fp)])
    Morgan_matrix = np.row_stack((Morgan_matrix, matrix_row))
    
    if i%500==0:
        perc = np.round(100* (i/l),1)
        
        print(f"{perc}% done")

Morgan_matrix = np.delete(Morgan_matrix,0,axis = 0)

0.0% done
6.9% done
13.7% done
20.6% done
27.4% done
34.3% done
41.1% done
48.0% done
54.9% done
61.7% done
68.6% done
75.4% done
82.3% done
89.1% done
96.0% done


In [303]:
Morgan_matrix.shape

(7292, 2048)

In [304]:
from sklearn.ensemble import RandomForestClassifier

In [305]:
from sklearn.ensemble import RandomForestRegressor

In [306]:
from sklearn.naive_bayes import BernoulliNB

In [307]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [311]:
reg = RandomForestRegressor(n_estimators = 10)

In [309]:
X_train, X_test, y_train, y_test = train_test_split(Morgan_matrix,Ki, random_state = 3)

In [312]:
reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [313]:
y_pred = reg.predict(X_test)

In [335]:
df2 = y_test.reset_index()

In [336]:
df2['pred'] = y_pred

In [381]:
df2.head(50)

Unnamed: 0,index,Ki (nM),pred
0,2152,2000.0,477.656
1,6743,69.7,2305.438
2,3455,0.2,0.796
3,1201,397.9,1012.439
4,467,180.0,431.464571
5,4830,13.0,664.8191
6,1173,1.0,13.344333
7,529,192.0,143.694
8,7017,90.0,318.360234
9,7117,99.0,52.86


In [374]:
tests = df2['Ki (nM)'] 
preds = df2['pred']

In [375]:
tests = tests > 5

In [376]:
preds = preds > 5

In [377]:
cm = confusion_matrix(tests, preds)

In [378]:
print(cm)

[[ 120  288]
 [  34 1381]]


In [379]:
print(classification_report(tests, preds))

              precision    recall  f1-score   support

       False       0.78      0.29      0.43       408
        True       0.83      0.98      0.90      1415

   micro avg       0.82      0.82      0.82      1823
   macro avg       0.80      0.64      0.66      1823
weighted avg       0.82      0.82      0.79      1823

