In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Read data

In [2]:
bioassay = pd.read_csv('AID_1706_datatable_all.csv.gz',compression='gzip',skiprows=range(1,5))

In [3]:
bioassay

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Inhibition
0,1,842121,6603008,Inactive,0,,,-1.55
1,2,842122,6602571,Inactive,2,,,1.83
2,3,842123,6602616,Inactive,0,,,-3.07
3,4,842124,644371,Inactive,0,,,-1.06
4,5,842125,6603132,Inactive,0,,,0.35
...,...,...,...,...,...,...,...,...
290888,290889,51090805,24892704,Inactive,3,,,3.10
290889,290890,51090809,24892705,Inactive,5,,,4.06
290890,290891,51090810,18554311,Inactive,0,,,0.33
290891,290892,51090820,7192900,Inactive,0,,,-2.17


In [4]:
scores = pd.read_csv('DockFlow_vina.csv.gz',compression='gzip')
scores.sort_values(['id','vina_score'], inplace=True) # make sure that the compounds are ordered
for SF in ['smina']:
    df = pd.read_csv(f'ScoreFlow_{SF}.csv.gz',compression='gzip')
    df.sort_values(['id', SF+'_score'], inplace=True) # make sure that the compounds are ordered
    scores = pd.merge(scores,df,on=['id','pose'])

In [5]:
scores.rename({'id': 'PUBCHEM_SID'}, axis='columns', inplace=True)

In [6]:
# Merge what really matters.
data = pd.merge(bioassay[['PUBCHEM_SID','PUBCHEM_ACTIVITY_OUTCOME','Inhibition']], scores)
data

Unnamed: 0,PUBCHEM_SID,PUBCHEM_ACTIVITY_OUTCOME,Inhibition,pose,vina_score,smina_score
0,842134,Inactive,0.41,1,-6.5,-6.68219
1,842134,Inactive,0.41,2,-6.3,-6.33347
2,842134,Inactive,0.41,3,-6.3,-6.63991
3,842134,Inactive,0.41,4,-6.1,-6.69704
4,842134,Inactive,0.41,5,-6.1,-6.39755
...,...,...,...,...,...,...
2723367,51090821,Inactive,0.19,6,-5.9,-6.36111
2723368,51090821,Inactive,0.19,7,-5.7,-5.84297
2723369,51090821,Inactive,0.19,8,-5.4,-5.63190
2723370,51090821,Inactive,0.19,9,-5.3,-5.37468


In [7]:
top = data.query('pose == 1')

In [8]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics

In [9]:
X = top[['vina_score','smina_score']].values
y = pd.Categorical(top['PUBCHEM_ACTIVITY_OUTCOME']).codes # 1 para inativo, 0 para ativo

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# Dummy classifier

In [11]:
from sklearn.dummy import DummyClassifier

In [12]:
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

In [13]:
# checking unique labels
print('Unique predicted labels: ', (np.unique(dummy_pred)))

# checking accuracy
print('Test score: ', metrics.accuracy_score(y_test, dummy_pred))

Unique predicted labels:  [1]
Test score:  0.9985315280918676


In [14]:
def print_metrics(y_test, y_pred):
    print(f'accuracy_score: {metrics.accuracy_score(y_test, y_pred)}')
    print(f'f1_score: {metrics.f1_score(y_test, y_pred)}')
    print(f'recall_score: {metrics.recall_score(y_test, y_pred)}')

# Logistic regression

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)

In [17]:
lr_pred = lr.predict(X_test)

In [18]:
predictions = pd.DataFrame(lr_pred)
predictions[0].value_counts()

1    68098
Name: 0, dtype: int64

In [19]:
print_metrics(y_test, lr_pred)

accuracy_score: 0.9985315280918676
f1_score: 0.9992652245473783
recall_score: 1.0


# Random forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rfc = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

In [22]:
rfc_pred = rfc.predict(X_test)

In [23]:
print_metrics(y_test, rfc_pred)

accuracy_score: 0.9976210755088255
f1_score: 0.9988091212490995
recall_score: 0.9990882084767199


In [24]:
def predict(resampled):
    y_train = resampled['PUBCHEM_ACTIVITY_OUTCOME']
    X_train = resampled.drop('PUBCHEM_ACTIVITY_OUTCOME', axis=1)
    
    resampled = LogisticRegression(solver='liblinear').fit(X_train, y_train)
    resampled_pred = resampled.predict(X_test)
    
    return resampled_pred

# Oversample minority class

In [25]:
from sklearn.utils import resample

In [26]:
X = pd.concat([pd.DataFrame(X_train, columns=['vina_score','smina_score']), pd.Series(y_train, name='PUBCHEM_ACTIVITY_OUTCOME')], axis=1)

In [27]:
inactive = X.query('PUBCHEM_ACTIVITY_OUTCOME == 1')
active = X.query('PUBCHEM_ACTIVITY_OUTCOME == 0')

In [28]:
new_active = resample(active, replace=True, n_samples=len(inactive), random_state=27) 

In [29]:
upsampled = pd.concat([inactive, new_active])

In [30]:
upsampled['PUBCHEM_ACTIVITY_OUTCOME'].value_counts()

1    204007
0    204007
Name: PUBCHEM_ACTIVITY_OUTCOME, dtype: int64

In [31]:
upsampled_pred = predict(upsampled)
print_metrics(y_test, upsampled_pred)

accuracy_score: 0.5353754882669095
f1_score: 0.6970857427335044
recall_score: 0.535398099944116


# Undersample majority class

In [32]:
new_inactive = resample(inactive, replace=False, n_samples=len(active), random_state=27)

In [33]:
downsampled = pd.concat([new_inactive, active])

In [34]:
downsampled['PUBCHEM_ACTIVITY_OUTCOME'].value_counts()

1    285
0    285
Name: PUBCHEM_ACTIVITY_OUTCOME, dtype: int64

In [35]:
downsampled_pred = predict(downsampled)
print_metrics(y_test, downsampled_pred)

accuracy_score: 0.5218361772739287
f1_score: 0.6854581634821585
recall_score: 0.521780052354481


# SMOTE (Synthetic Minority Oversampling Technique)

In [36]:
from imblearn.over_sampling import SMOTE

In [37]:
sm = SMOTE(random_state=27)
X_train, y_train = sm.fit_sample(X_train, y_train)

In [38]:
smote = LogisticRegression(solver='liblinear').fit(X_train, y_train)

In [39]:
smote_pred = smote.predict(X_test)
print_metrics(y_test, smote_pred)

accuracy_score: 0.5380774765778731
f1_score: 0.6993826334601196
recall_score: 0.5381187681990647
