In [1]:
## import packages and data 
import pandas as pd
class_data = pd.read_csv('classification_model_data.csv')

In [2]:
class_data

Unnamed: 0.1,Unnamed: 0,naAromAtom,nAtom,nN,nO,nS,nF,BCUTw.1h,BCUTp.1l,nBondsD,...,R_TpiPCTPC,n5Ring,n6Ring,nF9Ring,nF10Ring,nFG12Ring,n6HeteroRing,VE3_D,VR1_D,class_label
0,1,18,67,1,1,0,0,15.994923,6.656434,0,...,6.373533,0,5,0,1,0,1,-17.743103,432.667502,1
1,2,24,95,3,9,0,2,18.999422,4.235250,5,...,5.225156,0,4,0,0,0,0,-11.027942,676.229552,1
2,3,18,101,3,9,0,2,18.999422,4.235268,5,...,4.295698,0,4,0,0,0,0,-11.027942,676.229552,1
3,4,18,101,3,9,0,2,18.999422,4.235268,5,...,4.295698,0,4,0,0,0,0,-11.027942,676.229552,2
4,5,10,96,3,9,0,0,16.003008,4.686416,5,...,3.907792,0,3,0,1,0,0,-22.814477,664.521515,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6536,6537,18,43,5,3,0,0,34.970157,5.262839,2,...,6.528826,1,4,0,2,1,3,-3.209592,324.625997,1
6537,6538,18,56,4,3,0,0,15.996941,5.125397,2,...,9.709851,1,5,0,2,1,4,-6.752213,738.417527,1
6538,6539,18,56,4,3,0,0,15.996941,5.125373,2,...,9.709851,1,5,0,2,1,4,-6.752213,738.417527,1
6539,6540,24,53,5,2,0,1,18.998409,4.119742,1,...,12.045330,1,5,0,2,1,4,-7.820949,565.660388,1


In [3]:
class_data.drop(columns='Unnamed: 0', inplace=True)

In [4]:
class_data['class_label'].replace({1:0,2:1}, inplace=True)

In [5]:
X = class_data.drop(columns='class_label')
y = class_data['class_label']

In [6]:
y.value_counts()

0    5590
1     951
Name: class_label, dtype: int64

In [7]:
## Split data into train and test sets. 
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,stratify=y, random_state=44)

## Random Forest Classifier - Classification

In [8]:
## First run a random forest model.
from sklearn.ensemble import RandomForestClassifier

In [9]:
forest = RandomForestClassifier(n_estimators=150, random_state=42)

In [10]:
forest.fit(X_train, y_train)

RandomForestClassifier(n_estimators=150, random_state=42)

In [11]:
predictions = forest.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error

In [13]:
accuracy_score(y_test, predictions)

0.9284841075794621

In [14]:
### The random Forest achieves a 93% accuracy score on the classification dataset. 
### Issues: this is only a binary classification, with severe class imbalance.

In [15]:
confusion_matrix(y_test, predictions)

array([[1372,   26],
       [  91,  147]], dtype=int64)

In [16]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1398
           1       0.85      0.62      0.72       238

    accuracy                           0.93      1636
   macro avg       0.89      0.80      0.84      1636
weighted avg       0.92      0.93      0.92      1636



In [17]:
## Classification report shows a better diagnosis of the model. By analyzing the recall score, the recall score of the 
## minority class is concerning. The model was only able to recognize 62% of inactive compunds. To remedy this issue, lets
## implemenet SMOTE analysis to generate sample of the minority class. 

In [18]:
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train,y_train)

In [19]:
forest_2 = RandomForestClassifier(n_estimators=150, random_state=42)
forest_2.fit(X_train_res, y_train_res)

RandomForestClassifier(n_estimators=150, random_state=42)

In [20]:
y_train_res.value_counts() ## New dataset is now balanced.

1    4192
0    4192
Name: class_label, dtype: int64

In [21]:
predictions = forest_2.predict(X_test)

In [22]:
confusion_matrix(y_test, predictions)

array([[1340,   58],
       [  71,  167]], dtype=int64)

In [23]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1398
           1       0.74      0.70      0.72       238

    accuracy                           0.92      1636
   macro avg       0.85      0.83      0.84      1636
weighted avg       0.92      0.92      0.92      1636



In [24]:
### Even though the model has slightly lower precision, the recall for the minority class is much improved, which means
### this model was betetr at recognizing the minorty label. 

## Logistic Regression - Classification

In [25]:
## next model is the logistic regression.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42)
lr.fit(X_train,y_train)

LogisticRegression(random_state=42)

In [26]:
predictions = lr.predict(X_test)

In [27]:
accuracy_score(y_test, predictions)

0.8545232273838631

In [28]:
## The logistic regression achieved an accuracy score of 85%

In [29]:
confusion_matrix(y_test, predictions)

array([[1397,    1],
       [ 237,    1]], dtype=int64)

In [30]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92      1398
           1       0.50      0.00      0.01       238

    accuracy                           0.85      1636
   macro avg       0.68      0.50      0.46      1636
weighted avg       0.80      0.85      0.79      1636



In [31]:
## now implement SMOTE analysis.

In [32]:
lr_2 = LogisticRegression(random_state=42)
lr_2.fit(X_train_res,y_train_res)

LogisticRegression(random_state=42)

In [33]:
predictions = lr_2.predict(X_test)

In [34]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1398
           1       0.39      0.16      0.23       238

    accuracy                           0.84      1636
   macro avg       0.63      0.56      0.57      1636
weighted avg       0.80      0.84      0.81      1636



In [35]:
### We still see very poor performance using the Logistic Regression model. The minority class 

In [36]:
confusion_matrix(y_test, predictions)

array([[1338,   60],
       [ 200,   38]], dtype=int64)

## Support Vector Machines - Classification

In [37]:
## next model: Support Vector Machines 
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
clf = make_pipeline(StandardScaler(), SVC(kernel='poly', random_state=42))
clf.fit(X_train,y_train)
predictions = clf.predict(X_test)

In [38]:
accuracy_score(y_test, predictions)

0.9083129584352079

In [39]:
## The support Vector Machine achieved an accuracy score of 91%. 

In [40]:
confusion_matrix(y_test, predictions)

array([[1378,   20],
       [ 130,  108]], dtype=int64)

In [41]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      1398
           1       0.84      0.45      0.59       238

    accuracy                           0.91      1636
   macro avg       0.88      0.72      0.77      1636
weighted avg       0.90      0.91      0.90      1636



In [42]:
## Still very poor scores for the minority class using the Support Vector Machine. lets rety using a complete dataset.

In [43]:
clf_2 = make_pipeline(StandardScaler(), SVC(kernel='poly', random_state=42))
clf_2.fit(X_train_res,y_train_res)
predictions = clf_2.predict(X_test)

In [44]:
confusion_matrix(y_test, predictions)

array([[1287,  111],
       [  83,  155]], dtype=int64)

In [45]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      1398
           1       0.58      0.65      0.62       238

    accuracy                           0.88      1636
   macro avg       0.76      0.79      0.77      1636
weighted avg       0.89      0.88      0.88      1636



In [46]:
## We see whne using a complete dataset, teh model is better able to recognize the minority class without loosing too much 
## informaton from the majority class. But the low precision still shows that the model may be making too many false
## predictions. 

In [47]:
## Final model: Feed Forward Neural Network 

## Feed Forward Neural Network - Classification

In [48]:
### Import model packages 
from keras.models import Sequential
from keras.layers import Dense

In [49]:
### Creat model. 
def deep_learning_model():
    model = Sequential()
    model.add(Dense(120, input_dim=32, activation='relu'))
    model.add(Dense(60, activation='relu'))
    model.add(Dense(40, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    ## Compile model
    model.compile(loss='binary_crossentropy',
                 optimizer='adam')
    return model

In [50]:
test = deep_learning_model()
test.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 120)               3960      
_________________________________________________________________
dense_1 (Dense)              (None, 60)                7260      
_________________________________________________________________
dense_2 (Dense)              (None, 40)                2440      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 41        
Total params: 13,701
Trainable params: 13,701
Non-trainable params: 0
_________________________________________________________________


In [51]:
X_train.shape

(4905, 32)

In [52]:
test.fit(X_train, y_train, epochs=100, verbose=1)

<tensorflow.python.keras.callbacks.History at 0x1cd48d2b5f8>

In [53]:
predictions = test.predict(X_test)

In [54]:
from sklearn.metrics import recall_score, confusion_matrix, balanced_accuracy_score
recall = recall_score(y_test, predictions.round())
recall

0.4117647058823529

In [55]:
matrix = confusion_matrix(y_test, predictions.round())
matrix

array([[1358,   40],
       [ 140,   98]], dtype=int64)

In [56]:
score = balanced_accuracy_score(y_test, predictions.round())
score

0.691576201295969

In [59]:
from sklearn.metrics import classification_report
class_report = classification_report(y_test, predictions.round())
print(class_report)

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      1398
           1       0.71      0.41      0.52       238

    accuracy                           0.89      1636
   macro avg       0.81      0.69      0.73      1636
weighted avg       0.88      0.89      0.88      1636

