In [28]:
# import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV


In [9]:
# Load data
train_3a4 = pd.read_csv('cyp3a4_inhibitor_train_desc.csv')
test_3a4 = pd.read_csv('cyp3a4_inhibitor_test_desc.csv')
val_3a4 = pd.read_csv('cyp3a4_inhibitor_val_desc.csv')

# Remove columns that contain any non-numeric values
train_3a4 = train_3a4.apply(pd.to_numeric, errors='coerce')  # convert all to numeric, set invalid strings to NaN
train_3a4 = train_3a4.dropna(axis=1)  # drop columns with any NaNs

test_3a4 = test_3a4.apply(pd.to_numeric, errors='coerce')  # convert all to numeric, set invalid strings to NaN
test_3a4 = test_3a4.dropna(axis=1)  # drop columns with any NaNs

  train_3a4 = pd.read_csv('cyp3a4_inhibitor_train_desc.csv')
  test_3a4 = pd.read_csv('cyp3a4_inhibitor_test_desc.csv')
  val_3a4 = pd.read_csv('cyp3a4_inhibitor_val_desc.csv')


In [10]:
# Split data into features and labels
X_train = train_3a4.drop(columns=['label'])
y_train = train_3a4['label']

X_test = test_3a4.drop(columns=['label'])
y_test = test_3a4['label']

In [None]:
# Random Forest Classifier
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_model = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(
    rf_model, param_distributions=param_dist,
    n_iter=50, cv=3, verbose=2, n_jobs=-1, scoring='accuracy'
)

random_search.fit(X_train, y_train)
print("Best Params:", random_search.best_params_)


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  10.4s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  10.7s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   5.8s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   9.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   6.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   5.7s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time=  27.5s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total tim

In [None]:
# Build model based on best parameters
best_params = random_search.best_params_

best_rf_model = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    bootstrap=best_params['bootstrap'],
    random_state=42  
)

best_rf_model.fit(X_train, y_train)
best_rf_y_pred = best_rf_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, best_rf_y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, best_rf_y_pred))
print("\nClassification report:\n", classification_report(y_test, best_rf_y_pred))

Accuracy: 0.8097095795405288

Confusion matrix:
 [[1201  183]
 [ 256  667]]

Classification report:
               precision    recall  f1-score   support

         0.0       0.82      0.87      0.85      1384
         1.0       0.78      0.72      0.75       923

    accuracy                           0.81      2307
   macro avg       0.80      0.80      0.80      2307
weighted avg       0.81      0.81      0.81      2307



RandomForestClassifiers had the highest accuracy so tuned the model.
There was not a dramatical increase, could be reaching performance ceiling.


In [26]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, lr_y_pred))
print("\nConfusion matrix:", confusion_matrix(y_test, lr_y_pred))
print("\nClassification report:", classification_report(y_test, lr_y_pred))

Accuracy: 0.681837884698743

Confusion matrix: [[1169  215]
 [ 519  404]]

Classification report:               precision    recall  f1-score   support

         0.0       0.69      0.84      0.76      1384
         1.0       0.65      0.44      0.52       923

    accuracy                           0.68      2307
   macro avg       0.67      0.64      0.64      2307
weighted avg       0.68      0.68      0.67      2307



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# knn classifier
knn_model = KNeighborsClassifier(n_neighbors=5, weights='distance')  # You can tune this number

# Fit model
knn_model.fit(X_train, y_train)

# Predict
knn_y_pred = knn_model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, knn_y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, knn_y_pred))
print("\nClassification Report:\n", classification_report(y_test, knn_y_pred))

Accuracy: 0.7260511486779367

Confusion Matrix:
 [[1106  278]
 [ 354  569]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.76      0.80      0.78      1384
         1.0       0.67      0.62      0.64       923

    accuracy                           0.73      2307
   macro avg       0.71      0.71      0.71      2307
weighted avg       0.72      0.73      0.72      2307

