In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_csv("datasets/target_molecules_clean.csv")
df.head()

Unnamed: 0,pIC50,ECFP4_0,ECFP4_1,ECFP4_2,ECFP4_3,ECFP4_4,ECFP4_5,ECFP4_6,ECFP4_7,ECFP4_8,...,ECFP4_2038,ECFP4_2039,ECFP4_2040,ECFP4_2041,ECFP4_2042,ECFP4_2043,ECFP4_2044,ECFP4_2045,ECFP4_2046,ECFP4_2047
0,-0.30103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.30103,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,-0.30103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.531653,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,4.337242,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Binning pIC50 values into active (1) and inactive (0) classes
threshold = 5.0
df['activity_class'] = (df['pIC50'] >= threshold).astype(int)

# Data Splitting Strategy
train_data, temp_data = train_test_split(df, test_size=0.3, stratify=df['activity_class'], random_state=42)
valid_data, test_data = train_test_split(temp_data, test_size=0.5, stratify=temp_data['activity_class'], random_state=42)

X_train_class = train_data.iloc[:, 1:-2]  # Exclude the pIC50 and activity_class columns
y_train_class = train_data['activity_class']

X_valid_class = valid_data.iloc[:, 1:-2]
y_valid_class = valid_data['activity_class']

df.head()



Unnamed: 0,pIC50,ECFP4_0,ECFP4_1,ECFP4_2,ECFP4_3,ECFP4_4,ECFP4_5,ECFP4_6,ECFP4_7,ECFP4_8,...,ECFP4_2039,ECFP4_2040,ECFP4_2041,ECFP4_2042,ECFP4_2043,ECFP4_2044,ECFP4_2045,ECFP4_2046,ECFP4_2047,activity_class
0,-0.30103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.30103,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,-0.30103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.531653,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,4.337242,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Initialize the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")

# Train the classifier
xgb_classifier.fit(X_train_class, y_train_class)

# Predict on the validation set
y_pred_class_xgb = xgb_classifier.predict(X_valid_class)

# Calculate performance metrics
accuracy_xgb = accuracy_score(y_valid_class, y_pred_class_xgb)
classification_rep_xgb = classification_report(y_valid_class, y_pred_class_xgb)

# Convert the classification report to a dictionary
report_dict_xgb = classification_report(y_valid_class, y_pred_class_xgb, output_dict=True)

# Convert the dictionary to a pandas DataFrame
report_df_xgb = pd.DataFrame(report_dict_xgb).transpose()

accuracy_xgb, report_df_xgb



(0.8247863247863247,
               precision    recall  f1-score     support
 0              0.842391  0.928144  0.883191  167.000000
 1              0.760000  0.567164  0.649573   67.000000
 accuracy       0.824786  0.824786  0.824786    0.824786
 macro avg      0.801196  0.747654  0.766382  234.000000
 weighted avg   0.818801  0.824786  0.816300  234.000000)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter grid
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 6, 7, 8, 10],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'n_estimators': [50, 100, 150, 200],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5]
}

# Initialize XGBoost classifier
xgb_classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")

# Randomized search
random_search = RandomizedSearchCV(
    xgb_classifier,
    param_distributions=param_dist,
    scoring='accuracy',
    n_iter=10,  # Number of parameter settings that are sampled
    cv=3,  # 3-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1  # Use all CPU cores
)

# Fit the model
random_search.fit(X_train_class, y_train_class)

# Get the best parameters
best_params = random_search.best_params_

best_params


In [14]:
# Training the XGBoost model with the optimized hyperparameters
xgb_optimized = xgb.XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric="logloss")
xgb_optimized.fit(X_train_class, y_train_class)

# Predicting on the validation set
y_pred_class_optimized = xgb_optimized.predict(X_valid_class)

# Calculating performance metrics
accuracy_xgb_optimized = accuracy_score(y_valid_class, y_pred_class_optimized)
classification_rep_xgb_optimized = classification_report(y_valid_class, y_pred_class_optimized)

# Convert the classification report to a dictionary
report_dict_xgb_optimized = classification_report(y_valid_class, y_pred_class_optimized, output_dict=True)

# Convert the dictionary to a pandas DataFrame
report_df_xgb_optimized = pd.DataFrame(report_dict_xgb_optimized).transpose()

accuracy_xgb_optimized, report_df_xgb_optimized




(0.8333333333333334,
               precision    recall  f1-score     support
 0              0.844086  0.940120  0.889518  167.000000
 1              0.791667  0.567164  0.660870   67.000000
 accuracy       0.833333  0.833333  0.833333    0.833333
 macro avg      0.817876  0.753642  0.775194  234.000000
 weighted avg   0.829077  0.833333  0.824051  234.000000)