In [None]:
# Mount Google Drive so the notebook can reach files stored under /content/drive/
import os
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, auc, confusion_matrix

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression


%matplotlib inline

In [None]:
# Read the feature table from Google Drive into a DataFrame
dataset_path = '/content/drive/MyDrive/Colab Notebooks/PPI/features_training1/features_1_20.csv'
data = pd.read_csv(dataset_path)

# Report the (rows, columns) dimensions of the table
print(data.shape)

# Preview the first 5 rows
data.head()

(13440, 9)


Unnamed: 0,rmsd_l,rmsd_i,dfnat,dbsa,dnonb_e,dnonb_water,dcom_distance,dhbnum,quality
0,0.524341,0.218659,0.22,-3.861,351.554,-911.7,0.146,-5.0,native
1,0.508332,0.21299,0.25,-2.923,238.509,-1408.8,0.158,-4.0,native
2,0.5378,0.244115,0.14,-3.347,240.279,-2387.5,0.201,-3.0,native
3,0.694089,0.298283,0.17,-4.86,362.27,-2842.3,0.184,-5.0,native
4,0.610487,0.296998,0.16,-5.619,349.431,-2135.9,0.226,-4.0,native


In [None]:
# Encode the target column as integers: 'native' -> 1, 'non-native' -> 0.
# The guard keeps this cell idempotent: applying .map() a second time to the
# already-encoded column would silently turn every label into NaN.
quality_codes = {'native': 1, 'non-native': 0}
if not data['quality'].isin(list(quality_codes.values())).all():
    encoded_quality = data['quality'].map(quality_codes)
    # .map() yields NaN for anything outside the mapping; fail loudly instead of
    # carrying NaN labels into the train/test split.
    if encoded_quality.isna().any():
        unknown_labels = data.loc[encoded_quality.isna(), 'quality'].unique()
        raise ValueError('Unexpected quality labels: %s' % list(unknown_labels))
    data['quality'] = encoded_quality.astype('int64')
data.head()

Unnamed: 0,rmsd_l,rmsd_i,dfnat,dbsa,dnonb_e,dnonb_water,dcom_distance,dhbnum,quality
0,0.524341,0.218659,0.22,-3.861,351.554,-911.7,0.146,-5.0,1
1,0.508332,0.21299,0.25,-2.923,238.509,-1408.8,0.158,-4.0,1
2,0.5378,0.244115,0.14,-3.347,240.279,-2387.5,0.201,-3.0,1
3,0.694089,0.298283,0.17,-4.86,362.27,-2842.3,0.184,-5.0,1
4,0.610487,0.296998,0.16,-5.619,349.431,-2135.9,0.226,-4.0,1


In [None]:
# Separate the target column from the feature columns
y = data['quality']
X = data.drop(columns=['quality'])
print(X.head())

     rmsd_l    rmsd_i  dfnat   dbsa  dnonb_e  dnonb_water  dcom_distance  \
0  0.524341  0.218659   0.22 -3.861  351.554       -911.7          0.146   
1  0.508332  0.212990   0.25 -2.923  238.509      -1408.8          0.158   
2  0.537800  0.244115   0.14 -3.347  240.279      -2387.5          0.201   
3  0.694089  0.298283   0.17 -4.860  362.270      -2842.3          0.184   
4  0.610487  0.296998   0.16 -5.619  349.431      -2135.9          0.226   

   dhbnum  
0    -5.0  
1    -4.0  
2    -3.0  
3    -5.0  
4    -4.0  


In [None]:
# Hold out 20% of the rows as a test set, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Show the (rows, columns) shape of the training and test feature tables
(X_train.shape, X_test.shape)

((10752, 8), (2688, 8))

In [None]:
# Rescale the features with a MinMaxScaler that is fitted on the training split
# only; the same fitted scaler is then applied to the test split.
scaler = MinMaxScaler()
scaler.fit(X_train)
Xtrain_scaled = scaler.transform(X_train)
Xtest_scaled = scaler.transform(X_test)
# NOTE(review): the model cells in this notebook fit on X_train / X_test, not on
# these scaled arrays — confirm that is intended.
Xtrain_scaled

array([[0.04918485, 0.03137249, 0.35211268, ..., 0.65746267, 0.23186683,
        0.30769231],
       [0.04150006, 0.03969497, 0.16901408, ..., 0.54316789, 0.19381688,
        0.42307692],
       [0.03157514, 0.01300768, 0.57746479, ..., 0.57910809, 0.20293302,
        0.5       ],
       ...,
       [0.09153166, 0.01707601, 0.15492958, ..., 0.74852769, 0.32183908,
        0.34615385],
       [0.0533069 , 0.02011719, 0.5915493 , ..., 0.62177034, 0.19678954,
        0.5       ],
       [0.05850623, 0.00535326, 0.53521127, ..., 0.5459043 , 0.23285771,
        0.42307692]])

In [None]:
# Display the test features after transforming them with the scaler fitted on the training split
Xtest_scaled

array([[0.03494326, 0.02490021, 0.36619718, ..., 0.57024449, 0.21581451,
        0.46153846],
       [0.01056681, 0.01245147, 0.56338028, ..., 0.58052587, 0.19579865,
        0.34615385],
       [0.04730248, 0.06291707, 0.01408451, ..., 0.66490849, 0.18034086,
        0.38461538],
       ...,
       [0.0200631 , 0.0253438 , 0.53521127, ..., 0.63577958, 0.20253666,
        0.53846154],
       [0.02618038, 0.01575285, 0.21126761, ..., 0.54246396, 0.21581451,
        0.15384615],
       [0.0410879 , 0.02621538, 0.1971831 , ..., 0.47390494, 0.20868014,
        0.26923077]])

**1. RANDOM FOREST**

In [None]:
# Hyperparameter candidates for the Random Forest grid search.
# 'auto' is intentionally not listed under max_features: for classifiers it was
# an alias of 'sqrt' (a duplicate candidate) and it was removed in
# scikit-learn 1.3, where it makes every fit with that candidate fail.
param_grid = {
    'n_estimators': [500, 850, 1100, 1500],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Create RandomForestClassifier (seeded like the other models in this notebook
# so that repeated runs give the same forest)
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search, selecting on accuracy, using all CPU cores
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Best configuration, refitted on the whole training split
best_rf = grid_search.best_estimator_

# Hard predictions and positive-class probabilities for both splits
y_train_pred = best_rf.predict(X_train)
y_test_pred = best_rf.predict(X_test)
rf_train_proba = best_rf.predict_proba(X_train)[:, 1]
rf_test_proba = best_rf.predict_proba(X_test)[:, 1]

# NOTE(review): F1 is averaged with average='weighted' while precision and
# recall use their default averaging — confirm this mix is intended.

# Training set performance
rf_train_accuracy = accuracy_score(y_train, y_train_pred)
rf_train_precision = precision_score(y_train, y_train_pred)
rf_train_recall = recall_score(y_train, y_train_pred)
rf_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
rf_train_auc = roc_auc_score(y_train, rf_train_proba)
rf_train_mcc = matthews_corrcoef(y_train, y_train_pred)

# Test set performance
rf_test_accuracy = accuracy_score(y_test, y_test_pred)
rf_test_precision = precision_score(y_test, y_test_pred)
rf_test_recall = recall_score(y_test, y_test_pred)
rf_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
rf_test_auc = roc_auc_score(y_test, rf_test_proba)
rf_test_mcc = matthews_corrcoef(y_test, y_test_pred)

# Print the best hyperparameters
print('Best hyperparameters:', grid_search.best_params_)

# Print model performance
print('Model performance for Training set')
print('- Accuracy: %s' % rf_train_accuracy)
print('- Precision: %s' % rf_train_precision)
print('- Recall: %s' % rf_train_recall)
print('- F1 score: %s' % rf_train_f1)
print('- AUC: %s' % rf_train_auc)
print('- MCC: %s' % rf_train_mcc)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % rf_test_accuracy)
print('- Precision: %s' % rf_test_precision)
print('- Recall: %s' % rf_test_recall)
print('- F1 score: %s' % rf_test_f1)
print('- AUC: %s' % rf_test_auc)
print('- MCC: %s' % rf_test_mcc)

In [None]:
# Metric names shown along the x-axis, in the same order as the value lists below
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 score', 'AUC', 'MCC']

# Random Forest scores on the training split
train_values = [
    rf_train_accuracy, rf_train_precision, rf_train_recall,
    rf_train_f1, rf_train_auc, rf_train_mcc,
]

# Random Forest scores on the test split
test_values = [
    rf_test_accuracy, rf_test_precision, rf_test_recall,
    rf_test_f1, rf_test_auc, rf_test_mcc,
]

# Grouped bar chart: one pair of bars (train, test) per metric
bar_width = 0.35
index = np.arange(len(metrics))

fig, ax = plt.subplots()
bar1 = ax.bar(index, train_values, bar_width, label='Training Set')
bar2 = ax.bar(index + bar_width, test_values, bar_width, label='Test Set')

# Title, axis labels, tick labels centred between each pair of bars, and legend
ax.set_title('Model Performance for Different Metrics')
ax.set_xlabel('Performance Metrics')
ax.set_ylabel('Performance Scores')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(metrics)
ax.legend()


def autolabel(bars):
    """Write each bar's height (4 decimals) just above the bar on the current `ax`."""
    for rect in bars:
        value = rect.get_height()
        x_center = rect.get_x() + rect.get_width() / 2
        ax.annotate(
            '%.4f' % value,
            xy=(x_center, value),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
        )


for bar_group in (bar1, bar2):
    autolabel(bar_group)

plt.show()

In [None]:
def plot_confusion_matrix(ax, y_true, y_pred, classes, title='Confusion Matrix - Random Forest'):
    """Draw the confusion matrix of y_true vs. y_pred on `ax` as an annotated heat map.

    `classes` supplies the tick labels and the number of rows/columns annotated.
    NOTE(review): the matrix is indexed by position, so this assumes `classes`
    lists every label occurring in y_true / y_pred in sorted order — confirm.
    """
    matrix = confusion_matrix(y_true, y_pred)

    heatmap = ax.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    ax.set_title(title)
    ax.figure.colorbar(heatmap, ax=ax)

    # One tick per class on both axes
    positions = range(len(classes))
    ax.set_xticks(positions)
    ax.set_xticklabels(classes, rotation=45)
    ax.set_yticks(positions)
    ax.set_yticklabels(classes)

    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

    # Write the count in the centre of every cell (row = true, column = predicted)
    for row in positions:
        for col in positions:
            ax.text(col, row, str(matrix[row, col]), ha='center', va='center')


def plot_roc_curve(ax, y_true, y_score, title='ROC Curve - Random Forest'):
    """Draw the ROC curve for the scores `y_score` on `ax`, with the AUC in the legend."""
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
    roc_area = auc(false_pos_rate, true_pos_rate)

    ax.plot(false_pos_rate, true_pos_rate, color='darkblue', lw=2, label=f'AUC = {roc_area:.2f}')
    # Dashed diagonal as a visual reference line
    ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend(loc='lower right')


# Side-by-side panels: confusion matrix (left) and ROC curve (right)
fig, axs = plt.subplots(1, 2, figsize=(10, 4))

plot_confusion_matrix(axs[0], y_test, y_test_pred, classes=[0, 1], title='Confusion Matrix - Random Forest')
plot_roc_curve(axs[1], y_test, best_rf.predict_proba(X_test)[:, 1], title='ROC Curve - Random Forest')

plt.tight_layout()
plt.show()

**2. GRADIENT BOOSTING**

In [None]:
# Hyperparameter candidates for the Gradient Boosting grid search
param_grid = {
    'max_depth': [7, 10, 16],
    'n_estimators': [400, 700],
    'learning_rate': [0.2]
}

# Create GradientBoostingClassifier
gbos = GradientBoostingClassifier(random_state=42)

# 5-fold cross-validated grid search, selecting on accuracy, using all CPU cores
grid_search = GridSearchCV(gbos, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Best configuration, refitted on the whole training split
best_gb = grid_search.best_estimator_

# Hard predictions and positive-class probabilities for both splits.
# The probabilities must come from best_gb: scoring them with the Random Forest
# model would report the Random Forest's AUC under the Gradient Boosting name.
y_train_pred = best_gb.predict(X_train)
y_test_pred = best_gb.predict(X_test)
gb_train_proba = best_gb.predict_proba(X_train)[:, 1]
gb_test_proba = best_gb.predict_proba(X_test)[:, 1]

# Training set performance
gb_train_accuracy = accuracy_score(y_train, y_train_pred)
gb_train_precision = precision_score(y_train, y_train_pred)
gb_train_recall = recall_score(y_train, y_train_pred)
gb_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
gb_train_auc = roc_auc_score(y_train, gb_train_proba)
gb_train_mcc = matthews_corrcoef(y_train, y_train_pred)

# Test set performance
gb_test_accuracy = accuracy_score(y_test, y_test_pred)
gb_test_precision = precision_score(y_test, y_test_pred)
gb_test_recall = recall_score(y_test, y_test_pred)
gb_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
gb_test_auc = roc_auc_score(y_test, gb_test_proba)
gb_test_mcc = matthews_corrcoef(y_test, y_test_pred)


# Print the best hyperparameters
print('Best hyperparameters:', grid_search.best_params_)

# Print model performance
print('Model performance for Training set')
print('- Accuracy: %s' % gb_train_accuracy)
print('- Precision: %s' % gb_train_precision)
print('- Recall: %s' % gb_train_recall)
print('- F1 score: %s' % gb_train_f1)
print('- AUC: %s' % gb_train_auc)
print('- MCC: %s' % gb_train_mcc)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % gb_test_accuracy)
print('- Precision: %s' % gb_test_precision)
print('- Recall: %s' % gb_test_recall)
print('- F1 score: %s' % gb_test_f1)
print('- AUC: %s' % gb_test_auc)
print('- MCC: %s' % gb_test_mcc)

In [None]:
# plot_confusion_matrix() and plot_roc_curve() are defined in the Random Forest
# evaluation cell. They are reused here instead of being redefined: the copies
# differed only in their default title, and every call passes its title explicitly.

# Side-by-side panels: confusion matrix (left) and ROC curve (right)
fig, axs = plt.subplots(1, 2, figsize=(10, 4))

# Confusion matrix of the Gradient Boosting test predictions
plot_confusion_matrix(axs[0], y_test, y_test_pred, classes=[0, 1], title='Confusion Matrix - Gradient Boosting')

# ROC curve from the positive-class probabilities on the test split
plot_roc_curve(axs[1], y_test, best_gb.predict_proba(X_test)[:, 1], title='ROC Curve - Gradient Boosting')

plt.tight_layout()
plt.show()

**3. XGBoost (Extreme Gradient Boosting)**

In [None]:
# Hyperparameter candidates for the XGBoost grid search
param_grid = {
    'max_depth': [7, 10, 16],
    'n_estimators': [300, 800],
    'learning_rate': [0.2, 0.3]
}

# Create XGBClassifier
xgboost_model = XGBClassifier(random_state=42)

# 5-fold cross-validated grid search, selecting on accuracy, using all CPU cores
grid_search = GridSearchCV(xgboost_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Best configuration, refitted on the whole training split
best_xgb = grid_search.best_estimator_

# Hard predictions and positive-class probabilities for both splits.
# The probabilities must come from best_xgb: scoring them with the Random Forest
# model would report the Random Forest's AUC under the XGBoost name.
y_train_pred = best_xgb.predict(X_train)
y_test_pred = best_xgb.predict(X_test)
xgb_train_proba = best_xgb.predict_proba(X_train)[:, 1]
xgb_test_proba = best_xgb.predict_proba(X_test)[:, 1]

# Training set performance
xgb_train_accuracy = accuracy_score(y_train, y_train_pred)
xgb_train_precision = precision_score(y_train, y_train_pred)
xgb_train_recall = recall_score(y_train, y_train_pred)
xgb_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
xgb_train_auc = roc_auc_score(y_train, xgb_train_proba)
xgb_train_mcc = matthews_corrcoef(y_train, y_train_pred)

# Test set performance
xgb_test_accuracy = accuracy_score(y_test, y_test_pred)
xgb_test_precision = precision_score(y_test, y_test_pred)
xgb_test_recall = recall_score(y_test, y_test_pred)
xgb_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
xgb_test_auc = roc_auc_score(y_test, xgb_test_proba)
xgb_test_mcc = matthews_corrcoef(y_test, y_test_pred)


# Print the best hyperparameters
print('Best hyperparameters:', grid_search.best_params_)

# Print model performance
print('Model performance for Training set')
print('- Accuracy: %s' % xgb_train_accuracy)
print('- Precision: %s' % xgb_train_precision)
print('- Recall: %s' % xgb_train_recall)
print('- F1 score: %s' % xgb_train_f1)
print('- AUC: %s' % xgb_train_auc)
print('- MCC: %s' % xgb_train_mcc)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % xgb_test_accuracy)
print('- Precision: %s' % xgb_test_precision)
print('- Recall: %s' % xgb_test_recall)
print('- F1 score: %s' % xgb_test_f1)
print('- AUC: %s' % xgb_test_auc)
print('- MCC: %s' % xgb_test_mcc)


In [None]:
# plot_confusion_matrix() and plot_roc_curve() are defined in the Random Forest
# evaluation cell. They are reused here instead of being redefined: the copies
# differed only in their default title, and every call passes its title explicitly.

# Side-by-side panels: confusion matrix (left) and ROC curve (right)
fig, axs = plt.subplots(1, 2, figsize=(10, 4))

# Confusion matrix of the XGBoost test predictions
plot_confusion_matrix(axs[0], y_test, y_test_pred, classes=[0, 1], title='Confusion Matrix - XGBoost (Extreme Gradient Boosting)')

# ROC curve from the positive-class probabilities on the test split
plot_roc_curve(axs[1], y_test, best_xgb.predict_proba(X_test)[:, 1], title='ROC Curve - XGBoost (Extreme Gradient Boosting)')

plt.tight_layout()
plt.show()

**4. LIGHT GBM**

In [None]:
# Hyperparameter candidates explored by the LightGBM grid search
param_grid = {
    'max_depth': [12,14],
    'n_estimators': [1200, 1600],
    'learning_rate': [0.2]
}

# Seeded LightGBM classifier used as the search template
lgbm_model = LGBMClassifier(random_state=42)

# 5-fold cross-validated grid search, selecting on accuracy, using all CPU cores
grid_search = GridSearchCV(
    estimator=lgbm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best configuration, refitted on the whole training split
best_lgbm = grid_search.best_estimator_

# Hard predictions and positive-class probabilities for both splits
y_train_pred = best_lgbm.predict(X_train)
y_test_pred = best_lgbm.predict(X_test)
lgbm_train_proba = best_lgbm.predict_proba(X_train)[:, 1]
lgbm_test_proba = best_lgbm.predict_proba(X_test)[:, 1]

# Scores, computed metric by metric for the training and test splits
lgbm_train_accuracy = accuracy_score(y_train, y_train_pred)
lgbm_test_accuracy = accuracy_score(y_test, y_test_pred)

lgbm_train_precision = precision_score(y_train, y_train_pred)
lgbm_test_precision = precision_score(y_test, y_test_pred)

lgbm_train_recall = recall_score(y_train, y_train_pred)
lgbm_test_recall = recall_score(y_test, y_test_pred)

lgbm_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
lgbm_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

lgbm_train_auc = roc_auc_score(y_train, lgbm_train_proba)
lgbm_test_auc = roc_auc_score(y_test, lgbm_test_proba)

lgbm_train_mcc = matthews_corrcoef(y_train, y_train_pred)
lgbm_test_mcc = matthews_corrcoef(y_test, y_test_pred)

# Report the winning hyperparameters
print('Best hyperparameters:', grid_search.best_params_)

# Report the scores: training split first, then the test split
train_report = [
    ('- Accuracy: %s', lgbm_train_accuracy),
    ('- Precision: %s', lgbm_train_precision),
    ('- Recall: %s', lgbm_train_recall),
    ('- F1 score: %s', lgbm_train_f1),
    ('- AUC: %s', lgbm_train_auc),
    ('- MCC: %s', lgbm_train_mcc),
]
test_report = [
    ('- Accuracy: %s', lgbm_test_accuracy),
    ('- Precision: %s', lgbm_test_precision),
    ('- Recall: %s', lgbm_test_recall),
    ('- F1 score: %s', lgbm_test_f1),
    ('- AUC: %s', lgbm_test_auc),
    ('- MCC: %s', lgbm_test_mcc),
]

print('Model performance for Training set')
for template, score in train_report:
    print(template % score)
print('----------------------------------')
print('Model performance for Test set')
for template, score in test_report:
    print(template % score)

In [None]:
# plot_confusion_matrix() and plot_roc_curve() are defined in the Random Forest
# evaluation cell. They are reused here instead of being redefined: the copies
# differed only in their default title, and every call passes its title explicitly.

# Side-by-side panels: confusion matrix (left) and ROC curve (right)
fig, axs = plt.subplots(1, 2, figsize=(10, 4))

# Confusion matrix of the LightGBM test predictions
plot_confusion_matrix(axs[0], y_test, y_test_pred, classes=[0, 1], title='Confusion Matrix - Light GBM')

# ROC curve from the positive-class probabilities on the test split
plot_roc_curve(axs[1], y_test, best_lgbm.predict_proba(X_test)[:, 1], title='ROC Curve - Light GBM')

plt.tight_layout()
plt.show()

**5. BUILD STACKED MODEL**

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

# Define base estimators
# NOTE(review): these are the estimator objects created before each grid search
# (rf, gbos, xgboost_model, lgbm_model), not the tuned best_* models, so the
# stack does not use the hyperparameters found above — confirm that is intended.
estimator_list = [
    ('Random Forest', rf),
    ('Gradient Boosting', gbos),
    ('XGBoost (Extreme Gradient Boosting)', xgboost_model),
    ('Light GBM', lgbm_model)
]

# Build stack model: the base estimators feed a logistic-regression final estimator
stack_model = StackingClassifier(
    estimators=estimator_list,
    final_estimator=LogisticRegression()
)

# Fit the stack model to the training data
stack_model.fit(X_train, y_train)

# Hard predictions and positive-class probabilities for both splits
y_train_pred = stack_model.predict(X_train)
y_test_pred = stack_model.predict(X_test)
stack_train_proba = stack_model.predict_proba(X_train)[:, 1]
stack_test_proba = stack_model.predict_proba(X_test)[:, 1]

# Training set model performance.
# The training AUC is computed on the training split (y_train / X_train);
# scoring it on the test split would just duplicate the test AUC.
stack_model_train_accuracy = accuracy_score(y_train, y_train_pred)
stack_model_train_precision = precision_score(y_train, y_train_pred)
stack_model_train_recall = recall_score(y_train, y_train_pred)
stack_model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
stack_model_train_auc = roc_auc_score(y_train, stack_train_proba)
stack_model_train_mcc = matthews_corrcoef(y_train, y_train_pred)

# Test set model performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred)
stack_model_test_precision = precision_score(y_test, y_test_pred)
stack_model_test_recall = recall_score(y_test, y_test_pred)
stack_model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
stack_model_test_auc = roc_auc_score(y_test, stack_test_proba)
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred)

# Display the model performance
print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- Precision: %s' % stack_model_train_precision)
print('- Recall: %s' % stack_model_train_recall)
print('- F1 score: %s' % stack_model_train_f1)
print('- AUC: %s' % stack_model_train_auc)
print('- MCC: %s' % stack_model_train_mcc)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- Precision: %s' % stack_model_test_precision)
print('- Recall: %s' % stack_model_test_recall)
print('- F1 score: %s' % stack_model_test_f1)
print('- AUC: %s' % stack_model_test_auc)
print('- MCC: %s' % stack_model_test_mcc)


In [None]:
# Persist the fitted stacking model to Google Drive
# NOTE(review): the '80_100' suffix differs from the dataset file ('features_1_20')
# and from the results CSV ('results_60_80') — confirm the intended file name.
stack_model_path = '/content/drive/MyDrive/Colab Notebooks/PPI/RESULT/stack_model_80_100.pkl'
joblib.dump(stack_model, stack_model_path)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

# stack_model is already fitted in the stacking cell above. It is deliberately
# NOT refitted here: a second fit would retrain every base estimator, so the
# plots would describe a different model from the one whose metrics were
# reported and which was saved to disk.
#
# plot_confusion_matrix() and plot_roc_curve() are defined in the Random Forest
# evaluation cell. They are reused here instead of being redefined: the copies
# differed only in their default title, and every call passes its title explicitly.

# Side-by-side panels: confusion matrix (left) and ROC curve (right)
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Confusion matrix of the stacked model's test predictions
plot_confusion_matrix(axs[0], y_test, stack_model.predict(X_test), classes=[0, 1], title='Confusion Matrix - Stack Model')

# ROC curve from the positive-class probabilities on the test split
plot_roc_curve(axs[1], y_test, stack_model.predict_proba(X_test)[:, 1], title='ROC Curve - Stack Model')

plt.tight_layout()
plt.show()

**RESULT**

In [None]:
# Collect every score into one dictionary per (metric, split), keyed by model
# name. Each dictionary holds only its own metric: the test-recall dictionary
# takes the recall values for all five models.

acc_train_list = {
    'Random Forest': rf_train_accuracy,
    'Gradient Boosting': gb_train_accuracy,
    'XGBoost (Extreme Gradient Boosting)': xgb_train_accuracy,
    'Light GBM': lgbm_train_accuracy,
    'stack': stack_model_train_accuracy,
}

acc_test_list = {
    'Random Forest': rf_test_accuracy,
    'Gradient Boosting': gb_test_accuracy,
    'XGBoost (Extreme Gradient Boosting)': xgb_test_accuracy,
    'Light GBM': lgbm_test_accuracy,
    'stack': stack_model_test_accuracy,
}

pre_train_list = {
    'Random Forest': rf_train_precision,
    'Gradient Boosting': gb_train_precision,
    'XGBoost (Extreme Gradient Boosting)': xgb_train_precision,
    'Light GBM': lgbm_train_precision,
    'stack': stack_model_train_precision,
}

pre_test_list = {
    'Random Forest': rf_test_precision,
    'Gradient Boosting': gb_test_precision,
    'XGBoost (Extreme Gradient Boosting)': xgb_test_precision,
    'Light GBM': lgbm_test_precision,
    'stack': stack_model_test_precision,
}

rec_train_list = {
    'Random Forest': rf_train_recall,
    'Gradient Boosting': gb_train_recall,
    'XGBoost (Extreme Gradient Boosting)': xgb_train_recall,
    'Light GBM': lgbm_train_recall,
    'stack': stack_model_train_recall,
}

rec_test_list = {
    'Random Forest': rf_test_recall,
    'Gradient Boosting': gb_test_recall,
    'XGBoost (Extreme Gradient Boosting)': xgb_test_recall,
    'Light GBM': lgbm_test_recall,
    'stack': stack_model_test_recall,
}

f1_train_list = {
    'Random Forest': rf_train_f1,
    'Gradient Boosting': gb_train_f1,
    'XGBoost (Extreme Gradient Boosting)': xgb_train_f1,
    'Light GBM': lgbm_train_f1,
    'stack': stack_model_train_f1,
}

f1_test_list = {
    'Random Forest': rf_test_f1,
    'Gradient Boosting': gb_test_f1,
    'XGBoost (Extreme Gradient Boosting)': xgb_test_f1,
    'Light GBM': lgbm_test_f1,
    'stack': stack_model_test_f1,
}

roc_train_list = {
    'Random Forest': rf_train_auc,
    'Gradient Boosting': gb_train_auc,
    'XGBoost (Extreme Gradient Boosting)': xgb_train_auc,
    'Light GBM': lgbm_train_auc,
    'stack': stack_model_train_auc,
}

roc_test_list = {
    'Random Forest': rf_test_auc,
    'Gradient Boosting': gb_test_auc,
    'XGBoost (Extreme Gradient Boosting)': xgb_test_auc,
    'Light GBM': lgbm_test_auc,
    'stack': stack_model_test_auc,
}

mcc_train_list = {
    'Random Forest': rf_train_mcc,
    'Gradient Boosting': gb_train_mcc,
    'XGBoost (Extreme Gradient Boosting)': xgb_train_mcc,
    'Light GBM': lgbm_train_mcc,
    'stack': stack_model_train_mcc,
}

mcc_test_list = {
    'Random Forest': rf_test_mcc,
    'Gradient Boosting': gb_test_mcc,
    'XGBoost (Extreme Gradient Boosting)': xgb_test_mcc,
    'Light GBM': lgbm_test_mcc,
    'stack': stack_model_test_mcc,
}



In [None]:
# Display the training-set MCC of each model (dictionary keyed by model name)
mcc_train_list

In [None]:
# Turn each {model name: score} dictionary into a one-column frame indexed by
# model name. Column labels follow a single '<Metric> <Split>' pattern with one
# space between the words.
acc_train_df = pd.DataFrame.from_dict(acc_train_list, orient='index', columns=['Accuracy Train'])
acc_test_df = pd.DataFrame.from_dict(acc_test_list, orient='index', columns=['Accuracy Test'])
pre_train_df = pd.DataFrame.from_dict(pre_train_list, orient='index', columns=['Precision Train'])
pre_test_df = pd.DataFrame.from_dict(pre_test_list, orient='index', columns=['Precision Test'])
rec_train_df = pd.DataFrame.from_dict(rec_train_list, orient='index', columns=['Recall Train'])
rec_test_df = pd.DataFrame.from_dict(rec_test_list, orient='index', columns=['Recall Test'])
f1_train_df = pd.DataFrame.from_dict(f1_train_list, orient='index', columns=['F1 Train'])
f1_test_df = pd.DataFrame.from_dict(f1_test_list, orient='index', columns=['F1 Test'])
roc_train_df = pd.DataFrame.from_dict(roc_train_list, orient='index', columns=['AUC Train'])
roc_test_df = pd.DataFrame.from_dict(roc_test_list, orient='index', columns=['AUC Test'])
mcc_train_df = pd.DataFrame.from_dict(mcc_train_list, orient='index', columns=['MCC Train'])
mcc_test_df = pd.DataFrame.from_dict(mcc_test_list, orient='index', columns=['MCC Test'])

# Join the columns side by side: one row per model, one column per metric and split
df = pd.concat(
    [
        acc_train_df, acc_test_df,
        pre_train_df, pre_test_df,
        rec_train_df, rec_test_df,
        f1_train_df, f1_test_df,
        roc_train_df, roc_test_df,
        mcc_train_df, mcc_test_df,
    ],
    axis=1,
)
df

In [None]:
# Write the results table to Google Drive as CSV
# NOTE(review): the '60_80' suffix differs from the dataset file ('features_1_20')
# and from the saved model ('stack_model_80_100') — confirm the intended file name.
results_path = '/content/drive/MyDrive/Colab Notebooks/PPI/RESULT/results_60_80.csv'
df.to_csv(results_path)

In [None]:
# Grouped bar chart of the results table: one group of bars per model, one bar per column
ax = df.plot.bar(figsize=(20, 12), width=0.8)

# Print each bar's value (2 decimals) vertically above the bar, with a small gap
for bar_group in ax.containers:
    ax.bar_label(
        bar_group,
        fmt='%.2f',
        label_type='edge',
        fontsize=8,
        color='black',
        rotation=90,
        padding=5,
    )

ax.set_title('Comparison of Metrics')
ax.set_xlabel('Models')
ax.set_ylabel('Metric Values')
plt.xticks(rotation=45, ha='right')  # slanted model names so they do not overlap
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))  # legend placed outside the axes

# Show the plot
plt.show()
