In [None]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, cohen_kappa_score, classification_report)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import joblib

# Load data
# NOTE(review): "your path" is a placeholder; point it at the training workbook.
train_file_path = "your path"
data = pd.read_excel(train_file_path)

# Define parameters and target
parameters = data.columns[:-1]  # All columns except the last one
target = data.columns[-1]        # The last column as target

# Step 1: Remove rows with NaN values.
# The target column is included in the check: a row with a missing label cannot
# be resampled or used for training, so it is dropped together with rows that
# have missing features.
initial_data_count = data.shape[0]
data = data.dropna(subset=[*parameters, target])
nan_removed_data_count = data.shape[0]
removed_rows = initial_data_count - nan_removed_data_count
print(f"Removed {removed_rows} rows containing NaN values ({nan_removed_data_count} rows remain).")

# Step 2: Preprocess the data (feature matrix and label vector)
X = data[parameters]
y = data[target]

# Step 3: Equalise the class counts of the target via random undersampling.
# Report the class counts first so the effect of the resampling is visible.
print("Initial class distribution before balancing:")
print(y.value_counts())

# Seeded sampler so the same rows are kept on every run.
sampler = RandomUnderSampler(random_state=42)
X_balanced, y_balanced = sampler.fit_resample(X, y)

# Report the class counts and total size of the resampled data.
print("Class distribution after balancing:")
print(y_balanced.value_counts())

balanced_data_count = len(X_balanced)
print(f"Number of samples after balancing: {balanced_data_count}")

# Split the data into training and testing sets.
# `stratify` keeps the class ratio identical in both partitions; a plain random
# split of the balanced data can leave train and test with different class counts.
# NOTE(review): the split is taken from the already undersampled data, so the test
# set is balanced as well and does not reflect the original class distribution.
# Confirm this is the intended evaluation setting.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    random_state=42,
    stratify=y_balanced,
)

# Standardize features: the scaling statistics are learned from the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define classifiers with their hyperparameter grids.
# Each entry is a tuple: (display name, unfitted estimator, parameter grid that is
# handed to GridSearchCV).
classifiers = [
    # NOTE(review): with penalty=None the value of C should have no effect, so the
    # six C values presumably repeat the same unpenalised fit - confirm and trim.
    ('Logistic Regression', LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000),
     {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2', None], 'solver': ['lbfgs']}),

    # NOTE(review): every min_samples_leaf value is >= 25, so a node needs at least
    # 50 samples to be split; min_samples_split values of 30-45 therefore look
    # redundant - verify against the scikit-learn tree documentation.
    ('Random Forest', RandomForestClassifier(class_weight='balanced', random_state=42),
     {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [30,35,45],
      'min_samples_leaf': [25,50,100,150,200], 'criterion': ['gini', 'entropy'], 'bootstrap': [True, False]}),

    ('Gradient Boosting', GradientBoostingClassifier(random_state=42),
     {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 10],
      'criterion': ['friedman_mse', 'squared_error']}),

    # cache_size is intentionally not part of the search: it only sets the size of
    # the kernel cache (a runtime/memory knob), so searching it multiplied the
    # number of SVC fits by three without changing the fitted model.
    # NOTE(review): `degree` is only used by the 'poly' kernel, so the 'rbf' and
    # 'sigmoid' candidates are repeated once per degree value.
    ('Support Vector Machine (Non-linear)', SVC(class_weight='balanced', probability=True, random_state=42),
     {'C': [0.1, 1, 10], 'kernel': ['poly', 'rbf', 'sigmoid'], 'gamma': ['scale', 'auto'],
      'degree': [2, 3, 4]}),
]

# Column layouts of the two report tables produced by this section.
feature_info_columns = ['Classifier', 'Parameter', 'Importance']
result_columns = ['Classifier', 'Accuracy', 'Precision', 'Recall', 'Specificity', 'F1-Score', 'ROC-AUC', 'Cohen\'s Kappa', 'Best Parameters', 'Considered Hyperparameters']

# Per-classifier results are collected in plain lists and turned into DataFrames
# once after the loop. Growing a DataFrame with pd.concat onto an empty frame is
# deprecated (it raised FutureWarnings) and left the metric columns as object dtype.
feature_info_frames = []
result_records = []
fitted_models = []  # fitted best estimators, in the same order as result_records

# Decision threshold applied to the positive-class probability.
threshold = 0.5

# Loop through classifiers for model training, evaluation, and feature-related information
for name, classifier, param_grid in classifiers:
    print(f"Training and evaluating {name}...")

    # Perform GridSearchCV for hyperparameter tuning (3-fold stratified CV, F1 scoring)
    grid_search = GridSearchCV(
        estimator=classifier,
        param_grid=param_grid,
        scoring='f1',
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        n_jobs=-1,
        error_score='raise',
    )
    grid_search.fit(X_train_scaled, y_train)

    # The refitted winner of the search. Keep it: `classifier` itself is only the
    # template passed to the search and is never fitted.
    best_classifier = grid_search.best_estimator_
    fitted_models.append(best_classifier)

    # Extract feature-related information: linear coefficients when available,
    # otherwise tree-based importances; estimators exposing neither are skipped.
    if hasattr(best_classifier, 'coef_'):
        feature_scores = best_classifier.coef_[0]
    elif hasattr(best_classifier, 'feature_importances_'):  # For Random Forest and Gradient Boosting
        feature_scores = best_classifier.feature_importances_
    else:
        feature_scores = None

    if feature_scores is not None:
        # Rank features by absolute magnitude, largest first.
        ranked_features = sorted(
            zip(X_train.columns, feature_scores),
            key=lambda item: abs(item[1]),
            reverse=True,
        )
        feature_info_frames.append(pd.DataFrame({
            'Classifier': [name] * len(ranked_features),
            'Parameter': [feature for feature, _ in ranked_features],
            'Importance': [score for _, score in ranked_features],
        }))

    # Continuous scores for ROC-AUC plus hard labels for the remaining metrics.
    if hasattr(best_classifier, "predict_proba"):
        y_pred_proba = best_classifier.predict_proba(X_test_scaled)[:, 1]
        y_pred_adjusted = (y_pred_proba > threshold).astype(int)
    elif hasattr(best_classifier, "decision_function"):
        # Decision-function values are signed margins, not probabilities: the
        # class boundary is at 0, not at the probability threshold.
        y_pred_proba = best_classifier.decision_function(X_test_scaled)
        y_pred_adjusted = (y_pred_proba > 0).astype(int)
    else:
        y_pred_proba = best_classifier.predict(X_test_scaled)
        y_pred_adjusted = (y_pred_proba > threshold).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred_adjusted)
    precision = precision_score(y_test, y_pred_adjusted)
    recall = recall_score(y_test, y_pred_adjusted)
    specificity = recall_score(y_test, y_pred_adjusted, pos_label=0)  # recall of the negative class
    f1 = f1_score(y_test, y_pred_adjusted)
    # ROC-AUC must be computed from the continuous scores; feeding it thresholded
    # labels collapses the curve to a single operating point.
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    kappa = cohen_kappa_score(y_test, y_pred_adjusted)

    # Print classification report
    report = classification_report(y_test, y_pred_adjusted)
    print(f"Classification Report for {name}:\n{report}")

    # Store results together with the searched hyperparameter grid
    result_records.append({
        'Classifier': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'Specificity': specificity,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'Cohen\'s Kappa': kappa,
        'Best Parameters': grid_search.best_params_,
        'Considered Hyperparameters': param_grid,
    })

# Build the report tables once; metric columns are numeric without any conversion.
if feature_info_frames:
    feature_info_df = pd.concat(feature_info_frames, ignore_index=True)
else:
    feature_info_df = pd.DataFrame(columns=feature_info_columns)
results_df = pd.DataFrame(result_records, columns=result_columns)

# Save the feature importances and coefficients to an Excel file
feature_info_excel_path = "C:/Users/user/Desktop/feature_info.xlsx"
feature_info_df.to_excel(feature_info_excel_path, index=False)

# Identify the best model based on F1-Score (skipped when no score is available)
if not results_df['F1-Score'].isnull().all():
    best_model_index = results_df['F1-Score'].idxmax()
    best_model_name = results_df.loc[best_model_index, 'Classifier']
    best_model_params = results_df.loc[best_model_index, 'Best Parameters']

    # results_df rows and fitted_models share the same order, so the row label of
    # the best score is also the position of the fitted estimator.
    best_model = fitted_models[best_model_index]

    # Save results to an Excel file
    # NOTE(review): the file name says "Feature Impotances" but the workbook holds
    # the evaluation metrics; the path is left unchanged so existing consumers
    # keep working.
    excel_file_path = "C:/Users/user/Desktop/Feature Impotances.xlsx"
    results_df.to_excel(excel_file_path, index=False)

    # Save the best (fitted) model. It was trained on standardized features, so
    # inputs must be transformed with `scaler` before calling it.
    joblib.dump(best_model, "best_model.pkl")

print("Model training and evaluation completed.")


Initial class distribution before balancing:
Cannibalized
0    67017
1     3160
Name: count, dtype: int64
Class distribution after balancing:
Cannibalized
0    3160
1    3160
Name: count, dtype: int64
Number of samples after balancing: 6320
Training and evaluating Logistic Regression...


  feature_info_df = pd.concat([feature_info_df, coef_df], ignore_index=True)
  results_df = pd.concat([results_df, pd.DataFrame({


Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.77      0.79      0.78       650
           1       0.77      0.75      0.76       614

    accuracy                           0.77      1264
   macro avg       0.77      0.77      0.77      1264
weighted avg       0.77      0.77      0.77      1264

Training and evaluating Random Forest...
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.78      0.80      0.79       650
           1       0.78      0.76      0.77       614

    accuracy                           0.78      1264
   macro avg       0.78      0.78      0.78      1264
weighted avg       0.78      0.78      0.78      1264

Training and evaluating Gradient Boosting...
Classification Report for Gradient Boosting:
              precision    recall  f1-score   support

           0       0.77      0.80      0.79       650
           1      