### Dataset

In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


#### Import libraries

In [169]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from ucimlrepo import fetch_ucirepo

In [258]:
# Score every already-fitted baseline model in `trained_models` on the test split.
# Each row is [model name, accuracy, precision, recall, F1, ROC AUC].
baseline_results = []

for name, model in trained_models.items():
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the score for the second entry of
    # model.classes_ (presumably the positive label — confirm against the target encoding).
    y_prob = model.predict_proba(X_test)[:, 1]

    baseline_results.append([
        name,
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        roc_auc_score(y_test, y_prob),   # AUC is computed from scores, not hard labels
    ])

# Tabulate the rows and show them rounded to three decimals.
baseline_df = pd.DataFrame(
    baseline_results,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1", "AUC"],
)
print("📊 Baseline Model Performance:")
print(baseline_df.round(3))

📊 Baseline Model Performance:
                 Model  Accuracy  Precision  Recall     F1    AUC
0  Logistic Regression     0.885      0.862   0.893  0.877  0.953
1        Decision Tree     0.770      0.719   0.821  0.767  0.774
2        Random Forest     0.869      0.833   0.893  0.862  0.938
3                  SVM     0.902      0.923   0.857  0.889  0.955


In [264]:
# Tune logistic regression with an exhaustive grid search, ranking candidates
# by 5-fold cross-validated ROC AUC on the training split.
# GridSearchCV is imported in the notebook's import cell.
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],  # Required for l1 penalty; also supports l2
}

lr = LogisticRegression(random_state=42)
grid_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='roc_auc', n_jobs=-1)
grid_lr.fit(X_train, y_train)

# best_estimator_ is the winning configuration refit on all of X_train
# (GridSearchCV's default refit=True).
best_lr = grid_lr.best_estimator_
print("✅ Best Logistic Regression:", grid_lr.best_params_)

✅ Best Logistic Regression: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}


In [265]:
# Tune the decision tree with a randomized search (100 sampled candidates,
# 5-fold CV, ranked by ROC AUC).
# RandomizedSearchCV is imported in the notebook's import cell.
param_dist_dt = {
    'max_depth': [3, 5, 7, 10, None],
    # Plain Python ints (not np.arange) so best_params_ holds native ints
    # instead of np.int64 values; the candidate values themselves are unchanged.
    'min_samples_split': list(range(2, 20)),
    'min_samples_leaf': list(range(1, 10)),
    'criterion': ['gini', 'entropy'],
}

dt = DecisionTreeClassifier(random_state=42)
random_dt = RandomizedSearchCV(
    dt,
    param_dist_dt,
    n_iter=100,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,   # fixes which candidates are sampled
)
random_dt.fit(X_train, y_train)

best_dt = random_dt.best_estimator_
print("✅ Best Decision Tree:", random_dt.best_params_)

✅ Best Decision Tree: {'min_samples_split': np.int64(5), 'min_samples_leaf': np.int64(9), 'max_depth': 5, 'criterion': 'gini'}


In [266]:
# Tune the random forest with a randomized search over a deliberately small
# space. RandomizedSearchCV is imported in the notebook's import cell.
#
# NOTE(review): the saved output of this cell lists a 'bootstrap' key and no
# 'max_features', which does not match the search space below — the output looks
# stale; re-run the cell to refresh it.
param_dist_rf = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    # 'bootstrap' is intentionally not searched; the estimator default is used.
}

rf = RandomForestClassifier(random_state=42)

# Fewer sampled candidates (20) and folds (3) than the other searches in this
# notebook to keep the run time down.
random_rf = RandomizedSearchCV(
    rf,
    param_dist_rf,
    n_iter=20,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    verbose=1,            # print search progress
)

random_rf.fit(X_train, y_train)

best_rf = random_rf.best_estimator_
print("✅ Best Random Forest:", random_rf.best_params_)

✅ Best Random Forest: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 5, 'bootstrap': False}


In [267]:
# Exhaustive SVM search: 3 C values x 6 gamma settings x 2 kernels,
# ranked by 5-fold cross-validated ROC AUC.
param_grid_svm = dict(
    C=[0.1, 1, 10],
    gamma=['scale', 'auto', 0.001, 0.01, 0.1, 1],
    kernel=['rbf', 'sigmoid'],
)

# probability=True so the fitted model exposes predict_proba.
svm = SVC(probability=True, random_state=42)
grid_svm = GridSearchCV(
    svm,
    param_grid_svm,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_svm.fit(X_train, y_train)

# Keep the refit winner and report its settings.
best_svm = grid_svm.best_estimator_
print("✅ Best SVM:", grid_svm.best_params_)

✅ Best SVM: {'C': 10, 'gamma': 0.01, 'kernel': 'sigmoid'}


In [270]:
# Tuned estimators, keyed by the same display names used for the baseline table.
optimized_models = dict([
    ("Logistic Regression", best_lr),
    ("Decision Tree", best_dt),
    ("Random Forest", best_rf),
    ("SVM", best_svm),
])

# Score each tuned model on the test split.
# Row layout: [name, accuracy, precision, recall, F1, ROC AUC].
optimized_results = []

for name, model in optimized_models.items():
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba: score for the second entry of model.classes_
    # (presumably the positive label — confirm).
    y_prob = model.predict_proba(X_test)[:, 1]

    # The first four metrics compare hard labels; AUC needs the scores.
    label_scores = [
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    ]
    optimized_results.append([name, *label_scores, roc_auc_score(y_test, y_prob)])

# Tabulate and display, rounded to three decimals.
optimized_df = pd.DataFrame(
    optimized_results,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1", "AUC"],
)
print("\n📊 Optimized Model Performance:")
print(optimized_df.round(3))


📊 Optimized Model Performance:
                 Model  Accuracy  Precision  Recall     F1    AUC
0  Logistic Regression     0.885      0.862   0.893  0.877  0.951
1        Decision Tree     0.754      0.810   0.607  0.694  0.735
2        Random Forest     0.869      0.885   0.821  0.852  0.927
3                  SVM     0.836      0.875   0.750  0.808  0.949


In [272]:
# Tag each results table with where it came from.
# Note: this adds a 'Status' column to both frames in place.
optimized_df['Status'] = 'Optimized'
baseline_df['Status'] = 'Baseline'

# Stack baseline rows above optimized rows and build a "<Model> (<Status>)" label.
comparison = pd.concat([baseline_df, optimized_df], axis=0)
comparison['Model_Status'] = (
    comparison['Model'] + ' (' + comparison['Status'] + ')'
)

# One row per model, one (metric, status) column pair per metric.
# pivot_table aggregates with the mean; each (Model, Status) pair occurs once here.
final_comp = pd.pivot_table(
    comparison,
    values=['Accuracy', 'Precision', 'Recall', 'F1', 'AUC'],
    index='Model',
    columns='Status',
)

print("\n📊 Final Comparison: Baseline vs Optimized")
print(final_comp.round(3))



📊 Final Comparison: Baseline vs Optimized
                         AUC           Accuracy                 F1            \
Status              Baseline Optimized Baseline Optimized Baseline Optimized   
Model                                                                          
Decision Tree          0.774     0.735    0.770     0.754    0.767     0.694   
Logistic Regression    0.953     0.951    0.885     0.885    0.877     0.877   
Random Forest          0.938     0.927    0.869     0.869    0.862     0.852   
SVM                    0.955     0.949    0.902     0.836    0.889     0.808   

                    Precision             Recall            
Status               Baseline Optimized Baseline Optimized  
Model                                                       
Decision Tree           0.719     0.810    0.821     0.607  
Logistic Regression     0.862     0.862    0.893     0.893  
Random Forest           0.833     0.885    0.893     0.821  
SVM                     0.923 