In [1]:
from src.model_dev import *
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
# Load the dataset
data = pd.read_csv('../data/hd_data_preprocessed.csv')

In [3]:
# Split features and target
X = data.drop('num', axis=1)
y = data['num']

In [4]:
Label_Encoder = LabelEncoder()

for col in X.columns:
    if X[col].dtype == 'object' or X[col].dtype == 'category':
        X[col] = Label_Encoder.fit_transform(X[col])
    else:
        pass

In [5]:
# Split the data
X_train, X_test, y_train, y_test = split_data(X, y)

In [6]:
models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('KNeighbors', KNeighborsClassifier()),
    ('SVM', SVC(probability=True, random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42))
]

In [7]:
metric_weights = {
    'Accuracy': 0.2,
    'Precision': 0.2,
    'Recall': 0.2,
    'F1 Score': 0.3,
    'ROC-AUC': 0.1
}

results, best_model, best_model_name = evaluate_models(X_train, X_test, y_train, y_test, models, metric_weights)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 381
[LightGBM] [Info] Number of data points in the train set: 240, number of used features: 15
[LightGBM] [Info] Start training from score -1.149906
[LightGBM] [Info] Start training from score -1.003302
[LightGBM] [Info] Start training from score -2.014903
[LightGBM] [Info] Start training from score -1.984131
[LightGBM] [Info] Start training from score -3.082744
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 380
[LightGBM] [Info] Number of data points in the train set: 240, number of used features: 15
[LightGBM] [Info] Start training from score -1.149906
[LightGBM] [Info] Start training from score -1.003302
[LightGBM] [Info] Start training from score -2.014

In [8]:
for name, metrics in results.items():
    print(f"Model: {name}")
    for metric, value in metrics.items():
        if metric != "Confusion Matrix":
            print(f"{metric}: {value:.4f}")
        else:
            print(f"{metric}:\n{value}")
    print()

Model: Logistic Regression
Cross-Validation Accuracy: 0.5667
Test Accuracy: 0.6047
Precision: 0.4243
Recall: 0.4051
F1 Score: 0.4088
Confusion Matrix:
[[46  7  0  1  1]
 [ 5 23  3  3  1]
 [ 1  2  2  4  2]
 [ 0  8  0  7  5]
 [ 0  3  1  4  0]]
ROC-AUC: 0.8096
Weighted Score: 0.4904

Model: Random Forest
Cross-Validation Accuracy: 0.6867
Test Accuracy: 0.6512
Precision: 0.4372
Recall: 0.4583
F1 Score: 0.4474
Confusion Matrix:
[[47  5  3  0  0]
 [ 7 25  2  1  0]
 [ 2  1  3  5  0]
 [ 2  4  3  9  2]
 [ 0  2  1  5  0]]
ROC-AUC: 0.8653
Weighted Score: 0.5301

Model: Gradient Boosting
Cross-Validation Accuracy: 0.6600
Test Accuracy: 0.6589
Precision: 0.4892
Recall: 0.4855
F1 Score: 0.4855
Confusion Matrix:
[[48  4  3  0  0]
 [ 6 23  3  3  0]
 [ 2  2  3  4  0]
 [ 0  5  1 10  4]
 [ 0  2  2  3  1]]
ROC-AUC: 0.8112
Weighted Score: 0.5535

Model: KNeighbors
Cross-Validation Accuracy: 0.5667
Test Accuracy: 0.5736
Precision: 0.3797
Recall: 0.3778
F1 Score: 0.3703
Confusion Matrix:
[[44  8  2  1  0]
 [

In [9]:
print(f"Best Model: {best_model.named_steps['model']}")
print(f"Best Model Name: {best_model_name}")
print(f"Best Weighted Score: {results[best_model_name]['Weighted Score']:.4f}")

Best Model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
Best Model Name: XGBoost
Best Weighted Score: 0.5692


In [10]:
tuned_models = tune_all_models(X_train, y_train, X_test, y_test, metric_weights=metric_weights)


Tuning Logistic Regression...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for Logistic Regression:
{'model__solver': 'saga', 'model__penalty': 'l2', 'model__C': 29.763514416313132}
Weighted score: 0.4616

Tuning Random Forest...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for Random Forest:
{'model__n_estimators': 300, 'model__min_samples_split': 2, 'model__min_samples_leaf': 2, 'model__max_features': 'log2', 'model__max_depth': 7}
Weighted score: 0.5422

Tuning Gradient Boosting...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for Gradient Boosting:
{'model__subsample': 0.7999999999999999, 'model__n_estimators': 900, 'model__max_features': 'sqrt', 'model__max_depth': 9, 'model__learning_rate': 0.046415888336127774}
Weighted score: 0.5371

Tuning KNeighbors...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters for KNeighbors:
{'model__weights': 'uniform', 'mod

In [11]:
best_model_name = max(tuned_models, key=lambda x: tuned_models[x].get('best_score', float('-inf')))
best_model = tuned_models[best_model_name]['best_estimator']
print(f"\nBest overall model: {best_model_name}")
print(f"Best overall score: {tuned_models[best_model_name]['best_score']:.4f}")


Best overall model: LightGBM
Best overall score: 0.5615


In [12]:
sorted_models = sorted(tuned_models.items(), key=lambda x: x[1]['best_score'], reverse=True)

In [19]:
selected_models = ['LightGBM', 'Gradient Boosting', 'XGBoost', 'Random Forest', 'Logistic Regression']
models_info = {}
for i, (model_name, model_info) in enumerate(sorted_models):
    if model_name in selected_models:
        rank = i + 1
        
        print(f"Rank {rank} model ({model_name})")
        print(f"Score: {model_info['best_score']:.4f}")
        print(f"Parameters: {model_info['best_params']}")
        print()
        
        models_info[model_name] = {
            'rank': rank,
            'score': model_info['best_score'],
            'pipeline': model_info['best_estimator'],
            'parameters': model_info['best_params'],
            'metrics': model_info['metrics']
        }

joblib.dump(models_info, '../models/initial_models_info.joblib')
print("Information about all models saved to '../models/initial_models_info.joblib'")

Rank 1 model (LightGBM)
Score: 0.5615
Parameters: {'model__subsample': 0.7999999999999999, 'model__num_leaves': 40, 'model__n_estimators': 400, 'model__max_depth': 4, 'model__learning_rate': 0.021544346900318832, 'model__colsample_bytree': 0.6}

Rank 2 model (Random Forest)
Score: 0.5422
Parameters: {'model__n_estimators': 300, 'model__min_samples_split': 2, 'model__min_samples_leaf': 2, 'model__max_features': 'log2', 'model__max_depth': 7}

Rank 3 model (Gradient Boosting)
Score: 0.5371
Parameters: {'model__subsample': 0.7999999999999999, 'model__n_estimators': 900, 'model__max_features': 'sqrt', 'model__max_depth': 9, 'model__learning_rate': 0.046415888336127774}

Rank 4 model (XGBoost)
Score: 0.5328
Parameters: {'model__subsample': 0.6, 'model__n_estimators': 500, 'model__min_child_weight': 9, 'model__max_depth': 8, 'model__learning_rate': 0.046415888336127774, 'model__colsample_bytree': 0.8999999999999999}

Rank 7 model (Logistic Regression)
Score: 0.4616
Parameters: {'model__solve

In [16]:
print("\nAll Models Summary:")
for model_name, info in all_models_info.items():
    print(f"Rank {info['rank']}: {model_name} (Score: {info['score']:.4f})")
    print(f"Parameters: {info['parameters']}")
    print(f"Metrics:")
    for metric_name, metric_value in info['metrics'].items():
        if metric_name != "Confusion Matrix":
            print(f"  {metric_name}: {metric_value:.4f}")
        else:
            print(f"  {metric_name}:\n{metric_value}")
    print()


All Models Summary:
Rank 1: LightGBM (Score: 0.5615)
Parameters: {'model__subsample': 0.7999999999999999, 'model__num_leaves': 40, 'model__n_estimators': 400, 'model__max_depth': 4, 'model__learning_rate': 0.021544346900318832, 'model__colsample_bytree': 0.6}
Metrics:
  accuracy: 0.6667
  precision: 0.5015
  recall: 0.4892
  f1: 0.4905
  roc_auc: 0.8284

Rank 2: Random Forest (Score: 0.5422)
Parameters: {'model__n_estimators': 300, 'model__min_samples_split': 2, 'model__min_samples_leaf': 2, 'model__max_features': 'log2', 'model__max_depth': 7}
Metrics:
  accuracy: 0.6667
  precision: 0.4546
  recall: 0.4656
  f1: 0.4587
  roc_auc: 0.8724

Rank 3: Gradient Boosting (Score: 0.5371)
Parameters: {'model__subsample': 0.7999999999999999, 'model__n_estimators': 900, 'model__max_features': 'sqrt', 'model__max_depth': 9, 'model__learning_rate': 0.046415888336127774}
Metrics:
  accuracy: 0.6589
  precision: 0.4641
  recall: 0.4564
  f1: 0.4576
  roc_auc: 0.8390

Rank 4: XGBoost (Score: 0.5328)