In [1]:
import os
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold


import warnings
warnings.filterwarnings("ignore")

In [2]:
df_train = pd.read_csv("..\\data\\processed\\clean_training.csv")
df_test = pd.read_csv("..\\data\\processed\\clean_testing.csv")

In [3]:
x_train = df_train.drop(columns=["prognosis"])
y_train = df_train["prognosis"]

In [4]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)


In [5]:
y_train

array([15, 15, 15, 15, 15,  4,  4,  4,  4,  4, 16, 16, 16, 16, 16, 16, 16,
        9,  9,  9,  9,  9,  9,  9,  9, 14, 14, 14, 14, 14, 14, 33, 33, 33,
       33, 33, 33, 33,  1,  1,  1,  1,  1, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 17, 17, 17, 17, 17,  6,  6,  6,  6,  6,  6,  6, 23, 23, 23, 23,
       23, 23, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,  7,  7,  7,  7,  7,
        7, 32, 32, 32, 32, 32, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29,
       29, 29, 29, 29, 29, 29,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 37, 37, 37, 37, 37, 37, 37, 37,
       37, 40, 40, 40, 40, 40, 40, 40, 40, 40, 19, 19, 19, 19, 19, 19, 19,
       19, 19, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
       21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,  3,  3,  3,  3,  3,  3,
        3,  3, 36, 36, 36, 36, 36, 36, 36, 36, 36, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 34, 34, 34, 34, 34, 34, 34, 34, 34, 13, 13, 13, 13, 13,
       13, 18, 18, 18, 18

In [6]:
x_train.isnull().sum()

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
small_dents_in_nails    0
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
Length: 125, dtype: int64

In [7]:
y_train

array([15, 15, 15, 15, 15,  4,  4,  4,  4,  4, 16, 16, 16, 16, 16, 16, 16,
        9,  9,  9,  9,  9,  9,  9,  9, 14, 14, 14, 14, 14, 14, 33, 33, 33,
       33, 33, 33, 33,  1,  1,  1,  1,  1, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 17, 17, 17, 17, 17,  6,  6,  6,  6,  6,  6,  6, 23, 23, 23, 23,
       23, 23, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,  7,  7,  7,  7,  7,
        7, 32, 32, 32, 32, 32, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29,
       29, 29, 29, 29, 29, 29,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 37, 37, 37, 37, 37, 37, 37, 37,
       37, 40, 40, 40, 40, 40, 40, 40, 40, 40, 19, 19, 19, 19, 19, 19, 19,
       19, 19, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
       21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,  3,  3,  3,  3,  3,  3,
        3,  3, 36, 36, 36, 36, 36, 36, 36, 36, 36, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 34, 34, 34, 34, 34, 34, 34, 34, 34, 13, 13, 13, 13, 13,
       13, 18, 18, 18, 18

In [8]:
x_train.shape

(304, 125)

In [9]:
y_train.shape

(304,)

In [10]:
x_test = df_test.drop(columns=["prognosis"])
y_test = df_test["prognosis"]

In [11]:
y_test = le.transform(y_test)

In [12]:
x_test.shape

(42, 125)

In [13]:
y_test.shape

(42,)

In [14]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, multi_class="multinomial"),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42)
}

In [15]:
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    acc = accuracy_score(y_test, preds)
    results[name] = acc

    report = classification_report(y_test, preds)
    print(f"{name} Accuracy: {acc:.4f}\nClassification Report:\n{report}")


Training LogisticRegression...
LogisticRegression Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         1
          15       1.00      1.00      1.00 

In [16]:
top_models_list = sorted(results.items(), key=lambda x: x[1], reverse=True)[:3]
print(f"\nTop {3} models:", top_models_list)


Top 3 models: [('LogisticRegression', 1.0), ('CatBoost', 1.0), ('RandomForest', 0.9761904761904762)]


In [17]:
param_grids = {
    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "solver": ["newton-cg", "saga", "lbfgs"],
        "penalty": ["l2", "none"]
    },
    "RandomForest": {
        "n_estimators": [100, 200, 300, 400],
        "max_depth": [5, 10, 20, 30, 40, 50],
        "min_samples_split": [5, 10, 15, 20, 25],
        "min_samples_leaf": [1, 2, 3, 4, 5, 6],
        "bootstrap": [True, False]
    },
    "CatBoost": {
        "iterations": [200, 300, 400, 500],
        "depth": [2, 6, 10, 12],
        "learning_rate": [0.01, 0.05, 0.07, 0.1],
        "l2_leaf_reg": [1, 3, 5, 7]
    }
}


In [18]:
def tune_model_grid(model, param_grid, x_train, y_train, cv=3):
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring="accuracy",
        cv=StratifiedKFold(n_splits=cv, shuffle=True, random_state=42),
        verbose=2,
        n_jobs=-1
    )
    grid.fit(x_train, y_train)
    return grid.best_estimator_, grid.best_params_, grid.best_score_


In [25]:
best_models = {}
for name, _ in [top_models_list[-1]]:
    print(f"\nHyperparameter tuning for {name}...")
    tuned_model, best_params, best_score = tune_model_grid(
        models[name], param_grids[name], x_train, y_train
    )
    best_models[name] = {
        "model": tuned_model,
        "best_params": best_params,
        "cv_score": best_score
    }
    print(f"Best {name}: {best_params} (CV Score={best_score:.4f})")



Hyperparameter tuning for RandomForest...
Fitting 3 folds for each of 1440 candidates, totalling 4320 fits
Best RandomForest: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100} (CV Score=1.0000)


In [27]:
for name, info in best_models.items():
    preds = info["model"].predict(x_test)
    acc = accuracy_score(y_test, preds)
    print(f"\n{name} Test Accuracy: {acc:.4f}")
    print(classification_report(y_test, preds))


RandomForest Test Accuracy: 0.9762
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       0.50      1.00      0.67         1
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         1
          15       1.00      0.50      0.67         2
          16       1.00      1.00      1.00  

In [28]:
# Pick best tuned model by CV score
final_best = max(best_models.items(), key=lambda x: x[1]["cv_score"])
best_name, best_info = final_best
print(f"\nFinal Best Model: {best_name} with CV Score {best_info['cv_score']:.4f}")



Final Best Model: RandomForest with CV Score 1.0000


In [29]:
model_path = os.path.join("../models", f"final_model.joblib")
joblib.dump(best_info["model"], model_path)
joblib.dump(le, "../models/label_encoder.joblib")

['../models/label_encoder.joblib']

In [30]:
pred = model.predict([x_test.iloc[30]])
print(le.inverse_transform(pred), pred)

['Varicose veins'] [[39]]


In [31]:
y_test

array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7, 32, 28, 29,  8,
       11, 37, 40, 19, 20, 21, 22,  3, 36, 10, 34, 13, 18, 39, 26, 24, 25,
       31,  5,  0,  2, 38, 35, 27, 15])