In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
# from ydata_profiling import ProfileReport
from itertools import product

# preprocessing
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# handling detection
from sklearn.ensemble import IsolationForest

# imbalanced data 
from imblearn.over_sampling import SMOTE, SMOTENC

# models
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# hyper-parameter tunning
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import FunctionTransformer


# evaluation
from sklearn.metrics import\
    accuracy_score, confusion_matrix, classification_report, \
    f1_score, recall_score, balanced_accuracy_score, precision_score, \
    confusion_matrix, ConfusionMatrixDisplay


# storing the best model
import pickle


In [2]:
pd.set_option('display.max_columns', 30)  # Adjust the number as needed

%matplotlib inline

import warnings
warnings.filterwarnings('ignore') 

In [3]:
# loading the dataset
train_data = pd.read_csv("../data/train.csv").drop(columns=["id"])
train_data2 = pd.read_csv("../data/Cir.csv").drop(columns=["ID"]) # original data
data = pd.concat(objs=[train_data, train_data2]).reset_index(drop=True) # combine 2 datasets

# preprocessing

In [4]:
# perform basic pipeline
from src.pipelines import basic_pipeline

# X_train = basic_pipeline.fit_transform(X_train)
data = basic_pipeline.fit_transform(data)

In [5]:
data

Unnamed: 0,N_years,Age,is_male,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status,took_drug
0,2.736986,58.991781,1.0,0,0,0,N,2.3,316.0,3.35,172.0,1601.0,179.80,63.0,394.0,9.7,3.0,2,1
1,7.052055,52.704110,0.0,0,0,0,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,0,0
2,9.391781,37.608219,0.0,0,1,1,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,2,0
3,7.057534,50.575342,0.0,0,0,0,N,0.6,256.0,3.50,58.0,1653.0,71.30,96.0,269.0,10.7,3.0,0,0
4,2.158904,45.638356,0.0,0,1,0,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8318,1.865753,67.046575,0.0,0,0,0,N,1.2,,2.96,,,,,174.0,10.9,3.0,2,0
8319,3.021918,39.027397,0.0,0,0,0,N,0.9,,3.83,,,,,180.0,11.2,4.0,0,0
8320,2.890411,57.038356,0.0,0,0,0,N,1.6,,3.42,,,,,143.0,9.9,3.0,0,0
8321,1.893151,58.041096,0.0,0,0,0,N,0.8,,3.75,,,,,269.0,10.4,3.0,0,0


In [None]:
# from src.pipelines import *

# advanced_pipeline = Pipeline(
#     [
#         ("missing_value_preprocess", FunctionTransformer(func=missing_value_imputation, validate=False)),
#         ("outlier_removal_preprocess", FunctionTransformer(func=outlier_removal_quantile, validate=False)),
#         # ("imbalanced_fix_preprocess", FunctionTransformer(func=fix_imbalanced_SMOTE, validate=False)),
#         ("duplicate_removal_preprocess", FunctionTransformer(func=duplicate_removal, validate=False)),
#         ("encoding_preprocess", FunctionTransformer(func=encoding, validate=False)),
#     ]
# )

# advanced_pipeline.fit_transform(X_train)

from src.pipelines import *

advanced_pipeline = Pipeline(
    [   
        ("missing_value_preprocess", FunctionTransformer(func=missing_value_imputation, validate=False)),
        ("outlier_removal_preprocess", FunctionTransformer(func=outlier_removal_quantile, validate=False)),
        # ("imbalanced_fix_preprocess", FunctionTransformer(func=fix_imbalanced_SMOTE, validate=False)),
        ("duplicate_removal_preprocess", FunctionTransformer(func=duplicate_removal, validate=False)),
        ("encoding_preprocess", FunctionTransformer(func=encoding, validate=False)),
    ]
)

data = advanced_pipeline.fit_transform(data)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["Status"]), data["Status"])

# Modeling

In [None]:
import xgboost as xgb



In [30]:
# Bayesian search using skopt
np.int = np.int_ # fo fix the conflict with python version

opt = BayesSearchCV(
    xgb.XGBClassifier(),
    {
        "n_estimators": Integer(10, 100),
        "max_depth": Integer(5, 50),
        "num_class": Categorical([3]),
        "learning_rate": Real(0.01, 0.4, prior="uniform"),
        "booster": Categorical(["gbtree", "gblinear"]),
        # "device": Categorical(["cuda"]),
        # "tree_method": Categorical(["hist"]),
        # "early_stopping_rounds": Categorical([0, 5, 10, 20]),
        "eval_metric":  ["logloss"], # mlogloss
    },
    n_iter=32,
    cv=5,
    scoring='accuracy',
    verbose=3,
    random_state=9090,
)

In [31]:
# final_data = data
# X_train, X_test, y_train, y_test = train_test_split(final_data.drop(columns=["Status"]), final_data["Status"])

In [32]:
X_train.head()

Unnamed: 0,N_years,Age,is_male,Ascites,Hepatomegaly,Spiders,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,took_drug,Edema_N,Edema_S,Edema_Y
6366,2.723288,40.819178,0.0,0,1,1,1.6,291.0,3.57,67.0,1601.0,227.04,106.0,181.0,10.0,4.0,1,1,0,0
40,8.684932,58.210959,0.0,0,0,0,1.5,478.0,3.43,75.0,289.0,97.65,108.0,427.0,9.9,3.0,1,1,0,0
2814,4.021918,61.846575,0.0,0,0,0,2.1,258.0,3.7,69.0,1214.0,158.1,134.0,225.0,12.0,2.0,0,1,0,0
1902,3.928767,56.608219,0.0,0,1,0,0.7,212.0,3.83,41.0,824.0,127.1,85.0,265.0,11.0,4.0,0,1,0,0
2057,4.849315,56.024658,0.0,0,1,1,1.6,226.0,3.35,39.0,1083.0,75.95,56.0,336.0,9.7,3.0,1,1,0,0


In [33]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    opt.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END booster=gbtree, eval_metric=logloss, learning_rate=0.29609945082934797, max_depth=18, n_estimators=77, num_class=3;, score=0.840 total time=   1.6s
[CV 2/5] END booster=gbtree, eval_metric=logloss, learning_rate=0.29609945082934797, max_depth=18, n_estimators=77, num_class=3;, score=0.824 total time=   1.2s
[CV 3/5] END booster=gbtree, eval_metric=logloss, learning_rate=0.29609945082934797, max_depth=18, n_estimators=77, num_class=3;, score=0.852 total time=   1.1s
[CV 4/5] END booster=gbtree, eval_metric=logloss, learning_rate=0.29609945082934797, max_depth=18, n_estimators=77, num_class=3;, score=0.853 total time=   1.0s
[CV 5/5] END booster=gbtree, eval_metric=logloss, learning_rate=0.29609945082934797, max_depth=18, n_estimators=77, num_class=3;, score=0.818 total time=   1.1s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END booster=gbtree, eval_metric=logloss, learning_rate=0.17757114729

In [34]:
opt.best_score_

0.8388910036797361

In [None]:
# opt.best_score_

0.9122271030090051

# Evaluation 