In [1]:
# set up jupyter
from IPython.display import display, Markdown # pretty display
from IPython.core.interactiveshell import InteractiveShell # multi output

# Show the result of every expression statement in a cell, not only the last one.
# Later cells place bare Markdown(...) headings mid-cell and depend on this
# setting to have them rendered.
InteractiveShell.ast_node_interactivity = "all" # type: ignore

In [None]:
# import packages
import pandas as pd
from imblearn.over_sampling import SMOTE
from joblib import dump
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from utils.plotter import Plotter # type: ignore

In [None]:
# load the preprocessed dataset from disk
data = pd.read_csv(filepath_or_buffer="./dataset/preprocessed.csv") # type: ignore

# heading first, then the frame itself
Markdown("# Preprocessed data")
display(data)

In [None]:
# separate the target column from the predictor columns
labels = data["Label"]
features = data.drop(columns="Label")

# hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable
# NOTE(review): the split is not stratified although the notebook describes the
# dataset as very imbalanced -- consider stratify=labels after confirming every
# class has enough rows.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=5,
)

In [None]:
# create plotter for test results
# Plotter is the project-local helper imported from utils.plotter. It is built
# once with the held-out labels and then called with each model's predictions
# in the result cells.
# NOTE(review): presumably it compares predictions against y_test -- confirm
# in utils/plotter.
plotter = Plotter(y_test)

In [None]:
# the class distribution is heavily skewed, so rebalance it by oversampling;
# only the training split is resampled, the test split is left as-is
oversampler = SMOTE(random_state=0)
smote_x, smote_y = oversampler.fit_resample(X=x_train, y=y_train) # type: ignore

In [None]:
# train and save decision tree
# The model is fitted on plain arrays (.values); the test cell predicts on
# x_test.values to match.
# random_state is pinned, like the seeds already used for the train/test split
# and for SMOTE, so that re-running the notebook yields the same fitted tree.
dtc = DecisionTreeClassifier(random_state=0).fit(smote_x.values, smote_y.values)
dtc_path = "../models/decision_tree.joblib"
# assigning to _ keeps dump's return value out of the cell output
_ = dump(dtc, dtc_path)
Markdown(f"Saved decision tree model to `{dtc_path}`")

In [None]:
# test decision tree
# predict on the held-out features as a plain array, matching how the model was fitted
dtc_pred = dtc.predict(X=x_test.values)

# heading, then hand the predictions to the plotter
Markdown("# Results: Decision Tree")
plotter(dtc_pred)

In [None]:
# train and save random forest
# The model is fitted on plain arrays (.values); the test cell predicts on
# x_test.values to match.
# random_state is pinned, like the seeds already used for the train/test split
# and for SMOTE, so that re-running the notebook yields the same fitted forest.
rfc = RandomForestClassifier(random_state=0).fit(smote_x.values, smote_y.values)
rfc_path = "../models/random_forest.joblib"
# assigning to _ keeps dump's return value out of the cell output
_ = dump(rfc, rfc_path)
Markdown(f"Saved random forest model to `{rfc_path}`")

In [None]:
# evaluate the random forest on the held-out split
# features are passed as a plain array, matching how the model was fitted
rfc_pred = rfc.predict(X=x_test.values)

# heading, then hand the predictions to the plotter
Markdown("# Results: Random Forest")
plotter(rfc_pred)

In [None]:
# train and save gradient boosting
# The model is fitted on plain arrays (.values); the test cell predicts on
# x_test.values to match.
# random_state is pinned, like the seeds already used for the train/test split
# and for SMOTE, so that re-running the notebook yields the same fitted model.
gbc = GradientBoostingClassifier(random_state=0).fit(smote_x.values, smote_y.values)
gbc_path = "../models/gradient_boosting.joblib"
# assigning to _ keeps dump's return value out of the cell output
_ = dump(gbc, gbc_path)
# report the path this model was actually written to (gbc_path, not rfc_path)
Markdown(f"Saved gradient boosting model to `{gbc_path}`")

In [None]:
# evaluate the gradient boosting model on the held-out split
# features are passed as a plain array, matching how the model was fitted
gbc_pred = gbc.predict(X=x_test.values)

# heading, then hand the predictions to the plotter
Markdown("# Results: Gradient Boosting")
plotter(gbc_pred)