In [1]:
import mlflow
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import logging
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import plotly.express as px

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    f1_score,
    confusion_matrix,
    hamming_loss,
    log_loss,
    precision_score,
    recall_score,
    multilabel_confusion_matrix,
    classification_report,
    precision_recall_curve,
    average_precision_score,
    precision_recall_fscore_support,
    mean_absolute_error,
)

In [4]:
# Load train.csv from the "engineered" data folder (path is relative to the notebook's working directory).
# NOTE(review): df_train_eng is not referenced again in the visible cells — confirm it is still needed.
df_train_eng = pd.read_csv("../data/engineered/train.csv")

In [5]:
# Load train.csv from the "feature-selected/double/Mixed" folder; this frame is the one passed to the
# cross-validated fit and copied into df_train further down.
df_train_selected = pd.read_csv("../data/feature-selected/double/Mixed/train.csv")

In [6]:
# Load train.csv from the "outlier_removed" folder; later cells read its "Age", "HomePlanet" and
# "CryoSleep" columns for error analysis.
df_train_original = pd.read_csv("../data/outlier_removed/train.csv")

In [7]:
def get_cat_features(df: pd.DataFrame, label: str) -> list[str]:
    """
    Collect the names of the categorical feature columns of a DataFrame.

    A column is treated as categorical when its dtype compares equal to object ("O"),
    "bool" or "category". The label column is never part of the result.

    :param df: DataFrame whose columns are inspected
    :param label: Name of the target column to leave out
    :return: Categorical column names, in the order they appear in ``df.columns``
    """
    categorical_kinds = ("O", "bool", "category")
    selected = []
    for column in df.columns:
        if column == label:
            continue
        column_dtype = df[column].dtype
        if any(column_dtype == kind for kind in categorical_kinds):
            selected.append(column)
    return selected


def get_fit_cat_params(estimator_class: str, cat_col_list=None) -> dict:
    """
    Build the keyword argument that hands the categorical columns to an estimator.

    Each supported estimator family uses a different keyword name for its categorical
    columns; this helper returns a one-entry dict with the right keyword.

    :param estimator_class: Class name of the estimator, e.g. "CatBoostClassifier"
    :param cat_col_list: Categorical column names; ``None`` (the default) means no
        categorical columns and yields a fresh empty list on every call
    :return: ``{keyword: cat_col_list}`` for a supported estimator, otherwise ``{}``
    """
    # A new list per call: a shared default list would leak mutations between callers,
    # because the list itself is handed back inside the returned dict.
    if cat_col_list is None:
        cat_col_list = []

    keyword_by_estimator = {
        "CatBoostClassifier": "cat_features",
        "CatBoostRegressor": "cat_features",
        "LGBMClassifier": "categorical_feature",
        "LGBMRegressor": "categorical_feature",
        "HistGradientBoostingClassifier": "categorical_features",
        "HistGradientBoostingRegressor": "categorical_features",
    }

    keyword = keyword_by_estimator.get(estimator_class)
    # Unknown estimators get an empty dict so the result can always be splatted into fit(**kwargs)
    return {} if keyword is None else {keyword: cat_col_list}


def get_single_metric(metric_name: str):
    """
    Look up a scoring function by its short name
    :param metric_name: Key of the desired metric, e.g. "accuracy", "roc_auc" or "MAE"
    :return: the metric callable registered under that name
    :raises ValueError: if no metric is registered under ``metric_name``
    """
    supported_metrics = {
        "accuracy": accuracy_score,
        "roc_auc": roc_auc_score,
        "f1": f1_score,
        "confusion_matrix": confusion_matrix,
        "precision": precision_score,
        "recall": recall_score,
        "precision_recall_curve": precision_recall_curve,
        "average_precision": average_precision_score,
        "precision_recall_fscore_support": precision_recall_fscore_support,
        "MAE": mean_absolute_error,
    }

    # Guard clause: reject unknown names before handing anything back
    if metric_name not in supported_metrics:
        raise ValueError(f"Metric name '{metric_name}' is not supported.")
    return supported_metrics[metric_name]


In [8]:
class cv_training(BaseEstimator, ClassifierMixin):
    """
    Cross-validated training of one estimator class on a labelled DataFrame.

    For every fold a fresh estimator is built as ``estimator(**params)``, fitted on the training
    part and scored on the validation part. The fitted fold estimators are kept and combined by
    ``predict`` through hard and soft voting. The class derives from scikit-learn's BaseEstimator
    and ClassifierMixin.

    Attributes:
        estimator (type): Estimator class (not an instance); it is called once per fold.
        params (dict or None): Keyword arguments used to build each fold's estimator.
        random_state (int): Seed passed to the cross-validation splitter.
        cv (object): Cross-validation splitter created by ``fit`` (None before fitting).
        n_splits (int): Number of splits for the cross-validation.
        estimators (list): Fitted estimator of each fold of the latest ``fit`` call.
        features (pd.Index): Feature column names used for training.
        train_preds (list of pd.Series): Validation predictions of each fold, indexed by the
            positional row numbers of the validation rows.
        conf_matrices (list): Initialised empty; not populated by this class.
        fit_kwargs (dict): Additional keyword arguments for the `fit` method of the estimator.
        predict_kwargs (dict): Additional keyword arguments for the `predict` method of the estimator.
        metric_kwargs (dict): Keyword arguments for each metric, keyed by metric name.
        metrics (dict): Dictionary storing the scores for each metric across folds.
        metrics_stats (dict): Dictionary storing the statistical measures (mean, median, std, final) for each metric.

    Args:
        n_splits (int): Number of splits for cross-validation.
        estimator (type): Estimator class to be used for training.
        params (dict, optional): Dictionary of parameters to initialize the estimator.
        random_state (int, optional): Seed for the random number generator used in cross-validation.
    """

    # Metrics scored on the positive-class probability instead of the predicted label
    _PROBA_METRICS = ("roc_auc", "average_precision", "precision_recall_curve")
    # Accepted values for the entries of `metric_opt_dir_list`
    _OPT_DIRECTIONS = ("max", "min", "compr")

    def __init__(self, n_splits: int = 10, estimator=None, params: dict = None, random_state: int = 42):
        self.estimator = estimator
        # Stored exactly as passed (None means "no parameters") so no default dict is shared
        # between instances; `fit` substitutes an empty dict when building estimators.
        self.params = params
        self.random_state = random_state
        self.cv = None
        self.n_splits = n_splits
        self.estimators = []
        self.features = []
        self.conf_matrices = []  # Store confusion matrices for each fold
        self.train_preds = []

    @staticmethod
    def _announce(message: str) -> None:
        """Send a progress message to both the logging module and stdout."""
        logging.info(message)
        print(message)

    def fit(
        self,
        df: pd.DataFrame,
        label: str = None,
        fit_kwargs: dict = None,
        predict_kwargs: dict = None,
        metric_list: list[str] = None,
        metric_opt_dir_list: list[str] = None,
        metric_kwargs: dict = None,
    ):
        """
        Fits one estimator per cross-validation fold and computes the requested metrics for each fold.

        StratifiedKFold is used when the label dtype is object, bool or category; KFold otherwise.
        Both splitters shuffle with ``random_state``. Results of a previous ``fit`` call are discarded.

        Args:
            df (pd.DataFrame): DataFrame containing the features and the label column.
            label (str): Name of the target variable column.
            fit_kwargs (dict): Additional keyword arguments for the `fit` method of the estimator.
            predict_kwargs (dict): Additional keyword arguments for the `predict` and `predict_proba`
                methods of the estimator.
            metric_list (list of str): Metric names understood by `get_single_metric`.
            metric_opt_dir_list (list of str): One direction per metric: 'max' (final score is the
                smaller of mean and median), 'min' (final score is the larger of mean and median) or
                'compr' (per-fold results are summed element-wise and shown as a confusion matrix).
            metric_kwargs (dict): Keyword arguments per metric, keyed by metric name; metrics that
                are not listed receive no extra arguments.

        Returns:
            self: Returns an instance of self.

        Raises:
            ValueError: If the estimator, label, metric list or direction list is missing, if the
                two lists differ in length, if a direction is unknown, or if a probability-based
                metric is requested for an estimator whose class name does not contain "Classifier".
        """
        self._announce("Starting the fitting process.")

        if self.estimator is None:
            raise ValueError("An estimator class must be provided.")
        if not label:
            raise ValueError("Label column must be specified.")
        if not metric_list:
            raise ValueError("Metric list must not be empty.")
        if not metric_opt_dir_list:
            raise ValueError("Metric optimization direction list must not be empty.")
        # zip() below would silently drop the surplus entries of the longer list
        if len(metric_list) != len(metric_opt_dir_list):
            raise ValueError("metric_list and metric_opt_dir_list must have the same length.")
        # Reject bad directions before any (possibly expensive) training happens
        if any(direction not in self._OPT_DIRECTIONS for direction in metric_opt_dir_list):
            raise ValueError(
                "metric_opt_dir as the direction of the metric optimization can be 'min' for minimize, 'max' for maximize or 'compr' for comprehensive information"
            )

        # Probabilities are only computed for estimator classes named "...Classifier..."
        computes_proba = "Classifier" in self.estimator.__name__
        if not computes_proba and any(name in self._PROBA_METRICS for name in metric_list):
            raise ValueError("Probability-based metrics require an estimator with 'Classifier' in its class name.")

        fit_kwargs = {} if fit_kwargs is None else fit_kwargs
        predict_kwargs = {} if predict_kwargs is None else predict_kwargs

        # First, we prepare the features X and the label Y for the training
        self.label = label
        Y = df.loc[:, self.label]
        X = df.drop(columns=self.label)
        self.features = X.columns

        # Second, we store the fit/predict arguments and initialise the metric containers
        self.fit_kwargs = fit_kwargs
        self.predict_kwargs = predict_kwargs

        # Copy the caller's dict and give every requested metric an entry, so a partial
        # metric_kwargs does not fail with a KeyError during scoring
        self.metric_kwargs = dict(metric_kwargs or {})
        for metric_name in metric_list:
            self.metric_kwargs.setdefault(metric_name, {})

        self.metrics = {metric_name: [] for metric_name in metric_list}
        self.metrics_stats = {
            metric_name: {"mean": 0.0, "median": 0.0, "std": 0.0, "final": 0.0} for metric_name in metric_list
        }

        # Start from a clean slate so repeated fit() calls do not accumulate stale folds
        self.estimators = []
        self.train_preds = []

        # shuffle=True is required on both splitters: scikit-learn raises a ValueError when a
        # random_state is given without shuffling
        if Y.dtype == "O" or Y.dtype == bool or Y.dtype == "category":
            self.cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        else:
            self.cv = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)

        estimator_params = self.params or {}

        # Then, we start cross-validated training process
        for n_fold, (train_index, val_index) in enumerate(self.cv.split(X, Y)):
            self._announce(f"Starting training for fold {n_fold+1}")

            # Define X_train, Y_train, X_val, Y_val
            X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
            X_val, Y_val = X.iloc[val_index], Y.iloc[val_index]

            # Fit a fresh estimator and keep it for later voting
            estimator = self.estimator(**estimator_params)
            estimator.fit(X_train, Y_train, **fit_kwargs)
            self.estimators.append(estimator)

            # Make predictions; the probability column 1 is taken as the positive class
            y_pred = estimator.predict(X_val, **predict_kwargs)
            y_pred_proba = None
            if computes_proba:
                y_pred_proba = estimator.predict_proba(X_val, **predict_kwargs)[:, 1]
            # Indexed by positional row numbers, which match df.index only for a default RangeIndex
            self.train_preds.append(pd.Series(y_pred, index=val_index))

            # Get scores for each metric
            for metric_name in metric_list:
                metric = get_single_metric(metric_name)
                scored_values = y_pred_proba if metric_name in self._PROBA_METRICS else y_pred
                result = metric(Y_val, scored_values, **self.metric_kwargs[metric_name])
                self.metrics[metric_name].append(result)

            self._announce(f"Completed training for fold {n_fold+1}")

        # Finally, compute and print some stats for training
        for metric_name, metric_opt_dir in zip(metric_list, metric_opt_dir_list):
            fold_scores = self.metrics[metric_name]
            mean_score = np.mean(fold_scores)
            median_score = np.median(fold_scores)
            std_score = np.std(fold_scores)

            if metric_opt_dir == "max":
                # Conservative estimate: the smaller of mean and median
                final_score = np.min([mean_score, median_score])
            elif metric_opt_dir == "min":
                # Conservative estimate: the larger of mean and median
                final_score = np.max([mean_score, median_score])
            else:  # "compr": sum the per-fold results element-wise and display them
                final_score = np.sum(np.stack(fold_scores), axis=0)
                plot_confusion_matrix(final_score, class_labels=["False", "True"]).show()

            self.metrics_stats[metric_name]["mean"] = mean_score
            self.metrics_stats[metric_name]["median"] = median_score
            self.metrics_stats[metric_name]["std"] = std_score
            self.metrics_stats[metric_name]["final"] = final_score

            summary = f"The metric scores in all cv folds for {metric_name} are {fold_scores}. \n The final score is {final_score}, and the standard deviation is {std_score}"
            print("%" * 100)
            logging.info(summary)
            print(summary)
            print("%" * 100)

        return self

    def predict(self, df: pd.DataFrame, majority_threshold: int = None) -> tuple:
        """
        Predicts with every fold estimator and combines the results by hard and by soft voting.

        Args:
            df (pd.DataFrame): DataFrame containing the new data on which predictions are to be made.
            majority_threshold (int, optional): Minimum sum of the fold predictions for the hard vote
                to be 1. Defaults to a simple majority of the fold estimators.

        Returns:
            tuple: ``(hard_vote_predictions, soft_vote_predictions)``. The first is an int array that
            is 1 where the summed fold predictions reach the threshold; the second is the argmax
            over the fold-averaged `predict_proba` output.

        Raises:
            ValueError: If the model has not been trained before calling this method.
            AttributeError: If any fold estimator lacks a `predict_proba` method.
        """
        if not self.estimators:
            logging.error("Please first train the model using fit before making predictions")
            raise ValueError("Please first train the model using fit before making predictions")

        # Soft voting needs probabilities from every estimator; fail before doing any work
        if not all(hasattr(estimator, 'predict_proba') for estimator in self.estimators):
            raise AttributeError("All estimators must support the 'predict_proba' method for soft voting.")

        if majority_threshold is None:
            majority_threshold = len(self.estimators) // 2 + 1  # Simple majority

        # Hard voting: sum the per-fold predictions and compare against the threshold
        # NOTE(review): the sum assumes numeric 0/1 predictions — confirm for other label encodings
        predictions = [estimator.predict(df) for estimator in self.estimators]
        hard_vote_predictions = np.sum(predictions, axis=0) >= majority_threshold

        # Soft voting: average the per-fold class probabilities and take the most likely class
        prob_predictions = [estimator.predict_proba(df) for estimator in self.estimators]
        mean_prob_predictions = np.mean(prob_predictions, axis=0)
        soft_vote_predictions = np.argmax(mean_prob_predictions, axis=1)

        self._announce("Prediction completed for the test set")

        return hard_vote_predictions.astype("int"), soft_vote_predictions


def plot_confusion_matrix(cm, class_labels=None):
    """
    Build an annotated heatmap figure for a confusion matrix.

    :param cm: 2D sequence or array of counts (rows are labelled "Actual value",
        columns "Predicted value")
    :param class_labels: Labels used for both axes; defaults to ["0", "1"]
    :return: the plotly figure (it is not displayed here; call ``.show()`` on it)
    """
    # plotly.figure_factory is imported here because the notebook's import cells only
    # bring in plotly.express.
    import plotly.figure_factory as ff

    # Default labels to 0 and 1 if none are provided
    if class_labels is None:
        class_labels = ["0", "1"]

    # Every cell is annotated with its own value rendered as text
    annotations = [[str(cell) for cell in row] for row in cm]

    # Create the confusion matrix as a heatmap
    fig = ff.create_annotated_heatmap(
        cm, x=class_labels, y=class_labels, annotation_text=annotations, colorscale="Blues"
    )

    # Add title and axis labels
    fig.update_layout(title="Confusion Matrix", xaxis=dict(title="Predicted value"), yaxis=dict(title="Actual value"))

    # Reverse the y-axis to put the first label at the top
    fig["layout"]["yaxis"]["autorange"] = "reversed"

    return fig



In [9]:
# Baseline estimator keyword arguments (silent training, accuracy as evaluation metric).
# NOTE(review): base_params is not used in the visible cells — confirm it is still needed.
base_params = {"verbose": False, "eval_metric": "Accuracy"}

In [10]:
# MLflow model URI of the logged model to reuse; the run id and artifact path are hard-coded.
best_model_id = 'runs:/ace8ebc55f8b41709a69c7a7d46e8812/CatBoost_Mixed_best_trial_8'
# Load the logged model and keep only its `params` attribute, which is passed to cv_training below.
# NOTE(review): presumably the logged object is a cv_training instance — confirm.
best_params = mlflow.sklearn.load_model(best_model_id).params

In [11]:
# Wrap the CatBoostClassifier class (not an instance) with the loaded parameters; n_splits and
# random_state keep their defaults.
best_model = cv_training(estimator=CatBoostClassifier, params=best_params)

In [12]:
# Cross-validated fit on the feature-selected frame with "Transported" as the label. With direction
# "max", fit reports the smaller of the mean and median fold score as the final score of each metric.
best_model.fit(df_train_selected, label="Transported", metric_list=["accuracy", "roc_auc", "f1"], metric_opt_dir_list=["max", "max","max"])

Starting the fitting process.
Starting training for fold 1
Completed training for fold 1
Starting training for fold 2
Completed training for fold 2
Starting training for fold 3
Completed training for fold 3
Starting training for fold 4
Completed training for fold 4
Starting training for fold 5
Completed training for fold 5
Starting training for fold 6
Completed training for fold 6
Starting training for fold 7
Completed training for fold 7
Starting training for fold 8
Completed training for fold 8
Starting training for fold 9
Completed training for fold 9
Starting training for fold 10
Completed training for fold 10
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The metric scores in all cv folds for accuracy are [0.7979214780600462, 0.8106235565819861, 0.7956120092378753, 0.812933025404157, 0.8071593533487298, 0.8094688221709007, 0.8025404157043879, 0.8221709006928406, 0.792147806004619, 0.8057803468208092]. 
 The final score is 0.805

In [13]:
# Stitch the per-fold validation predictions stored by fit into one Series; each fold's Series is
# indexed by the positional row numbers of its validation rows, so sorting restores row order.
y_pred = pd.concat(best_model.train_preds).sort_index()

In [14]:
# Work on a copy so the feature-selected frame stays untouched.
df_train = df_train_selected.copy()
# Column assignment aligns on the index.
# NOTE(review): assumes both CSVs share the same row index/order — confirm.
df_train["Age"] = df_train_original["Age"]
# Out-of-fold predictions, aligned on the index as well.
df_train["Pred"] = y_pred

In [15]:
# Boolean row masks for the four confusion-matrix outcomes (positive class: Transported == 1)
TP = df_train["Pred"].eq(df_train["Transported"]) & df_train["Transported"].eq(1)
TN = df_train["Pred"].eq(df_train["Transported"]) & df_train["Transported"].eq(0)
FP = df_train["Pred"].eq(1) & df_train["Transported"].eq(0)
FN = df_train["Pred"].eq(0) & df_train["Transported"].eq(1)

In [18]:
# Mask of the rows whose "CryoSleep_False" value is 0.0
# (presumably the passengers in cryosleep — confirm the encoding of that column)
cryo = df_train["CryoSleep_False"].eq(0.0)

In [22]:
# Attach the out-of-fold predictions to the original frame as well; this mutates df_train_original
# in place and aligns on the index.
# NOTE(review): assumes its index matches the rows used for training — confirm.
df_train_original["Pred"] = y_pred

In [38]:
# True positives among passengers from Europa travelling in cryosleep
# (the TP mask was built on df_train and is combined here by index alignment)
df_train_original.loc[
    df_train_original["HomePlanet"].eq("Europa")
    & df_train_original["CryoSleep"].eq(True)
    & TP
]

Unnamed: 0,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_deck,ID_num,Group_size,HomePlanet,Cabin_side,Transported,Pred
9,True,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,B,1,3,Europa,P,1,1
10,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,B,2,3,Europa,P,1,1
28,True,TRAPPIST-1e,62.0,False,0.0,0.0,0.0,0.0,0.0,C,1,1,Europa,S,1,1
36,True,55 Cancri e,28.0,False,0.0,0.0,0.0,0.0,0.0,D,1,1,Europa,S,1,1
53,True,TRAPPIST-1e,38.0,False,0.0,0.0,0.0,0.0,0.0,A,2,3,Europa,S,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8602,True,55 Cancri e,43.0,False,0.0,0.0,0.0,0.0,0.0,B,1,1,Europa,S,1,1
8636,True,55 Cancri e,19.0,False,0.0,0.0,0.0,0.0,0.0,B,2,2,Europa,P,1,1
8643,True,55 Cancri e,30.0,False,0.0,0.0,0.0,0.0,0.0,E,1,1,Europa,S,1,1
8644,True,TRAPPIST-1e,21.0,False,0.0,0.0,0.0,0.0,0.0,E,1,2,Europa,S,1,1


In [67]:
# 3D scatter of the two consumption features against age for all rows, coloured by the target
fig = px.scatter_3d(
    df_train,
    x="Consumption_High_End",
    y="Consumption_Basic",
    z="Age",
    color="Transported",
    title="Simple Scatter Plot",
)

# 3D traces are drawn in `layout.scene`, so axis titles and ranges must be set on the scene axes;
# the top-level xaxis/yaxis settings only affect 2D cartesian plots.
fig.update_layout(
    scene=dict(
        xaxis=dict(title="Consumption_High_End", range=[0, 15000]),  # x-axis from 0 to 15000
        yaxis=dict(title="Consumption_Basic", range=[0, 15000]),  # y-axis from 0 to 15000
    ),
    legend_title="Transported",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    ),
    width=1200, height=1200
)
# Show the plot
fig.show()

In [69]:
# 3D scatter of the two consumption features against age for the false negatives only,
# coloured by home planet
fig = px.scatter_3d(
    df_train[FN],
    x="Consumption_High_End",
    y="Consumption_Basic",
    z="Age",
    color="HomePlanet",
    title="Simple Scatter Plot",
)

# 3D traces are drawn in `layout.scene`, so axis titles and ranges must be set on the scene axes;
# the top-level xaxis/yaxis settings only affect 2D cartesian plots.
fig.update_layout(
    scene=dict(
        xaxis=dict(title="Consumption_High_End", range=[0, 15000]),  # x-axis from 0 to 15000
        yaxis=dict(title="Consumption_Basic", range=[0, 15000]),  # y-axis from 0 to 15000
    ),
    legend_title="HomePlanet",  # matches the column used for colouring
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    ),
    width=1200, height=1200
)
# Show the plot
fig.show()