# Setup
In this notebook section, we will import the libraries needed to run this code.

In [1]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

# Constant Variables
In a Jupyter Notebook, creating constant variables can be important for several reasons:

* **Readability and Maintainability**: Using constant variables with meaningful names can improve the readability of your code. It makes it easier for others (or even yourself in the future) to understand the purpose of the values being used throughout the notebook.

* **Code Consistency**: By defining constants, you ensure that specific values are consistently used across the notebook. If you need to change the value later, you only have to modify it in one place, reducing the risk of errors due to inconsistent values.

* **Preventing Magic Numbers**: Magic numbers are hardcoded numeric values scattered throughout the code without any explanation or context. Using constants instead of magic numbers makes the code self-documenting and provides context for the values used.

* **Flexibility**: If you need to change a value that is used in multiple places, having it defined as a constant allows you to change it once, and the change will automatically apply throughout the notebook.

* **Easy Debugging**: When debugging the code, having constants allows you to quickly check the values being used in different parts of the notebook without having to search for where they are defined.

* **Unit Testing**: If you plan to write unit tests for your code, using constants can make it easier to define test cases and assert expected results.

In [3]:
# --- Data retrieval ---------------------------------------------------------
DATASETS_DIR = 'datasets/'
URL = 'https://www.openml.org/data/get_csv/16826755/phpMYEkMl'
# Columns removed from the raw download before it is cached on disk.
# Previous value (kept 'cabin'): ['boat','body','home.dest','ticket','name']
DROP_COLS = ['boat','body','home.dest','ticket','name', 'cabin']
RETRIEVED_DATA = 'raw-data.csv'


# --- Train/test split -------------------------------------------------------
SEED_SPLIT = 404
TRAIN_DATA_FILE = f'{DATASETS_DIR}train.csv'
TEST_DATA_FILE = f'{DATASETS_DIR}test.csv'


# --- Modelling columns ------------------------------------------------------
TARGET = 'survived'
# Previous value: ['pclass','sex','age','sibsp','parch','fare','cabin','embarked','title']
FEATURES = ['pclass','age','sibsp','parch','fare','embarked','title']
NUMERICAL_VARS = ['pclass','age','sibsp','parch','fare']
# Previous value: ['sex','cabin','embarked','title']
CATEGORICAL_VARS = ['embarked','title']


# --- Missing-value handling -------------------------------------------------
NUMERICAL_VARS_WITH_NA = ['age','fare']
# Only columns that survive DROP_COLS may be listed here. 'cabin' is dropped
# before the data is cached, so naming it would make CategoricalImputer raise
# a KeyError when it looks the column up.
CATEGORICAL_VARS_WITH_NA = ['embarked']
NUMERICAL_NA_NOT_ALLOWED = [var for var in NUMERICAL_VARS if var not in NUMERICAL_VARS_WITH_NA]
CATEGORICAL_NA_NOT_ALLOWED = [var for var in CATEGORICAL_VARS if var not in CATEGORICAL_VARS_WITH_NA]


# --- Model ------------------------------------------------------------------
SEED_MODEL = 404

# Functions
Writing functions helps us in several ways, for example:
* **Modularity**: Functions allow you to break down complex problems into smaller, manageable pieces. Each function can handle a specific task, making the code easier to understand, test, and maintain. This concept is known as "modularity."

* **Reusability**: Once you define a function, you can use it multiple times throughout your code or even in other projects. This promotes code reuse and saves time since you don't have to rewrite the same logic each time you need it.

In [5]:
def data_retrieval(url):
    """
    Download the raw passenger table, tidy it and cache it as a CSV file.

    The function reads the CSV found at ``url``, converts the '?' placeholder
    into NaN, casts 'age' and 'fare' to float, keeps only the first token of
    'cabin', derives a 'title' column from 'name', removes the columns named in
    DROP_COLS and finally writes the result to DATASETS_DIR + RETRIEVED_DATA
    (creating DATASETS_DIR when it does not exist yet).

    Parameters:
        url (str): Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        None. Progress is reported through ``print``.
    """

    def _first_cabin(cabin_value):
        # Entries without a usable first token (non-strings such as NaN, or
        # blank strings) raise inside the try and are mapped to NaN.
        try:
            return cabin_value.split()[0]
        except Exception:
            return np.nan

    def _title_from_name(passenger):
        # Order matters: 'Mrs' has to be probed before 'Mr', which it contains.
        for candidate in ('Mrs', 'Mr', 'Miss', 'Master'):
            if re.search(candidate, passenger):
                return candidate
        return 'Other'

    # Load the raw table and expose the missing values hidden behind '?'
    data = pd.read_csv(url)
    data = data.replace('?', np.nan)
    data = data.astype({'age': 'float', 'fare': 'float'})

    # Keep a single cabin per passenger and extract the title from 'name'
    data['cabin'] = data['cabin'].apply(_first_cabin)
    data['title'] = data['name'].apply(_title_from_name)

    # Dropping irrelevant columns
    data = data.drop(columns=DROP_COLS)

    # Make sure the target directory exists, reporting which case applied
    if os.path.exists(DATASETS_DIR):
        print(f"Directory '{DATASETS_DIR}' already exists.")
    else:
        os.makedirs(DATASETS_DIR)
        print(f"Directory '{DATASETS_DIR}' created successfully.")

    # Persist the cleaned table
    output_path = DATASETS_DIR + RETRIEVED_DATA
    data.to_csv(output_path, index=False)

    print('Data stored in {}'.format(output_path))
    return None

# Fetch the dataset from URL and cache the cleaned copy under DATASETS_DIR.
# Note: this hits the network and rewrites the cached CSV every time the cell runs.
data_retrieval(URL)

Directory 'datasets/' created successfully.
Data stored in datasets/raw-data.csv


# Custom Transformers
Custom transformers are essential for high-quality code that can be maintained, changed, and reused by other pieces of code.

The following code is the migration from [3-create-convenient-classes.ipynb](../session-7/3-create-convenient-classes.ipynb) notebook.

In [4]:
# Load the cached dataset and create the train/test partitions.
df = pd.read_csv(DATASETS_DIR + RETRIEVED_DATA)

# Predictors are restricted to FEATURES: the cached CSV also carries columns
# (e.g. the raw text 'sex' column) that no transformer in this notebook encodes.
# The split is seeded with SEED_SPLIT instead of a repeated literal.
X_train, X_test, y_train, y_test = train_test_split(
    df[FEATURES],
    df[TARGET],
    test_size=0.2,
    random_state=SEED_SPLIT
)

In [6]:
class MissingIndicator(BaseEstimator, TransformerMixin):
    """
    Scikit-learn compatible transformer that flags missing values.

    For every configured column a companion column named ``<column>_nan`` is
    appended, holding 1 where the source value is null and 0 otherwise.

    Parameters:
        variables (list or str, optional): Column name(s) to flag. Any value that is
            not a list is wrapped in a one-element list. Default is None.

    Attributes:
        variables (list): Column names for which indicator columns are created.

    Methods:
        fit(X, y=None):
            Stateless; returns the transformer unchanged.

        transform(X):
            Returns a copy of X extended with the ``<column>_nan`` indicator columns.

    Example usage:
    ```
    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([
        ('missing_indicator', MissingIndicator(variables=['age', 'income'])),
        # Other pipeline steps...
    ])

    X_transformed = pipeline.fit_transform(X)
    ```
    """
    def __init__(self, variables=None):
        """
        Store the columns to flag.

        Parameters:
            variables (list or str, optional): Column name(s) to flag. Any value that is
                not a list is wrapped in a one-element list. Default is None.
        """
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; present only to satisfy the estimator API.

        Parameters:
            X (pd.DataFrame): Training data. Ignored.
            y (pd.Series or np.array, optional): Target variable. Ignored.

        Returns:
            self (MissingIndicator): The transformer instance.
        """
        return self

    def transform(self, X):
        """
        Append one 0/1 indicator column per configured variable.

        Parameters:
            X (pd.DataFrame): Data containing every column listed in ``variables``.

        Returns:
            X_transformed (pd.DataFrame): Copy of X with the extra ``<column>_nan`` columns.
        """
        flagged = X.copy()  # never touch the caller's frame
        for column in self.variables:
            is_missing = flagged[column].isnull()
            flagged[f'{column}_nan'] = is_missing.astype(int)

        return flagged
    
# create_missing_flag = MissingIndicator(variables=NUMERICAL_VARS)
# X_train = create_missing_flag.transform(X_train)
# X_test = create_missing_flag.transform(X_test)

In [6]:
class ExtractLetters(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn transformer that keeps only the ASCII letters of a text column.

    Parameters:
        variable (str, optional): Name of the column to clean. Default is 'cabin',
            which matches the previously hard-coded behaviour.

    Attributes:
        variable (str): Name of the column from which letters are extracted.

    Methods:
        fit(X, y=None):
            Stateless; returns the transformer unchanged.

        transform(X):
            Returns a copy of X in which every string of ``variable`` is replaced by
            the concatenation of its letters; non-string entries are left as they are.

    Example usage:
    ```
    from sklearn.pipeline import Pipeline

    # Instantiate the custom transformer (defaults to the 'cabin' column)
    extractor = ExtractLetters()

    # Define the pipeline with the custom transformer
    pipeline = Pipeline([
        ('extractor', extractor),
        # Other pipeline steps...
    ])

    # Fit and transform the data using the pipeline
    X_transformed = pipeline.fit_transform(X)
    ```
    """
    def __init__(self, variable='cabin'):
        """
        Initialize the ExtractLetters transformer.

        Parameters:
            variable (str, optional): Name of the column to clean. Default is 'cabin'.
        """
        # Exposed as a constructor argument so that the column is configurable and
        # visible through get_params()/set_params().
        self.variable = variable

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; present only to satisfy the estimator API.

        Parameters:
            X (pd.DataFrame): Training data. Ignored.
            y (pd.Series or np.array, optional): Target variable. Ignored.

        Returns:
            self (ExtractLetters): The transformer instance.
        """
        return self

    def transform(self, X):
        """
        Strip every non-letter character from the configured column.

        Parameters:
            X (pd.DataFrame): Data containing the column named by ``variable``.

        Returns:
            X_transformed (pd.DataFrame): Copy of X with the cleaned column.

        Raises:
            KeyError: If ``variable`` is not a column of X.
        """
        X = X.copy()
        # Non-string entries (e.g. NaN) are passed through untouched so that a later
        # imputation step can still detect them.
        X[self.variable] = X[self.variable].apply(
            lambda value: ''.join(re.findall("[a-zA-Z]+", value)) if isinstance(value, str) else value
        )
        return X

In [7]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn compatible transformer that fills gaps in categorical columns.

    Every null entry of the configured columns is replaced by the literal
    string 'Missing'.

    Parameters:
        variables (list or str, optional): Column name(s) to impute. Any value that is
            not a list is wrapped in a one-element list. Default is None.

    Attributes:
        variables (list): Column names whose missing values are replaced.

    Methods:
        fit(X, y=None):
            Stateless; returns the transformer unchanged.

        transform(X):
            Returns a copy of X with the nulls of the configured columns set to 'Missing'.

    Example usage:
    ```
    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([
        ('imputer', CategoricalImputer(variables=['category1', 'category2'])),
        # Other pipeline steps...
    ])

    X_transformed = pipeline.fit_transform(X)
    ```
    """
    def __init__(self, variables=None):
        """
        Store the columns to impute.

        Parameters:
            variables (list or str, optional): Column name(s) to impute. Any value that is
                not a list is wrapped in a one-element list. Default is None.
        """
        if isinstance(variables, list):
            self.variables = variables
        else:
            self.variables = [variables]

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; present only to satisfy the estimator API.

        Parameters:
            X (pd.DataFrame): Training data. Ignored.
            y (pd.Series or np.array, optional): Target variable. Ignored.

        Returns:
            self (CategoricalImputer): The transformer instance.
        """
        return self

    def transform(self, X):
        """
        Replace nulls in the configured columns with the string 'Missing'.

        Parameters:
            X (pd.DataFrame): Data containing every column listed in ``variables``.

        Returns:
            X_transformed (pd.DataFrame): Copy of X with the configured columns imputed.
        """
        imputed = X.copy()  # never touch the caller's frame
        for column in self.variables:
            imputed[column] = imputed[column].fillna('Missing')
        return imputed

In [8]:
class NumericalImputer(BaseEstimator, TransformerMixin):
    """
    Scikit-learn compatible transformer that fills gaps in numerical columns.

    The median of each configured column is learned in ``fit`` and used in
    ``transform`` to replace the null entries of that column.

    Parameters:
        variables (list or str, optional): Column name(s) to impute. Any value that is
            not a list is wrapped in a one-element list. Default is None.

    Attributes:
        variables (list): Column names whose missing values are replaced.
        median_dict_ (dict): Maps each configured column to the median computed in ``fit``.

    Methods:
        fit(X, y=None):
            Learns the per-column medians and returns the transformer.

        transform(X):
            Returns a copy of X with nulls replaced by the learned medians.

    Example usage:
    ```
    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([
        ('imputer', NumericalImputer(variables=['age', 'income'])),
        # Other pipeline steps...
    ])

    X_transformed = pipeline.fit_transform(X)
    ```
    """
    def __init__(self, variables=None):
        """
        Store the columns to impute.

        Parameters:
            variables (list or str, optional): Column name(s) to impute. Any value that is
                not a list is wrapped in a one-element list. Default is None.
        """
        if isinstance(variables, list):
            self.variables = variables
        else:
            self.variables = [variables]

    def fit(self, X, y=None):
        """
        Learn the median of every configured column.

        Parameters:
            X (pd.DataFrame): Training data containing every column listed in ``variables``.
            y (pd.Series or np.array, optional): Target variable. Ignored.

        Returns:
            self (NumericalImputer): The fitted transformer.
        """
        self.median_dict_ = {}
        for column in self.variables:
            self.median_dict_[column] = X[column].median()
        return self


    def transform(self, X):
        """
        Replace nulls in the configured columns with the medians learned in ``fit``.

        Parameters:
            X (pd.DataFrame): Data containing every column listed in ``variables``.

        Returns:
            X_transformed (pd.DataFrame): Copy of X with the configured columns imputed.
        """
        imputed = X.copy()  # never touch the caller's frame
        for column in self.variables:
            learned_median = self.median_dict_[column]
            imputed[column] = imputed[column].fillna(learned_median)
        return imputed

In [9]:
class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
    """
    Scikit-learn compatible transformer that groups infrequent categories.

    ``fit`` records, per configured column, the labels whose share of the rows
    is below ``tol``; ``transform`` rewrites those labels as 'rare'.

    Parameters:
        tol (float, optional): Frequency threshold. Labels whose relative frequency
            is strictly lower than ``tol`` are treated as rare. Default is 0.05.
        variables (list or str, optional): Column name(s) to encode. Any value that is
            not a list is wrapped in a one-element list. Default is None.

    Attributes:
        tol (float): Frequency threshold.
        variables (list): Column names to encode.
        rare_labels_dict (dict): Maps each configured column to the list of labels
            found to be rare in ``fit``.

    Methods:
        fit(X, y=None):
            Learns the rare labels of every configured column and returns the transformer.

        transform(X):
            Returns a copy of X in which the learned rare labels are replaced by 'rare'.

    Example usage:
    ```
    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([
        ('encoder', RareLabelCategoricalEncoder(tol=0.1, variables=['category1', 'category2'])),
        # Other pipeline steps...
    ])

    X_transformed = pipeline.fit_transform(X)
    ```
    """
    def __init__(self, tol=0.05, variables=None):
        """
        Store the threshold and the columns to encode.

        Parameters:
            tol (float, optional): Labels with a relative frequency strictly lower than
                this value are encoded as 'rare'. Default is 0.05.
            variables (list or str, optional): Column name(s) to encode. Any value that is
                not a list is wrapped in a one-element list. Default is None.
        """
        self.tol = tol
        if isinstance(variables, list):
            self.variables = variables
        else:
            self.variables = [variables]

    def fit(self, X, y=None):
        """
        Learn which labels are rare in each configured column.

        Parameters:
            X (pd.DataFrame): Training data containing every column listed in ``variables``.
            y (pd.Series or np.array, optional): Target variable. Ignored.

        Returns:
            self (RareLabelCategoricalEncoder): The fitted transformer.
        """
        self.rare_labels_dict = {}
        for column in self.variables:
            # Counts are divided by the total number of rows of X.
            frequencies = X[column].value_counts() / float(X.shape[0])
            below_threshold = frequencies[frequencies < self.tol]
            self.rare_labels_dict[column] = list(below_threshold.index)
        return self

    def transform(self, X):
        """
        Replace the labels learned as rare by the string 'rare'.

        Parameters:
            X (pd.DataFrame): Data containing every column listed in ``variables``.

        Returns:
            X_transformed (pd.DataFrame): Copy of X with rare labels grouped.
        """
        encoded = X.copy()  # never touch the caller's frame
        for column in self.variables:
            is_rare = encoded[column].isin(self.rare_labels_dict[column])
            encoded[column] = np.where(is_rare, 'rare', encoded[column])
        return encoded

In [10]:
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn transformer to perform one-hot encoding for categorical variables.

    ``fit`` learns the dummy column names produced by ``pd.get_dummies(..., drop_first=True)``
    on the training data. ``transform`` then always emits exactly those columns, in that
    order, and removes the original categorical columns.

    Parameters:
        variables (list or str, optional): List of column names (variables) to perform one-hot encoding for.
            If a single string is provided, it will be treated as a single variable. Default is None.

    Attributes:
        variables (list): List of column names (variables) to perform one-hot encoding for.
        dummies (pd.Index): Names of the dummy columns learned in ``fit``.

    Methods:
        fit(X, y=None):
            Learns the dummy column names from the training data and returns the transformer.

        transform(X):
            Replaces the configured categorical columns by the learned dummy columns.

    Example usage:
    ```
    from sklearn.pipeline import Pipeline

    # Instantiate the custom transformer
    encoder = OneHotEncoder(variables=['category1', 'category2'])

    # Define the pipeline with the custom transformer
    pipeline = Pipeline([
        ('encoder', encoder),
        # Other pipeline steps...
    ])

    # Fit and transform the data using the pipeline
    X_transformed = pipeline.fit_transform(X)
    ```
    """
    def __init__(self, variables=None):
        """
        Initialize the OneHotEncoder transformer.

        Parameters:
            variables (list or str, optional): List of column names (variables) to perform one-hot encoding for.
                If a single string is provided, it will be treated as a single variable. Default is None.
        """
        self.variables = [variables] if not isinstance(variables, list) else variables

    def fit(self, X, y=None):
        """
        Learn the dummy columns (first category of each variable dropped) from the training data.

        Parameters:
            X (pd.DataFrame): Training data containing every column listed in ``variables``.
            y (pd.Series or np.array, optional): Target variable. Ignored.

        Returns:
            self (OneHotEncoder): The fitted transformer.
        """
        self.dummies = pd.get_dummies(X[self.variables], drop_first=True).columns
        return self

    def transform(self, X):
        """
        Replace the configured categorical columns by the dummy columns learned in ``fit``.

        Parameters:
            X (pd.DataFrame): Data containing every column listed in ``variables``.

        Returns:
            X_transformed (pd.DataFrame): Copy of X without the original categorical columns
                and with one column per learned dummy. Dummies whose category does not occur
                in X are filled with 0; categories not seen in ``fit`` are discarded.
        """
        X = X.copy()

        # Encode every category present in X and then align to the columns learned in
        # fit(). Using drop_first here would drop whichever category happens to sort
        # first in *this* data, which can differ from the baseline dropped in training.
        encoded = pd.get_dummies(X[self.variables]).reindex(columns=self.dummies, fill_value=0)

        # The original code discarded the result of drop(), leaving the raw string
        # columns in the output; they must be removed before scaling/modelling.
        X = pd.concat([X.drop(self.variables, axis=1), encoded], axis=1)

        return X

In [11]:
from sklearn.utils.validation import check_array

class OrderingFeatures(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn transformer to order features (columns) in the same order as they appeared in the training data.

    Parameters:
        None

    Attributes:
        ordered_features (pd.Index or np.ndarray): Column labels (DataFrame input) or
            positional column indices (numpy input) recorded in ``fit``.

    Methods:
        fit(X, y=None):
            Records the order of features from the training data and returns the transformer instance itself.

        transform(X):
            Selects the recorded features, in the recorded order, and returns the result.

    Example usage:
    ```
    from sklearn.pipeline import Pipeline

    # Instantiate the custom transformer
    feature_orderer = OrderingFeatures()

    # Define the pipeline with the custom transformer
    pipeline = Pipeline([
        ('feature_orderer', feature_orderer),
        # Other pipeline steps...
    ])

    # Fit and transform the data using the pipeline
    X_transformed = pipeline.fit_transform(X)
    ```
    """
    def __init__(self):
        """
        Initialize the OrderingFeatures transformer.

        Parameters:
            None
        """
        pass

    def fit(self, X, y=None):
        """
        Records the order of features from the training data.

        Parameters:
            X (pd.DataFrame or np.ndarray): Training data.
            y (pd.Series or np.array, optional): Target variable. Ignored.

        Returns:
            self (OrderingFeatures): The transformer instance.

        Raises:
            ValueError: If X is neither a DataFrame nor a 2-D numpy array.
        """
        # The type must be inspected on the raw input: converting it to an array
        # first would discard the DataFrame's column labels.
        if isinstance(X, pd.DataFrame):
            self.ordered_features = X.columns
        elif isinstance(X, np.ndarray):
            if X.ndim != 2:
                raise ValueError("Input X must be a 2-dimensional array.")
            self.ordered_features = np.arange(X.shape[1])
        else:
            raise ValueError("Input X must be a pandas DataFrame or a numpy array.")
        return self

    def transform(self, X):
        """
        Reorders the features in the same order as they appeared in the training data.

        Parameters:
            X (pd.DataFrame or np.ndarray): Input data to be transformed; it should be of
                the same kind (DataFrame or array) as the data passed to ``fit``.

        Returns:
            X_transformed (pd.DataFrame or np.ndarray): Data restricted to the recorded
                features, in the recorded order.

        Raises:
            ValueError: If X is neither a DataFrame nor a numpy array.
        """
        # No printing here: transform() runs on every fit/predict call and would
        # otherwise dump the whole dataset each time.
        if isinstance(X, pd.DataFrame):
            return X[self.ordered_features]
        elif isinstance(X, np.ndarray):
            return X[:, self.ordered_features]
        else:
            raise ValueError("Input X must be a pandas DataFrame or a numpy array.")

# Pipeline
The code below builds a scikit-learn pipeline called `titanic_pipeline`, which handles data preprocessing and modeling for the Titanic classification task. Each step in the pipeline corresponds to a specific data transformation or modeling step.

* **`MissingIndicator`**: This is a custom transformer that creates indicator features for missing values in numerical variables. It takes the NUMERICAL_VARS as input, which represents a list of numerical column names in the dataset.

* **`ExtractLetters`**: This is a custom transformer that extracts letters from the 'cabin' variable. It aims to process the 'cabin' variable and retrieve only the alphabetical characters, discarding any numeric or special characters.

* **`CategoricalImputer`**: This is a custom transformer that imputes missing values in categorical variables. It takes the CATEGORICAL_VARS_WITH_NA as input, which represents a list of categorical column names that may contain missing values. It fills in the missing values with the string 'Missing'.

* **`NumericalImputer`**: This is a custom transformer that imputes missing values in numerical variables. It takes the NUMERICAL_VARS_WITH_NA as input, which represents a list of numerical column names that may contain missing values. It fills in the missing values with the median value of each respective variable.

* **`RareLabelCategoricalEncoder`**: This is a custom transformer that encodes rare categories in categorical variables. It takes the CATEGORICAL_VARS as input, which represents a list of categorical column names to encode rare categories for. It identifies categories with a frequency lower than 5% (tolerance of 0.05) and encodes them as 'rare'.

* **`OneHotEncoder`**: This is a custom transformer that performs one-hot encoding for categorical variables. It takes the CATEGORICAL_VARS as input, which represents a list of categorical column names to be one-hot encoded. It creates binary dummy variables for each category.

* **`OrderingFeatures`**: This is a custom transformer that orders the features (columns) in the same order as they appeared in the training data. It ensures that the order of columns in the transformed dataset is consistent with the order in which the pipeline was trained.

* **`MinMaxScaler`**: This step scales the numerical features to a specified range, typically between 0 and 1, using the Min-Max scaling technique.

* **`LogisticRegression`**: This is the final modeling step in the pipeline. It fits a logistic regression model to the preprocessed dataset. The model is specified with hyperparameters C=0.0005, class_weight='balanced', and random_state=SEED_MODEL. The C parameter is the regularization strength, 'balanced' sets the class weights to be inversely proportional to the class frequencies to handle class imbalance, and random_state is used for reproducibility.

In [12]:
# Reload the cached dataset so this section can be run on its own.
df = pd.read_csv(DATASETS_DIR + RETRIEVED_DATA)

# Predictors are restricted to FEATURES: the cached CSV also carries columns
# (e.g. the raw text 'sex' column) that no pipeline step encodes and that would
# otherwise reach the scaler as strings. The split is seeded with SEED_SPLIT
# instead of a repeated literal.
X_train, X_test, y_train, y_test = train_test_split(
    df[FEATURES],
    df[TARGET],
    test_size=0.2,
    random_state=SEED_SPLIT
)

In [None]:
from sklearn.compose import ColumnTransformer

# Define the debug_print function to print DataFrame or array
def debug_print(X):
    """
    Print a short preview of an intermediate pipeline output.

    Parameters:
        X (pd.DataFrame or np.ndarray): Object to preview. A DataFrame is shown
            through ``head()``; an array through its first five rows. Any other
            type is silently ignored.

    Returns:
        None
    """
    if isinstance(X, pd.DataFrame):
        preview = X.head()
    elif isinstance(X, np.ndarray):
        preview = X[:5]
    else:
        return  # nothing to show for unsupported types
    print(preview)

# ColumnTransformer hands the categorical branch only the CATEGORICAL_VARS columns,
# so the imputer must not be asked for any other name (e.g. a column removed via
# DROP_COLS) or it raises a KeyError.
categorical_vars_to_impute = [var for var in CATEGORICAL_VARS_WITH_NA if var in CATEGORICAL_VARS]

# Define the preprocessor for categorical variables
categorical_preprocessor = Pipeline([
    ('categorical_imputer', CategoricalImputer(variables=categorical_vars_to_impute)),
    ('rare_labels', RareLabelCategoricalEncoder(tol=0.05, variables=CATEGORICAL_VARS)),
    ('dummy_vars', OneHotEncoder(variables=CATEGORICAL_VARS))
])

# Define the preprocessor for numerical variables
numerical_preprocessor = Pipeline([
    ('missing_indicator', MissingIndicator(variables=NUMERICAL_VARS)),
    ('median_imputation', NumericalImputer(variables=NUMERICAL_VARS_WITH_NA)),
    ('scaling', MinMaxScaler())
])

# Use ColumnTransformer to apply the different preprocessors to their respective columns.
# Columns of X that are named in neither list are not forwarded to any branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_preprocessor, CATEGORICAL_VARS),
        ('numerical', numerical_preprocessor, NUMERICAL_VARS)
    ]
)

# Combine the preprocessor with the logistic regression model in the final pipeline
titanic_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('aligning_feats', OrderingFeatures()),
    ('log_reg', LogisticRegression(C=0.0005, class_weight='balanced', random_state=SEED_MODEL))
])

# Fit the steps one at a time so that each intermediate output can be inspected.
# The step objects are the ones held by titanic_pipeline, so they are fitted in place.
X_train_transformed = titanic_pipeline['preprocessor'].fit_transform(X_train)
debug_print(X_train_transformed)

X_train_transformed = titanic_pipeline['aligning_feats'].fit_transform(X_train_transformed)
debug_print(X_train_transformed)

# Fit the model
titanic_pipeline['log_reg'].fit(X_train_transformed, y_train)

In [None]:
# Steps that target 'cabin' are only valid while that column is still part of the
# data; columns named in DROP_COLS are removed before the CSV is cached, and asking
# a transformer for a missing column raises a KeyError.
cabin_is_available = 'cabin' not in DROP_COLS
imputable_categorical_vars = [var for var in CATEGORICAL_VARS_WITH_NA if var not in DROP_COLS]

# NOTE(review): every column that reaches 'scaling' must be numeric. A raw text
# column left in X that is not listed in CATEGORICAL_VARS would make MinMaxScaler
# fail - confirm that X only holds the FEATURES columns.
pipeline_steps = [
    ('missing_indicator', MissingIndicator(variables=NUMERICAL_VARS)),
]
if cabin_is_available:
    pipeline_steps.append(('cabin_only_letter', ExtractLetters()))
pipeline_steps.extend([
    ('categorical_imputer', CategoricalImputer(variables=imputable_categorical_vars)),
    ('median_imputation', NumericalImputer(variables=NUMERICAL_VARS_WITH_NA)),
    ('rare_labels', RareLabelCategoricalEncoder(tol=0.05, variables=CATEGORICAL_VARS)),
    ('dummy_vars', OneHotEncoder(variables=CATEGORICAL_VARS)),
    ('aligning_feats', OrderingFeatures()),
    ('scaling', MinMaxScaler()),
    ('log_reg', LogisticRegression(C=0.0005, class_weight='balanced', random_state=SEED_MODEL)),
])

# This rebinds titanic_pipeline to brand-new, unfitted estimators.
titanic_pipeline = Pipeline(pipeline_steps)

In [None]:
# df = pd.read_csv(DATASETS_DIR + RETRIEVED_DATA)

# X_train, X_test, y_train, y_test = train_test_split(
#                                                         df.drop(TARGET, axis=1),
#                                                         df[TARGET],
#                                                         test_size=0.2,
#                                                         random_state=404
#                                                    )

In [None]:
# X_train


Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,title
1162,3,male,,0,0,7.7500,Q,Mr
899,3,female,27.0,0,2,11.1333,S,Mrs
1006,3,female,,0,0,7.8792,Q,Miss
228,1,male,18.0,1,0,108.9000,C,Mr
573,2,female,27.0,0,0,10.5000,S,Miss
...,...,...,...,...,...,...,...,...
71,1,male,27.0,1,0,136.7792,C,Mr
609,3,male,26.0,0,0,8.0500,S,Mr
625,3,female,17.0,4,2,7.9250,S,Miss
1012,3,female,,0,0,7.7500,Q,Miss


In [None]:
# Fit every preprocessing step and the classifier on the training split; the
# evaluation cell below calls predict()/predict_proba() and needs a fitted pipeline.
titanic_pipeline.fit(X_train, y_train)

In [None]:
# Score the pipeline on the held-out split. Hard labels feed the accuracy; column 1
# of predict_proba (presumably the probability of survived == 1) feeds the ROC-AUC.
class_pred = titanic_pipeline.predict(X_test)
proba_pred = titanic_pipeline.predict_proba(X_test)[:,1]

test_roc_auc = roc_auc_score(y_test, proba_pred)
print('test roc-auc : {}'.format(test_roc_auc))

test_accuracy = accuracy_score(y_test, class_pred)
print('test accuracy: {}'.format(test_accuracy))
print()