# Pipeline

In [36]:
! pip install feature_engine==1.5.2
! pip install xgboost==1.7.5
! pip install tabpy
! pip install tabpy-client





In [21]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for saving the pipeline
import joblib

# from Scikit-learn

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score

from feature_engine.encoding import OneHotEncoder, OrdinalEncoder
from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

from sklearn import preprocessing as pp
# set up the pipeline
# from feature_engine.transformation import YeoJohnsonTransformer

from sklearn.base import BaseEstimator, TransformerMixin

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.preprocessing import PowerTransformer

from sklearn.ensemble import RandomForestClassifier
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, SmartCorrelatedSelection

from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    balanced_accuracy_score,
    make_scorer, f1_score
)



from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    KFold,
)
### model evaluation
import xgboost as xgb



In [6]:
#load dataset
data = pd.read_csv('/Users/aprilm/data_science/data_tfm/nts_data.csv')
data.head()

Unnamed: 0,mode_main,distance,density,age,male,ethnicity,education,income,cars,license,bicycles,weekend,diversity,green,temp,precip,wind
0,walk,1.0,1.26259,84,no,native,lower,less20,0,yes,1,yes,1.24604,26.881233,0.1,0.1,3.0
1,walk,10.0,1.26259,84,no,native,lower,less20,0,yes,1,yes,1.24604,26.881233,0.1,0.1,3.0
2,car,3.0,1.76264,27,yes,western,middle,20to40,1,yes,2,yes,1.53959,36.045955,-3.4,0.05,1.8
3,car,3.0,1.76264,27,yes,western,middle,20to40,1,yes,2,yes,1.53959,36.045955,-3.4,0.05,1.8
4,car,61.5,1.76264,27,yes,western,middle,20to40,1,yes,2,yes,1.53959,36.045955,-3.4,0.05,1.8


### 1 Separación del dataset training y test

In [7]:

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('mode_main', axis=1),  # predictive variables
    data['mode_main'],  # target
    test_size=0.3,
    random_state=0,  # setting the seed to 0
   
)
X_train.shape, X_test.shape

((161425, 16), (69183, 16))

### 2. Target

In [8]:
# Let's enconde the target.
target_cat = {'car': 0, 'bike': 1, 'walk':2, 'pt':3 }

y_train = y_train.map(target_cat)
y_test = y_test.map(target_cat)

### 3. Pipeline - feature engineering 

In [9]:

# Numerical variables with  Yeo-Johson transformation
NUMERICALS_YEO_VARS = ['density', 'diversity', 'green', 'temp', 'wind']

# Numerical variables with log transformation
NUMERICALS_LOG_VARS = ["age", "distance"]

# categorical variables for encode
TWO_CAT = {'yes':1, 'no':0}
TWO_VAR = ['male', 'license', 'weekend']


# categorical for one hot encoding
ONE_HOT_ENCODE_VARS = ['education']

#variable mapping
INCOME_MAPPING = {'less20': 0, '20to40': 1, 'more40':2}


from sklearn.preprocessing import PowerTransformer

# Custom transformer class
class YeoJohnsonTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables
    
    def fit(self, X, y=None):
        # Initialize a PowerTransformer for each variable
        self.transformers_ = {}
        
        for var in self.variables:
            transformer = PowerTransformer(method="yeo-johnson", standardize=False)
            transformer.fit(X[[var]])
            self.transformers_[var] = transformer
        
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X = X.copy()
        
        for var in self.variables:
            var_tr = var + '_tr'
            X[var_tr] = self.transformers_[var].transform(X[[var]])
        
        return X


    
class LogTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, variables):
        self.variables = variables
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        for var in self.variables:
            X[var + '_tr'] = np.log(X[var])
        return X

### ====== CATEGORICAL TRANSFORMATIONS ======
# For Ethnicity
class EthnicityBinarizer(BaseEstimator, TransformerMixin):
    
    def __init__(self, variable):
        self.variable = variable
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        
        # Grouping less frequent categories
        X[self.variable + '_bin'] = X[self.variable].replace(['western', 'nonwestern'], 'western_nonwestern')
        
        # Mapping to binary values
        ethnicity_cat = {'native': 0, 'western_nonwestern': 1}
        X[self.variable + '_bin'] = X[self.variable + '_bin'].map(ethnicity_cat)
        
        # Dropping the original 'ethnicity' column
        X = X.drop(self.variable, axis=1)
        
        return X
    
# Transformer for 'license', 'weekend', and 'male' and 'income'
class Mapper(BaseEstimator, TransformerMixin):
    def __init__(self, variables, mappings, drop_original=True):
        self.variables = variables
        self.mappings = mappings
        self.drop_original = drop_original

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        for feature in self.variables:
            new_col = feature + "_bin" if feature in ["male", "license", "weekend"] else feature + "_ord"
            X[new_col] = X[feature].map(self.mappings)
            if self.drop_original:
                X = X.drop(feature, axis=1)
        return X
    
#========= BINARIZE PRECIPITATION ==========

# Modifying the BinarizeVariable transformer to retain the original 'precip' column
class BinarizeVariable(BaseEstimator, TransformerMixin):
    
    def __init__(self, variable, value_map=None, threshold=None):
        self.variable = variable
        self.value_map = value_map
        self.threshold = threshold
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        
        if self.value_map:
            X[self.variable + '_bin'] = X[self.variable].map(self.value_map)
        elif self.threshold is not None:
            X[self.variable + '_bin'] = np.where(X[self.variable] > self.threshold, 1, 0)
        
        return X


In [10]:
fe_pipe = Pipeline([
    
    # ==== VARIABLE TRANSFORMATION =====
    # Apply the column-specific transformations
    
    ('yeo_johnson', YeoJohnsonTransformer(variables=NUMERICALS_YEO_VARS)),
    ('tranformation_log', LogTransformer(variables=NUMERICALS_LOG_VARS)),
    
     # == CATEGORICAL ENCODING
    ('binarize_precip', BinarizeVariable(variable='precip', threshold=0)),
    
     # === mappers ===
    # Binarize specified variables
    ('cat_variable', Mapper(variables=TWO_VAR, mappings=TWO_CAT)),

     # Handle 'income' variable

    ('ordinal_encoder', Mapper(variables=['income'], mappings=INCOME_MAPPING)),

    # Handle 'ethnicity' variable
    ('ethnicity_binarizer', EthnicityBinarizer(variable='ethnicity')),
    
    # == CATEGORICAL ENCODING
    #one hot encoding
    ('one_hot_encode', OneHotEncoder(
        variables=ONE_HOT_ENCODE_VARS, drop_last=True)),
   
])


In [11]:
X_train_transformed = fe_pipe.fit_transform(X_train)
X_test_transformed = fe_pipe.transform(X_test)

In [12]:
print(X_train_transformed.shape, X_test_transformed.shape, y_train.shape, y_test.shape)

(161425, 25) (69183, 25) (161425,) (69183,)


In [13]:
X_train_transformed.columns

Index(['distance', 'density', 'age', 'cars', 'bicycles', 'diversity', 'green',
       'temp', 'precip', 'wind', 'density_tr', 'diversity_tr', 'green_tr',
       'temp_tr', 'wind_tr', 'age_tr', 'distance_tr', 'precip_bin', 'male_bin',
       'license_bin', 'weekend_bin', 'income_ord', 'ethnicity_bin',
       'education_higher', 'education_lower'],
      dtype='object')

### 4. Equilibro de dataset

In [14]:
# Instantiate SMOTEENN
smenn = SMOTEENN(sampling_strategy='auto',random_state=0,
        smote=SMOTE(sampling_strategy='auto', random_state=0, k_neighbors=5),
        enn=EditedNearestNeighbours(sampling_strategy='auto', n_neighbors=3, kind_sel='all'),n_jobs=4)

# Apply transformations
X_train_balanced, y_train_balanced = smenn.fit_resample(X_train_transformed, y_train)



In [15]:
X_train_balanced.columns

Index(['distance', 'density', 'age', 'cars', 'bicycles', 'diversity', 'green',
       'temp', 'precip', 'wind', 'density_tr', 'diversity_tr', 'green_tr',
       'temp_tr', 'wind_tr', 'age_tr', 'distance_tr', 'precip_bin', 'male_bin',
       'license_bin', 'weekend_bin', 'income_ord', 'ethnicity_bin',
       'education_higher', 'education_lower'],
      dtype='object')

### 5. Feature selection

In [16]:

# Define your random forest estimator
rf = RandomForestClassifier(
    n_estimators=10,
    random_state=0,
    n_jobs=4,
)

# Stack all the selection methods inside a pipeline
pipe_model_performance = Pipeline([
    ('constant', DropConstantFeatures(tol=0.998)),
    ('duplicated', DropDuplicateFeatures()),
    ('correlation_model_performance', SmartCorrelatedSelection(
        method='spearman',
        missing_values='raise',
        scoring='f1_macro',
        selection_method='model_performance',
        estimator=rf,
        cv=3,
    )),
])

# Fit the pipeline on the training data
pipe_model_performance.fit(X_train_balanced, y_train_balanced)

# Transform the test data using the already fitted pipeline
X_train_selected = pipe_model_performance.transform(X_train_balanced)
X_test_selected = pipe_model_performance.transform(X_test_transformed)


In [17]:
X_train_selected.columns

Index(['density', 'cars', 'bicycles', 'diversity', 'green', 'precip',
       'temp_tr', 'wind_tr', 'age_tr', 'distance_tr', 'precip_bin', 'male_bin',
       'license_bin', 'weekend_bin', 'income_ord', 'ethnicity_bin',
       'education_higher', 'education_lower'],
      dtype='object')

### 6 Entrenamiento del modelo

In [19]:
# # Define the model with the given hyperparameters
# xgb_model = xgb.XGBClassifier(
#     objective='multi:softprob',
#     colsample_bytree=0.9,
#     learning_rate=0.3,
#     max_depth=10,
#     n_estimators=1000,  # Setting to a large number, early stopping will decide the actual number
#     eval_metric='mlogloss',
#     subsample=1.0,
#     reg_lambda=1.5,
#     reg_alpha=0.005,
#     random_state=0
# )

# # Stratified K-Fold Cross-Validation
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

# # Scores list to store the F1 macro scores for each fold
# scores = []

# # Loop over the folds
# for train_idx, val_idx in skf.split(X_train_selected, y_train_balanced):
#     X_train_fold, X_val_fold = X_train_selected.iloc[train_idx], X_train_selected.iloc[val_idx]
#     y_train_fold, y_val_fold = y_train_balanced.iloc[train_idx], y_train_balanced.iloc[val_idx]

#     # Fit the model with early stopping
#     xgb_model.fit(
#         X_train_fold,
#         y_train_fold,
#         early_stopping_rounds=10,
#         eval_set=[(X_val_fold, y_val_fold)],
#         verbose=False
#     )

#     # Predict on the validation set
#     predictions = xgb_model.predict(X_val_fold)
#     score = f1_score(y_val_fold, predictions, average='macro')
#     scores.append(score)

# # Print results
# print(f"Validation F1 (macro) across folds: {[score * 100 for score in scores]}")
# print(f"Average Validation F1 (macro): {np.mean(scores) * 100:.2f}%")

# # Predict on the entire training data to get the training F1 score
# y_train_pred = xgb_model.predict(X_train_selected)
# train_f1 = f1_score(y_train_balanced, y_train_pred, average='macro')
# print(f"Training F1 (macro): {train_f1 * 100:.2f}%")

# # Predict on the test data and get the test F1 score
# y_test_pred = xgb_model.predict(X_test_selected)
# test_f1 = f1_score(y_test, y_test_pred, average='macro')
# print(f"Test F1 (macro): {test_f1 * 100:.2f}%")


In [22]:
# Define the model with the given hyperparameters
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    colsample_bytree=0.9,
    learning_rate=0.3,
    max_depth=10,
    n_estimators=1000,  # Setting to a large number, early stopping will decide the actual number
    eval_metric='mlogloss',
    subsample=1.0,
    reg_lambda=1.5,
    reg_alpha=0.005,
    random_state=0
)
    
#Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    
# Initialize lists to store precision and recall scores for each fold
precisions = []
recalls = []
scores = [] 

# Loop over the folds
for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_selected, y_train_balanced), start=1):
    X_train_fold, X_val_fold = X_train_selected.iloc[train_idx], X_train_selected.iloc[val_idx]
    y_train_fold, y_val_fold = y_train_balanced.iloc[train_idx], y_train_balanced.iloc[val_idx]
    
    # Fit the model with early stopping
    xgb_model.fit(
        X_train_fold,
        y_train_fold,
        early_stopping_rounds=10,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=False
    )
    
    # Predict on the validation set
    predictions = xgb_model.predict(X_val_fold)
    
    # Calculate Precision, Recall, and F1 Score for the validation set
    precision = precision_score(y_val_fold, predictions, average='macro')
    recall = recall_score(y_val_fold, predictions, average='macro')
    score = f1_score(y_val_fold, predictions, average='macro')
    
    # Append the scores to the respective lists
    scores.append(score)
    precisions.append(precision)
    recalls.append(recall)
    
    # Print results for the current fold
    print(f"Fold {fold_num}:")
    print(f"Precision: {precision * 100:.2f}%")
    print(f"Recall: {recall * 100:.2f}%")
    print(f"F1 Score: {score * 100:.2f}%")
    print()

# Calculate and print Precision, Recall, and F1 Score for the entire training set
y_train_pred = xgb_model.predict(X_train_selected)
train_precision = precision_score(y_train_balanced, y_train_pred, average='macro')
train_recall = recall_score(y_train_balanced, y_train_pred, average='macro')
train_f1 = f1_score(y_train_balanced, y_train_pred, average='macro')

print(f"Training Precision: {train_precision * 100:.2f}%")
print(f"Training Recall: {train_recall * 100:.2f}%")
print(f"Training F1: {train_f1 * 100:.2f}%")

# Calculate and print Precision, Recall, and F1 Score for the test set
y_test_pred = xgb_model.predict(X_test_selected)
test_precision = precision_score(y_test, y_test_pred, average='macro')
test_recall = recall_score(y_test, y_test_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print(f"Test Precision: {test_precision * 100:.2f}%")
print(f"Test Recall: {test_recall * 100:.2f}%")
print(f"Test F1: {test_f1 * 100:.2f}%")

# Print average results across folds
print(f"Average Validation Precision (macro): {np.mean(precisions) * 100:.2f}%")
print(f"Average Validation Recall (macro): {np.mean(recalls) * 100:.2f}%")
print(f"Average Validation F1 (macro): {np.mean(scores) * 100:.2f}%")    



Fold 1:
Precision: 93.65%
Recall: 93.54%
F1 Score: 93.59%





Fold 2:
Precision: 93.73%
Recall: 93.60%
F1 Score: 93.66%





Fold 3:
Precision: 93.66%
Recall: 93.54%
F1 Score: 93.60%





Fold 4:
Precision: 93.50%
Recall: 93.41%
F1 Score: 93.45%





Fold 5:
Precision: 93.65%
Recall: 93.55%
F1 Score: 93.60%

Training Precision: 98.70%
Training Recall: 98.67%
Training F1: 98.68%
Test Precision: 74.36%
Test Recall: 67.76%
Test F1: 70.32%
Average Validation Precision (macro): 93.64%
Average Validation Recall (macro): 93.53%
Average Validation F1 (macro): 93.58%
