# Full pipeline

Pipeline includes preprocess the data, feature extraction, train model, evaluate model and predict


## Preprocessing steps


### Merge all datasets


In [1]:
from pandas import DataFrame
from src.main.preprocess_data import merge_data

merged_data: DataFrame = merge_data(
    "data/raw/attribution_path_data.feather", "data/raw/user_feature_data.feather"
)
merged_data.head(5)

Unnamed: 0,path_id,device_type,attribution_channel,distance_to_last,time_delta_in_days,has_booked,n_sessions,most_common_landing_page,clicked_city,viewed_city,viewed_beach,saw_brand,saw_organic,saw_direct,saw_offer_summary,saw_panda,adults,children,nights
0,4c56d801d41290b7b204b55e1565689f,2,1,1,0,1,6,Search,False,False,False,False,False,True,True,True,6.0,0.333333,0.666667
1,4c56d801d41290b7b204b55e1565689f,2,9,6,1,1,6,Search,False,False,False,False,False,True,True,True,6.0,0.333333,0.666667
2,4c56d801d41290b7b204b55e1565689f,2,9,4,0,1,6,Search,False,False,False,False,False,True,True,True,6.0,0.333333,0.666667
3,4c56d801d41290b7b204b55e1565689f,2,9,2,0,1,6,Search,False,False,False,False,False,True,True,True,6.0,0.333333,0.666667
4,4c56d801d41290b7b204b55e1565689f,2,9,7,1,1,6,Search,False,False,False,False,False,True,True,True,6.0,0.333333,0.666667


In [2]:
# Group the data by path ID and aggregate the features
grouped_data: DataFrame = merged_data.groupby("path_id").agg(
    {
        "device_type": lambda x: x.mode().iloc[0],
        "attribution_channel": lambda x: x.mode().iloc[0],
        "distance_to_last": "max",
        "time_delta_in_days": "max",
        "has_booked": "max",
        "n_sessions": "max",
        "most_common_landing_page": lambda x: x.mode().iloc[0],
        "clicked_city": "max",
        "viewed_city": "max",
        "viewed_beach": "max",
        "saw_brand": "max",
        "saw_organic": "max",
        "saw_direct": "max",
        "saw_offer_summary": "max",
        "saw_panda": "max",
        "adults": "max",
        "children": "max",
        "nights": "max",
    }
)
grouped_data

Unnamed: 0_level_0,device_type,attribution_channel,distance_to_last,time_delta_in_days,has_booked,n_sessions,most_common_landing_page,clicked_city,viewed_city,viewed_beach,saw_brand,saw_organic,saw_direct,saw_offer_summary,saw_panda,adults,children,nights
path_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
00003d039958362817073f4c9448ff34,3,7,2,1,0,1,Search,False,False,False,False,False,False,False,True,6.0,0.0,1.0
00004fcd09bcf6bccb16269538032578,3,4,2,2,0,1,SEO,,,,False,False,False,False,False,0.0,0.0,0.0
0000fc42234be9ec2abcbb4c71b4bae4,3,17,3,1,0,2,Homepage,False,False,False,True,False,False,True,True,1.0,0.0,1.0
000148e9e616d1a4f03c92f566f37b39,3,7,2,19,0,1,Hotel Details,,False,False,False,False,False,False,True,2.0,0.0,1.0
00014d81cd6599192b2446c4c65a29c0,5,21,2,0,0,1,Hotel Details,,False,False,False,False,False,False,False,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffeb83084cd63bf98f06a9831713ade,5,21,2,0,0,1,SEO,,,,False,True,False,False,False,0.0,0.0,0.0
fffecfee29bb3142a438092ee6ffd638,2,2,4,4,1,3,Homepage,False,False,False,True,False,False,True,True,2.0,0.0,1.0
fffedd1c982de9688cc6bd78a8a8a399,3,9,2,0,0,1,SEO,False,,,False,False,False,False,False,0.0,0.0,0.0
ffff36f1ceada40b90f268dd590460ad,3,17,2,0,0,1,Homepage,False,False,False,True,False,False,False,True,2.0,1.0,1.0


### Splitting some data for prediction purpose.

Splitting data for prediction purpose in last step as use this data as new data points.


In [3]:
from sklearn.model_selection import train_test_split
from pandas import DataFrame, concat

X: DataFrame = grouped_data.drop("has_booked", axis=1)
y = grouped_data["has_booked"].astype(int)
X_train, X_for_pred, y_train, y_for_pred = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Combine X and y back into a single DataFrame
combined_df: DataFrame = concat([X_train, y_train], axis=1)

### Preprocess Data

Preprocess data by cleaning, transforming categorical features to numerical values and imputing NA values.

#### Note:

    - The function modifies the input DataFrame in place and return processed DataFrame.
    - If save_path is provided, the preprocessed data will be saved as a Feather file.


In [4]:
from src.main.preprocess_data import preprocess_data

processed_data: DataFrame = preprocess_data(combined_df)

---------- Preprocessing data -----------
Dropping unnecessary features: ['clicked_city', 'viewed_city', 'viewed_beach']


### Feature Selection

#### Note:

    - Adjust the list of features to drop, based on the specific features needed for model training.


In [5]:
from src.main.feature_selection import feature_selection, train_val_test_split

featured_data: DataFrame = feature_selection(processed_data)

---------- Feature selection -----------
Dropping unnecessary features: ['time_delta_in_days', 'distance_to_last']


In [6]:
featured_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182480 entries, 0 to 182479
Data columns (total 14 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   device_type               182480 non-null  float64
 1   attribution_channel       182480 non-null  float64
 2   n_sessions                182480 non-null  float64
 3   most_common_landing_page  182480 non-null  float64
 4   saw_brand                 182480 non-null  float64
 5   saw_organic               182480 non-null  float64
 6   saw_direct                182480 non-null  float64
 7   saw_offer_summary         182480 non-null  float64
 8   saw_panda                 182480 non-null  float64
 9   adults                    182480 non-null  float64
 10  children                  182480 non-null  float64
 11  nights                    182480 non-null  float64
 12  has_booked                182480 non-null  float64
 13  time_to_book_week         182480 non-null  f

In [7]:
featured_data

Unnamed: 0,device_type,attribution_channel,n_sessions,most_common_landing_page,saw_brand,saw_organic,saw_direct,saw_offer_summary,saw_panda,adults,children,nights,has_booked,time_to_book_week
0,3.0,9.0,1.0,10.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.000000
1,3.0,20.0,3.0,17.0,0.0,0.0,0.0,0.0,1.0,0.088889,0.00000,0.666667,0.0,0.266667
2,3.0,18.0,36.0,17.0,1.0,1.0,1.0,1.0,1.0,0.048148,0.00000,0.555556,1.0,0.855556
3,3.0,9.0,2.0,12.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.500000
4,3.0,21.0,6.0,11.0,1.0,0.0,0.0,1.0,1.0,0.088889,0.06250,1.000000,0.0,0.011111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182475,3.0,31.0,1.0,19.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.000000
182476,2.0,9.0,4.0,12.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.088889
182477,3.0,9.0,8.0,10.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.00000,0.000000,0.0,0.833333
182478,3.0,20.0,4.0,17.0,0.0,1.0,0.0,0.0,1.0,0.033333,0.03125,0.250000,0.0,0.122222


#### Splitting data for training, validation, and testing.


In [8]:
X_train, X_test, X_val, y_train, y_test, y_val = train_val_test_split(featured_data)

## Train model

Train the model and save the model.

#### Note:

    - The function uses the following models: Logistic Regression, Random Forest, XGBoost, LSTM.
    - As this is classification task so those basic classification models are chosen for training.


In [9]:
from src.main.train_model_optimized import train_models

train_models(X_train, y_train)

Training Logistic Regression with GridSearchCV...




Training Random Forest Classifier with GridSearchCV...
Training XGBoost with GridSearchCV...
Training LSTM with GridSearchCV...


2023-11-29 14:50:26.090389: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2023-11-29 14:50:26.090415: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2023-11-29 14:50:26.090420: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2023-11-29 14:50:26.090454: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-29 14:50:26.090470: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-11-29 14:50:27.008254: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Load model

Loads the model as per its extension.

#### Note:

    - Supports loading models saved with joblib (.joblib), pickle (.pkl), and Keras (.keras) formats.


In [10]:
from src.main.load_model import load_model_from_file

models = [
    ("RandomForest", "models/model_rf.joblib"),
    ("LogisticRegression", "models/model_lr.joblib"),
    ("XGBoost", "models/model_xgboost.pkl"),
    ("LSTM", "models/model_lstm.keras"),
]

# Load models into a list
loaded_models = [load_model_from_file(file_path) for _, file_path in models]

In [11]:
print(loaded_models)

[RandomForestClassifier(class_weight={0: 0.5586237575456084,
                                     1: 4.764482702349869},
                       n_estimators=200), LogisticRegression(C=1, max_iter=1000, penalty='l1', solver='liblinear'), XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=50, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...), <keras.src

## Evaluate Model

#### Note:

    - Uses binary classification for the evaluation metrics.
    - Uses specificity scores for model comparison.
    - Returns the best model based on the highest specificity score.

SpecificityScore <- Because it will return correctly the true number of people who will not book and we can target them with the marketing strategy.


In [12]:
from src.main.evaluate_model import model_selection, get_scores

metric = "f1"
model = model_selection(loaded_models, X_val, y_val, metric)

print(
    f"\n The best model suits the data by highest {metric} metrics is: {model.__class__.__name__}"
)


 -------RandomForestClassifier-------
Unique Predictions: [0 1]
Specificity: 0.9575648017648141
Accuracy: 0.8825076720736519
F1 Score: 0.30816392384640207

 -------LogisticRegression-------
Unique Predictions: [0 1]
Specificity: 0.9891537471658802
Accuracy: 0.9024002630425252
F1 Score: 0.26737967914438504

 -------XGBClassifier-------
Unique Predictions: [0 1]
Specificity: 0.9832097554997242
Accuracy: 0.9043456817185445
F1 Score: 0.34392031573012594

 -------Sequential-------
Reshape for LSTM
Unique Predictions: [0 1]
Specificity: 0.7376371101170415
Accuracy: 0.7464105655414291
F1 Score: 0.40623596586899335
{RandomForestClassifier(class_weight={0: 0.5586237575456084,
                                     1: 4.764482702349869},
                       n_estimators=200): 0.30816392384640207, LogisticRegression(C=1, max_iter=1000, penalty='l1', solver='liblinear'): 0.26737967914438504, XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsam

## Confusion Matrix for Model


In [13]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

# Plot Confusion Matrix
plt.figure(figsize=(4, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=["Predicted 0", "Predicted 1"],
    yticklabels=["Actual 0", "Actual 1"],
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

ValueError: in user code:

    File "/Users/bhaskar.saikia/Library/Caches/pypoetry/virtualenvs/senior_data_scientist_task-lHbaO3K8-py3.11/lib/python3.11/site-packages/keras/src/engine/training.py", line 2440, in predict_function  *
        return step_function(self, iterator)
    File "/Users/bhaskar.saikia/Library/Caches/pypoetry/virtualenvs/senior_data_scientist_task-lHbaO3K8-py3.11/lib/python3.11/site-packages/keras/src/engine/training.py", line 2425, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/bhaskar.saikia/Library/Caches/pypoetry/virtualenvs/senior_data_scientist_task-lHbaO3K8-py3.11/lib/python3.11/site-packages/keras/src/engine/training.py", line 2413, in run_step  **
        outputs = model.predict_step(data)
    File "/Users/bhaskar.saikia/Library/Caches/pypoetry/virtualenvs/senior_data_scientist_task-lHbaO3K8-py3.11/lib/python3.11/site-packages/keras/src/engine/training.py", line 2381, in predict_step
        return self(x, training=False)
    File "/Users/bhaskar.saikia/Library/Caches/pypoetry/virtualenvs/senior_data_scientist_task-lHbaO3K8-py3.11/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/bhaskar.saikia/Library/Caches/pypoetry/virtualenvs/senior_data_scientist_task-lHbaO3K8-py3.11/lib/python3.11/site-packages/keras/src/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_13" is incompatible with the layer: expected shape=(None, 1, 13), found shape=(None, 13)


In [None]:
conf_matrix = confusion_matrix(y_test, y_pred, normalize="all")

# Plot Confusion Matrix
plt.figure(figsize=(4, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap="Blues",
    fmt=".2%",
    cbar=False,
    xticklabels=["Predicted 0", "Predicted 1"],
    yticklabels=["Actual 0", "Actual 1"],
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (Percentage)")
plt.show()

## Feature Importance Analysis


In [None]:
import matplotlib.pyplot as plt
import numpy as np

feature_names = X_train.columns.to_list()

# Get feature importances
feature_importances = model.feature_importances_


feature_names_sorted = np.array(feature_names)[np.argsort(feature_importances)[::-1]]
feature_importances_sorted = np.sort(feature_importances)[::-1]

# Create a dictionary mapping features to colors
feature_color_mapping = {
    "n_sessions": "green",
    "attribution_channel": "green",
    "device_type": "green",
    "time_to_book_week": "green",
}

# Plot the sorted feature importances with specific colors
plt.bar(
    range(len(feature_importances_sorted)),
    feature_importances_sorted,
    tick_label=feature_names_sorted,
    color=[
        feature_color_mapping.get(feature, "#87CEEB")
        for feature in feature_names_sorted
    ],
)

plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Feature Importances")
plt.xticks(rotation=90)  # Set x-axis labels to vertical
plt.show()

#### Set Threshold for new data

Based on the test evaluation and desired performance criteria, a threshold is set to determine the model's decision boundary for classifying future data points.
And we want to improve the metrics by retraining if it falls below the threshold.


In [None]:
test_specificity, test_acc, test_f1 = get_scores(model, X_test, y_test)
threshold = test_specificity
print(f"Threshold for new {model.__class__.__name__} model: {threshold}")

## Predict on new data

Predicting on assumed new data which we have already splitted from the original dataset.

#### Note:

    - Preprocess the input data using the `preprocess_data` function.
    - Performs feature selection using the `feature_selection` function.
    - Reshapes input data for LSTM models before making predictions.
    - For non-LSTM models, uses the `predict` method directly.


In [None]:
from src.main.predict import predict
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

predict_new_y = predict(model, X_for_pred)

conf_matrix = confusion_matrix(y_for_pred, predict_new_y)
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)
accuracy = accuracy_score(y_for_pred, predict_new_y)
f1 = f1_score(y_for_pred, predict_new_y)

print(f"Specificity: {specificity}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

## Pipeline insight and further improvement

`Train`: Different model algorithms are trained on the provided dataset.

`Evaluate`: Each model's performance is assessed using the validation set and predetermined metrics like accuracy, precision, recall, and F1-score.

`Select`: The model with the best scores on those metrics is chosen for final evaluation.

`Test`: The chosen model is evaluated on the previously unseen test set to get final scores on the chosen metrics.

`Threshold`: Based on the test evaluation and desired performance criteria, a threshold is set to determine the model's decision boundary for classifying future data points.

`Retrain`: If further optimization is required, retraining is performed with adjusted parameters or model selection to improve performance and achieve desired scores on the chosen metrics.
This workflow ensures a robust model selection process with accurate performance evaluation and a reliable threshold for future predictions.

### Improvement

`Train`: Train different model algorithms and hyperparameter tuning to find the best configuration for model using cross validation method and GridSearch.[Unfortunately my PC doesn't allow me to train with that configured model.]
