# Workspace preparation

## Installing and Importing Add-ins and libraries


In [None]:
# Add the third-party packages imported below to the Poetry project.
# Each `!` line is handed to the system shell by IPython.
# NOTE(review): no versions are pinned here, so the resolved versions depend on when this cell is run.
!poetry add scikit-learn
!poetry add seaborn
!poetry add numpy

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from typing import Tuple



from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve,roc_auc_score, roc_curve, auc
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model, datasets
from sklearn.model_selection import train_test_split

## Importing our files from AWS

In [2]:
# List every object stored under the box_builder_dataset prefix of the zrive-ds-data bucket.
!aws s3 ls s3://zrive-ds-data/groceries/box_builder_dataset/ --recursive


2023-09-23 14:06:02  761678715 groceries/box_builder_dataset/feature_frame.csv


Then copy the file to a local folder.

In [3]:
# Download feature_frame.csv from S3 into the local module_3 folder.
# NOTE(review): the destination is an absolute, machine-specific path; the cell that reads the CSV must point at the same location.
!aws s3 cp s3://zrive-ds-data/groceries/box_builder_dataset/feature_frame.csv "/mnt/c/Users/Daniel Sánchez/Desktop/ZRIVE DS/src/module_3/"


download: s3://zrive-ds-data/groceries/box_builder_dataset/feature_frame.csv to ../../../../../../mnt/c/Users/Daniel Sánchez/Desktop/ZRIVE DS/src/module_3/feature_frame.csv


In [4]:
# Local copy of the feature frame downloaded from S3.
dfpath = "/mnt/c/Users/Daniel Sánchez/Desktop/ZRIVE DS/src/module_3/feature_frame.csv"

# Read the whole CSV into a single in-memory DataFrame.
df = pd.read_csv(filepath_or_buffer=dfpath)

## Understanding the DF

In [None]:
# Concise summary of the raw frame (column names, dtypes, memory usage).
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Identifier / timestamp columns: kept for bookkeeping and excluded from the feature set.
info_cols = ["variant_id", "order_id", "user_id", "created_at", "order_date"]

# Target column, stored as a one-element list so it can be concatenated with other column lists.
label_col = ["outcome"]

# Every remaining column is a candidate feature.
# `label_col` is already a list, so it is concatenated directly: wrapping it in
# another list (`[label_col]`) would add the list itself as a single element,
# no column name would ever match it, and "outcome" would leak into the features.
excluded_cols = set(info_cols + label_col)
feature_cols = [col for col in df.columns if col not in excluded_cols]

In [None]:
# Features treated as categorical.
categorical_cols = ["product_type", "vendor"]

# Features treated as binary flags.
binary_cols = [
    "ordered_before",
    "abandoned_before",
    "active_snoozed",
    "set_as_regular",
]

# Whatever is left in `feature_cols` is treated as numerical; the original column order is preserved.
non_numerical_cols = set(categorical_cols) | set(binary_cols)
numerical_cols = [name for name in feature_cols if name not in non_numerical_cols]

In [None]:
# Sum of `outcome` per order.
order_sizes = df.groupby("order_id")["outcome"].sum()

# Ids of the orders whose summed outcome is strictly greater than 5.
filtered_orders = order_sizes.index[order_sizes > 5]

# Keep only the rows that belong to one of those orders.
belongs_to_kept_order = df["order_id"].isin(filtered_orders)
df_filtered = df[belongs_to_kept_order]



In [None]:
# One row per order_date with the number of distinct orders placed that day;
# order_date is then converted to plain `datetime.date` objects.
daily_orders = (
    df_filtered
    .groupby("order_date")["order_id"]
    .nunique()
    .rename("unique_order_count")
    .reset_index()
    .assign(order_date=lambda frame: pd.to_datetime(frame["order_date"]).dt.date)
)


In [None]:
# Running share of all orders reached by each date (ends at 1.0 on the last row).
orders_per_day = daily_orders["unique_order_count"]
daily_orders["cumsum_ratio"] = orders_per_day.cumsum() / orders_per_day.sum()


In [None]:
# Plot the cumulative share of orders against the calendar date (instead of the
# positional row index) and label the figure so it can be read on its own.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(daily_orders['order_date'], daily_orders['cumsum_ratio'])
ax.set_xlabel("Order date")
ax.set_ylabel("Cumulative share of orders")
ax.set_title("Cumulative share of orders over time")
fig.autofmt_xdate()  # tilt the date tick labels so they do not overlap
plt.show()

In [None]:
# Rows that fall inside the first 70% / 90% of orders (by cumulative share).
within_train_share = daily_orders["cumsum_ratio"] <= 0.7
within_train_val_share = daily_orders["cumsum_ratio"] <= 0.9

# `.max()` on the filtered frame returns a Series holding the column-wise maxima;
# its 'order_date' entry is the last date inside each share.
train_val_cutoff = daily_orders.loc[within_train_share].max()
val_test_cutoff = daily_orders.loc[within_train_val_share].max()



In [None]:
# Report the date range covered by each split.
first_order_date = daily_orders["order_date"].min()
last_order_date = daily_orders["order_date"].max()

print(f"Training from {first_order_date}")
print(f"Train cutoff date: {train_val_cutoff['order_date']}")
print(f"Validation cutoff date: {val_test_cutoff['order_date']}")
print(f"Test until: {last_order_date}")


In [None]:
# Convert order_date to plain `datetime.date` objects so it can be compared with the cutoff dates.
#
# `assign` builds a new frame instead of writing into `df_filtered` in place:
# `df_filtered` comes from boolean indexing on `df`, and an in-place `.loc` write
# on such a slice triggers pandas' SettingWithCopyWarning. Rebinding the name also
# keeps the cell safe to re-run.
#
# The cutoffs are left untouched here; they are normalised to dates in the next cell.
df_filtered = df_filtered.assign(
    order_date=pd.to_datetime(df_filtered['order_date']).dt.date
)



In [None]:
def _cutoff_as_date(cutoff):
    """Return a split cutoff as a plain `datetime.date`.

    Parameters
    ----------
    cutoff : pandas.Series or date-like
        Either the Series of column-wise maxima produced when the cutoffs were
        computed (the date is read from its 'order_date' entry), or a value that
        `pd.to_datetime` can parse, such as an already-converted date.

    Returns
    -------
    datetime.date
    """
    if isinstance(cutoff, pd.Series):
        # Look the date up by label rather than by position so the result does
        # not depend on the column order of `daily_orders`.
        cutoff = cutoff['order_date']
    return pd.to_datetime(cutoff).date()


# Accepting an already-converted date makes this cell safe to run more than once.
train_val_cutoff = _cutoff_as_date(train_val_cutoff)
val_test_cutoff = _cutoff_as_date(val_test_cutoff)


In [None]:
# Temporal split: rows up to the first cutoff train the model, rows between the
# two cutoffs validate it, and everything after the second cutoff is the test set.
order_dates = df_filtered["order_date"]

in_train = order_dates <= train_val_cutoff
in_val = (order_dates > train_val_cutoff) & (order_dates <= val_test_cutoff)
in_test = order_dates > val_test_cutoff

train_df = df_filtered[in_train]
val_df = df_filtered[in_val]
test_df = df_filtered[in_test]


# Baseline



As a first Baseline I am going to try to predict if something will be purchased (Outcome) based on the global popularity feature. 

In order to compare, we are going to start by preparing a function to plot Precision Recall and ROC curves, first for our baseline and then adding figures with our different models. 

The following cell plots the curves (copied from Guille's code).

In [None]:

# Training split, kept under the names `X` / `y` in case later cells rely on them.
X = train_df[['global_popularity']]
y = train_df['outcome']

# The baseline needs no fitting: the raw global_popularity value is used directly
# as the score. It is evaluated on the validation split so that its curves are
# computed on the same rows as the models it is compared against.
y_val_baseline = val_df['outcome']
y_pred_baseline = val_df['global_popularity']

# ROC curve and area under it for the baseline
fpr, tpr, _ = roc_curve(y_val_baseline, y_pred_baseline)
roc_auc = auc(fpr, tpr)

# Precision-Recall curve and area under it for the baseline
precision, recall, _ = precision_recall_curve(y_val_baseline, y_pred_baseline)
pr_auc = auc(recall, precision)

# Side-by-side ROC and Precision-Recall plots
fig, ax = plt.subplots(1, 2, figsize=(14, 7))

# ROC Curve
ax[0].plot(fpr, tpr, color="blue", label=f"AUC = {roc_auc:.2f}")
ax[0].plot([0, 1], [0, 1], "k--", label="Random")
ax[0].set_xlabel("False Positive Rate")
ax[0].set_ylabel("True Positive Rate")
ax[0].set_title("Baseline Validation ROC Curve (Threshold on Global Popularity)")
ax[0].legend()

# Precision-Recall Curve
ax[1].plot(recall, precision, color="green", label=f"AUC = {pr_auc:.2f}")
ax[1].set_xlabel("Recall")
ax[1].set_ylabel("Precision")
ax[1].set_title("Baseline Validation Precision-Recall Curve (Threshold on Global Popularity)")
ax[1].legend()

# Show the plots
plt.show()



# Model Training
## Ridge


We are going to start training our model. It's a good idea to start with our binary and numerical features. First of all, I have decided to check whether I can predict the outcome based on global popularity. For that, I am using Logistic Regression, starting with Ridge (L2) regularisation.


In [None]:
# Features fed to the linear models. The write-up for this section uses global
# popularity as the only predictor, so that is what is defined here; without this
# definition the cell fails with a NameError on a fresh kernel.
# (plt, LogisticRegression, StandardScaler and the metric helpers are already
# imported in the import cell at the top of the notebook.)
train_cols = ["global_popularity"]

# Standardise the features. The scaler is fitted on the training split only and
# then applied to the validation split, so no validation statistics leak in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_df[train_cols])
X_val_scaled = scaler.transform(val_df[train_cols])

# Inverse regularisation strengths to compare (smaller C = stronger penalty)
C_values = [1e-10, 1e-6, 1e-1, 1, 1000, 1000000]

# One figure with a ROC panel and a Precision-Recall panel, one line per C
fig, ax = plt.subplots(1, 2, figsize=(14, 7))

for C in C_values:
    # Logistic regression with L2 regularization (Ridge) and balanced class weights
    ridge_model = LogisticRegression(penalty="l2", C=C, class_weight="balanced")
    ridge_model.fit(X_train_scaled, train_df['outcome'])

    # Predicted probability of the positive class on the validation set
    val_proba = ridge_model.predict_proba(X_val_scaled)[:, 1]

    # ROC curve and AUC on the validation set
    fpr_val, tpr_val, _ = roc_curve(val_df['outcome'], val_proba)
    roc_auc_val = auc(fpr_val, tpr_val)

    # Precision-Recall curve and AUC on the validation set
    precision_val, recall_val, _ = precision_recall_curve(val_df['outcome'], val_proba)
    pr_auc_val = auc(recall_val, precision_val)

    # Add this C to both panels
    ax[0].plot(fpr_val, tpr_val, label=f"C = {C}, AUC = {roc_auc_val:.2f}")
    ax[1].plot(recall_val, precision_val, label=f"C = {C}, AUC = {pr_auc_val:.2f}")

# Customize ROC Curve plot
ax[0].plot([0, 1], [0, 1], "k--", label="Random")
ax[0].set_xlabel("False Positive Rate")
ax[0].set_ylabel("True Positive Rate")
ax[0].set_title("Validation ROC Curve - Logistic Regression (Ridge)")
ax[0].legend()

# Customize Precision-Recall Curve plot
ax[1].set_xlabel("Recall")
ax[1].set_ylabel("Precision")
ax[1].set_title("Validation Precision-Recall Curve - Logistic Regression (Ridge)")
ax[1].legend()

# Show the plots
plt.show()


In [None]:
# Number of rows per outcome value in the validation split (class balance check).
val_df['outcome'].value_counts()


For some reason, this is the same as our baseline, which suggests that the Ridge regression is not doing much here. We can see that adjusting the level of regularisation in our model does not change the AUC of the ROC curve, and barely changes the precision-recall curve.

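Something looks off. I would expect some kind of change when changing the level of regularisation. A likely explanation: with a single feature, the predicted probability is a monotonic function of global popularity, so as long as the fitted coefficient is positive the samples are ranked exactly as they are by global popularity itself, and ranking-based curves such as ROC and precision-recall cannot change.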


## Lasso

In [None]:
# Build the model inputs inside this cell so it does not depend on names that are
# never defined in the notebook. As in the Ridge section, global popularity is the
# only predictor, the scaler is fitted on the training split, and the curves are
# computed on the validation split.
lasso_cols = ["global_popularity"]
lasso_scaler = StandardScaler()
X_train_lasso = lasso_scaler.fit_transform(train_df[lasso_cols])
X_val_lasso = lasso_scaler.transform(val_df[lasso_cols])
y_train_lasso = train_df['outcome']
y_val_lasso = val_df['outcome']

# Inverse regularisation strengths to compare (smaller C = stronger penalty)
C_values = [1e-10, 1e-6, 1e-1, 1, 1000, 1000000]

# One figure with a ROC panel and a Precision-Recall panel, one line per C
fig, ax = plt.subplots(1, 2, figsize=(14, 7))

for C in C_values:
    # Logistic regression with L1 regularization (Lasso). `saga` is a stochastic
    # solver, so the seed is fixed to make the fit reproducible.
    lasso_model = LogisticRegression(penalty="l1", C=C, solver="saga", random_state=42)
    lasso_model.fit(X_train_lasso, y_train_lasso)

    # Predicted probability of the positive class on the validation set
    y_pred_proba_lasso = lasso_model.predict_proba(X_val_lasso)[:, 1]

    # ROC curve and AUC
    fpr, tpr, _ = roc_curve(y_val_lasso, y_pred_proba_lasso)
    roc_auc = auc(fpr, tpr)

    # Precision-Recall curve and AUC
    precision, recall, _ = precision_recall_curve(y_val_lasso, y_pred_proba_lasso)
    pr_auc = auc(recall, precision)

    # Add this C to both panels
    ax[0].plot(fpr, tpr, label=f"C = {C}, AUC = {roc_auc:.2f}")
    ax[1].plot(recall, precision, label=f"C = {C}, AUC = {pr_auc:.2f}")

# Customize and show plots
ax[0].plot([0, 1], [0, 1], "k--", label="Random")
ax[0].set_xlabel("False Positive Rate")
ax[0].set_ylabel("True Positive Rate")
ax[0].set_title("Validation ROC Curve - Logistic Regression (Lasso)")
ax[0].legend()

ax[1].set_xlabel("Recall")
ax[1].set_ylabel("Precision")
ax[1].set_title("Validation Precision-Recall Curve - Logistic Regression (Lasso)")
ax[1].legend()

plt.show()



Here I am seeing differences when changing the level of regularisation. What I do see is that very strong regularisation (very small C) gives me an AUC of 0.5 in the ROC curve, meaning that it is no better than random guessing. For that reason, I would select bigger values of C. Once again, apart from that, I am not seeing any difference at all between my baseline and my Ridge or Lasso.


# Summary and Conclusions

Using only Global Popularity to predict Outcome, with Logistic Regression and Ridge or Lasso regularisation, is not the best way of building my model, since I am not getting any better predictions than by just using my baseline.

For that reason, I would have to review a few things:

- Am I using the correct features for my prediction?
- Am I using the correct models for my predictions?