# Instacart market basket analysis

## Data Mining and Analysis CBD-3334_1

* Andrea Franco - C0931897

# Libraries

In [2]:
import pandas as pd


# Exploratory Data Analysis

In [None]:
# Directory that holds the original Instacart CSV exports. The path is
# relative, so it is resolved against the notebook's working directory;
# change this single constant if the data lives somewhere else.
DATA_DIR = "market_basket_analysis/datasets/original"

aisles_df = pd.read_csv(f"{DATA_DIR}/aisles.csv")
order_prods_prior_df = pd.read_csv(f"{DATA_DIR}/order_products__prior.csv")
order_prods_train_df = pd.read_csv(f"{DATA_DIR}/order_products__train.csv")
orders_df = pd.read_csv(f"{DATA_DIR}/orders.csv")
products_df = pd.read_csv(f"{DATA_DIR}/products.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'market_basket_analysis/datasets/original/aisles.csv'

In [None]:
# Report the (rows, columns) shape of each table loaded above.
_loaded_tables = {
    "aisles": aisles_df,
    "order_prods_prior": order_prods_prior_df,
    "order_prods_train": order_prods_train_df,
    "orders": orders_df,
    "products": products_df,
}
for _table_name, _table in _loaded_tables.items():
    print(f"{_table_name} shape: ", _table.shape)

In [None]:
aisles_df.head()

Reduce the number of aisles

In [None]:
aisles_df['aisle'].unique()

Map into 5 big categories

In [None]:
def categorize_aisle(aisle):
    """Map an aisle name to one of five broad categories.

    The aisle is compared (exact, case-sensitive match) against each
    category's list of known aisle names, in the order the categories are
    listed below, and the label of the first category containing it is
    returned. An aisle that appears in none of the lists is labelled
    'Other'.
    """
    aisle_groups = (
        ('Fresh Foods', (
            'fresh fruits', 'fresh vegetables', 'fresh herbs',
            'poultry counter', 'seafood counter', 'meat counter',
            'fresh dips tapenades',
        )),
        ('Packaged Foods', (
            'energy granola bars', 'instant foods', 'bakery desserts',
            'pasta sauce', 'baking ingredients',
            'bulk dried fruits vegetables', 'popcorn jerky',
            'candy chocolate', 'cookies cakes', 'crackers', 'chips pretzels',
            'cereal', 'dry pasta', 'grains rice dried goods',
            'packaged vegetables fruits', 'trail mix snack mix',
            'soup broth bouillon', 'canned meals beans',
            'canned jarred vegetables', 'canned meat seafood',
            'canned fruit applesauce', 'spices seasonings', 'condiments',
            'granola', 'preserved dips spreads', 'salad dressing toppings',
        )),
        ('Beverages', (
            'coffee', 'tea', 'juice nectars', 'soft drinks',
            'water seltzer sparkling water', 'energy sports drinks',
            'protein meal replacements', 'beers coolers', 'red wines',
            'white wines', 'spirits', 'specialty wines champagnes',
            'cocoa drink mixes', 'frozen juice',
        )),
        ('Household & Personal Care', (
            'kitchen supplies', 'oral hygiene', 'soap', 'paper goods',
            'shave needs', 'diapers wipes', 'trash bags liners',
            'eye ear care', 'vitamins supplements', 'facial care',
            'dish detergents', 'laundry', 'deodorants',
            'air fresheners candles', 'baby bath body care', 'skin care',
            'plates bowls cups flatware', 'cleaning products', 'first aid',
            'feminine care', 'body lotions soap',
            'muscles joints pain relief', 'beauty',
        )),
        ('Frozen & Refrigerated Items', (
            'frozen meat seafood', 'frozen meals', 'frozen vegan vegetarian',
            'frozen breads doughs', 'frozen breakfast', 'ice cream ice',
            'ice cream toppings', 'refrigerated', 'milk', 'yogurt', 'butter',
            'cream', 'soy lactosefree', 'refrigerated pudding desserts',
            'frozen produce', 'frozen pizza', 'frozen appetizers sides',
            'frozen dessert',
        )),
    )

    for label, members in aisle_groups:
        if aisle in members:
            return label
    return 'Other'

# Label every aisle with its broad category in a new 'aisle_category' column.
aisles_df['aisle_category'] = aisles_df['aisle'].map(categorize_aisle)


In [None]:
aisles_df.head()

In [None]:
aisles_df['aisle_category'].value_counts()

In [None]:
order_prods_prior_df.head()

In [None]:
orders_df.head()

In [None]:
products_df.head()

In [None]:
# Load the department lookup table and preview its first rows.
departments_df = pd.read_csv("market_basket_analysis/datasets/original/departments.csv")
departments_df.head()

# 2. Merge datasets

### Order-Product Details:
First, we merge the order_products data with orders to attach order-specific information to each product ordered.

In [None]:
# Left join: keep every prior order line and attach the matching row of orders_df.
order_products_merged = pd.merge(order_prods_prior_df, orders_df, how='left', on='order_id')

In [None]:
order_products_merged.head()

### Add Product Information:
Merge the order_products_merged with products to attach product-specific details.

In [None]:
# Left join: attach the matching row of products_df to every order line.
order_products_merged = pd.merge(order_products_merged, products_df, how='left', on='product_id')


In [None]:
order_products_merged.head()

### Add Aisle and Department Information:
Next, merge order_products_merged with aisles and departments to include aisle and department names.

In [None]:
# Left joins: attach the aisle columns, then the department columns, to every order line.
order_products_merged = pd.merge(order_products_merged, aisles_df, how='left', on='aisle_id')
order_products_merged = pd.merge(order_products_merged, departments_df, how='left', on='department_id')


In [None]:
order_products_merged.head()

In [None]:
order_products_merged.info()

In [None]:
order_products_merged.isnull().sum()

# Data engineering

## Average Days Between Purchases:

We can calculate the average days_since_prior_order for each user to capture their shopping frequency using the user_id


In [None]:
# Average gap (in days) between a user's orders.
# order_products_merged has one row per product line, so an order's
# days_since_prior_order value is repeated once per item in the basket.
# Averaging those rows directly would weight large baskets more heavily, so
# keep a single row per order first: every order then counts exactly once.
_unique_orders = (
    order_products_merged[['order_id', 'user_id', 'days_since_prior_order']]
    .drop_duplicates(subset='order_id')
)
avg_days_between = (
    _unique_orders.groupby('user_id')['days_since_prior_order']
    .mean()
    .reset_index()
)
avg_days_between.columns = ['user_id', 'avg_days_between_purchases']


In [None]:
avg_days_between.head()

In [None]:
avg_days_between.isnull().sum()

## Total Number of Orders:
Count the number of unique order_id values for each user_id.


In [None]:
# Number of distinct orders placed by each user.
total_orders = (
    order_products_merged.groupby('user_id')['order_id']
    .nunique()
    .reset_index(name='total_orders')
)


In [None]:
total_orders.head()

## Average Number of Items per Order:
Average count of product_id per order_id for each user.


In [None]:
# Step 1: number of product lines in every (user, order) pair.
items_per_order = (
    order_products_merged
    .groupby(['user_id', 'order_id'])['product_id']
    .count()
    .reset_index()
)
# Step 2: average those basket sizes per user.
avg_items_per_order = (
    items_per_order.groupby('user_id')['product_id']
    .mean()
    .reset_index(name='avg_items_per_order')
)


In [None]:
avg_items_per_order.head()

## Most Frequent Day of the Week for Orders:
Most common order_dow (day of the week) for each user.



In [None]:
# Most common order day-of-week per user.
# order_products_merged repeats order_dow once per item in the basket, so a
# mode taken over those rows would favour the days with the largest baskets
# rather than the days with the most orders. Reduce to one row per order
# first so each order casts a single vote.
_unique_orders = (
    order_products_merged[['order_id', 'user_id', 'order_dow']]
    .drop_duplicates(subset='order_id')
)
# mode() returns its values in ascending order, so ties resolve to the
# smallest day number.
most_freq_dow = (
    _unique_orders.groupby('user_id')['order_dow']
    .agg(lambda days: days.mode().iloc[0])
    .reset_index()
)
most_freq_dow.columns = ['user_id', 'most_freq_day']


In [None]:
most_freq_dow.head()

## Most Frequent Hour of the Day for Orders:
Mode of order_hour_of_day for each user.

In [None]:
# Most common order hour per user.
# order_products_merged repeats order_hour_of_day once per item in the
# basket, so a mode taken over those rows would favour the hours with the
# largest baskets rather than the hours with the most orders. Reduce to one
# row per order first so each order casts a single vote.
_unique_orders = (
    order_products_merged[['order_id', 'user_id', 'order_hour_of_day']]
    .drop_duplicates(subset='order_id')
)
# mode() returns its values in ascending order, so ties resolve to the
# earliest hour.
most_freq_hour = (
    _unique_orders.groupby('user_id')['order_hour_of_day']
    .agg(lambda hours: hours.mode().iloc[0])
    .reset_index()
)
most_freq_hour.columns = ['user_id', 'most_freq_hour']

In [None]:
most_freq_hour.head()

## Average Time Between Orders for Specific Products:
For each user-product pair, the average days_since_prior_order.


In [None]:
# Mean days_since_prior_order over the order lines of every (user, product) pair.
avg_days_product = (
    order_products_merged
    .groupby(['user_id', 'product_id'])['days_since_prior_order']
    .mean()
    .reset_index(name='avg_days_product_purchase')
)


In [None]:
avg_days_product.head()

## Percentage of Orders with a Specific Product:

How often each product appears in orders as a proportion of the user’s total orders.



In [None]:
# Share of a user's orders that contain a given product.
# Numerator: distinct orders in which the user bought the product.
product_orders = order_products_merged.groupby(['user_id', 'product_id'])['order_id'].nunique().reset_index()
# Denominator: distinct orders placed by the user. This uses its own name
# on purpose: reusing `total_orders` here would overwrite the user-level
# feature frame of that name (columns user_id / total_orders) with one whose
# count column is called 'order_id', corrupting the later user_features merge.
user_order_totals = order_products_merged.groupby('user_id')['order_id'].nunique().reset_index()
# Both frames call their count column 'order_id'; the suffixes tell them apart.
merged = product_orders.merge(user_order_totals, on='user_id', suffixes=('_product', '_total'))
merged['product_order_ratio'] = merged['order_id_product'] / merged['order_id_total']


In [None]:
merged.head()

In [None]:
product_order_ratio = merged.copy()

## Recency of Last Purchase for Each Product:

Identify the last order number in which each product was purchased and subtract it from the user’s most recent order number.


In [None]:
# Highest order_number in which each user bought each product.
last_purchase = (
    order_products_merged
    .groupby(['user_id', 'product_id'])['order_number']
    .max()
    .reset_index()
)
# Highest order_number of each user overall.
recent_order = (
    order_products_merged.groupby('user_id')['order_number']
    .max()
    .reset_index()
)
# Both frames call their column 'order_number'; the suffixes tell them apart.
merged = pd.merge(last_purchase, recent_order, on='user_id', suffixes=('_product', '_recent'))
# Number of orders elapsed since the product was last bought (0 = in the latest order).
merged['recency_last_purchase'] = merged['order_number_recent'].sub(merged['order_number_product'])


In [None]:
recency_last_purchase = merged.copy()

In [None]:
recency_last_purchase.head()

## Reorder Ratio for Each Product:

Calculate the reorder rate as the ratio of times a product was reordered to the total times it was ordered.

Source Features: user_id, product_id, reordered


In [None]:
# Sum of the 'reordered' column per (user, product) pair.
product_reorders = (
    order_products_merged
    .groupby(['user_id', 'product_id'])['reordered']
    .sum()
    .reset_index()
)
# Number of order lines per (user, product) pair.
product_orders = (
    order_products_merged
    .groupby(['user_id', 'product_id'])['order_id']
    .count()
    .reset_index()
)
merged = pd.merge(product_reorders, product_orders, on=['user_id', 'product_id'])
# Reordered purchases divided by all purchases of the product by that user.
merged['reorder_ratio'] = merged['reordered'].div(merged['order_id'])

In [None]:
reorder_ratio = merged.copy()

In [None]:
reorder_ratio.head()

## Product Popularity in Each Aisle:
Calculate the total orders containing products in each aisle.

Source Features: product_id, aisle_id, order_id


In [None]:
# Number of distinct orders that contain at least one product from each aisle.
aisle_popularity = (
    order_products_merged.groupby('aisle_id')['order_id']
    .nunique()
    .reset_index(name='aisle_popularity')
)

In [None]:
aisle_popularity.head()

In [None]:
import pandas as pd

# Build a per-user timeline: days elapsed between the user's first order and
# each later order. days_since_prior_order is the gap to the user's previous
# order of ANY products, so it cannot be read directly as the gap between two
# purchases of the SAME product (other orders may lie in between). Summing the
# gaps gives each order a position in time that can be differenced instead.
order_timeline = (
    orders_df[['order_id', 'user_id', 'order_number', 'days_since_prior_order']]
    .sort_values(by=['user_id', 'order_number'])
)
order_timeline = order_timeline.assign(
    days_since_first_order=(
        order_timeline['days_since_prior_order']
        # A missing gap contributes 0 days (presumably this is each user's
        # first order -- confirm against the data).
        .fillna(0)
        .groupby(order_timeline['user_id'])
        .cumsum()
    )
)

# Attach the order-level columns (including the timeline position) to every order line
merged_orders = order_prods_prior_df.merge(order_timeline, on='order_id', how='left')

# Sort by user, product, and order number to get chronological order for each product per user
merged_orders = merged_orders.sort_values(by=['user_id', 'product_id', 'order_number'])

# Days until the same user buys the same product again: timeline position of
# the next purchase minus the position of this one.
_next_purchase_day = merged_orders.groupby(['user_id', 'product_id'])['days_since_first_order'].shift(-1)
merged_orders['days_until_next_order'] = _next_purchase_day - merged_orders['days_since_first_order']

# Reset index for cleaner merging if needed
merged_orders.reset_index(drop=True, inplace=True)

# At this point, 'days_until_next_order' represents the next purchase time
# If a row has NaN for 'days_until_next_order', it indicates the last purchase for that user-product

# Keep only the relevant columns for the modeling DataFrame
next_purchase_time = merged_orders[['user_id', 'product_id', 'order_id', 'days_until_next_order']].copy()

# Rename 'days_until_next_order' to 'next_purchase_time' for clarity
next_purchase_time.rename(columns={'days_until_next_order': 'next_purchase_time'}, inplace=True)


In [None]:
next_purchase_time.head()

In [None]:
next_purchase_time.info()

## Combining Features for Modeling


### Merge User-Level Features:

In [None]:
# Combine the per-user feature frames into one table keyed by user_id.
# Left joins keep every user present in avg_days_between.
user_features = avg_days_between
for _feature_frame in (total_orders, avg_items_per_order, most_freq_dow, most_freq_hour):
    user_features = user_features.merge(_feature_frame, on='user_id', how='left')


### Merge Product-Level Features:

In [None]:
# Join the two frames keyed by (user_id, product_id): average purchase gap
# and product order ratio. The left join keeps every pair in avg_days_product.
product_features = pd.merge(avg_days_product, product_order_ratio, how='left', on=['user_id', 'product_id'])

In [None]:
product_features.head()

### Merge User-Product Interaction Features:

In [None]:
# Join recency and reorder ratio, both keyed by (user_id, product_id).
# The left join keeps every pair in recency_last_purchase.
user_product_features = pd.merge(recency_last_purchase, reorder_ratio, how='left', on=['user_id', 'product_id'])


In [None]:
user_product_features.head()

In [None]:
# Persist the three feature tables as CSV files in the current working
# directory; index=False leaves the DataFrame index out of the files.
user_product_features.to_csv('user_product_features.csv', index=False)
user_features.to_csv('user_features.csv', index=False)
product_features.to_csv('product_features.csv', index=False)


# Predicting Product Demand Frequency

In [None]:
# Target variable: mean days_since_prior_order over all order lines of each product.
avg_demand_interval = (
    order_products_merged.groupby('product_id')['days_since_prior_order']
    .mean()
    .reset_index(name='average_demand_interval')
)
avg_demand_interval.head()


In [None]:
# Number of order lines (rows of order_products_merged) recorded for each product.
product_order_counts = order_products_merged.groupby('product_id').size().reset_index(name='total_orders')


In [None]:
# Mean add_to_cart_order value of each product across all its order lines.
avg_cart_position = (
    order_products_merged.groupby('product_id')['add_to_cart_order']
    .mean()
    .reset_index(name='average_cart_position')
)


In [None]:
# Per product: number of order lines and sum of the 'reordered' column.
product_reorder_counts = (
    order_products_merged.groupby('product_id')
    .agg({'order_id': 'count', 'reordered': 'sum'})
    .rename(columns={'order_id': 'total_orders', 'reordered': 'reorders'})
    .reset_index()
)
# Share of a product's order lines that were reorders.
product_reorder_counts['reorder_probability'] = (
    product_reorder_counts['reorders'].div(product_reorder_counts['total_orders'])
)


In [None]:
# Number of distinct users who bought each product.
unique_user_counts = (
    order_products_merged.groupby('product_id')['user_id']
    .nunique()
    .reset_index(name='unique_users')
)


In [None]:
# Largest days_since_prior_order value seen on any order line of each product.
# NOTE(review): the column is called 'recency_last_purchase' but holds a
# per-product maximum gap rather than a recency -- confirm the intended meaning.
recent_purchase = (
    order_products_merged.groupby('product_id')['days_since_prior_order']
    .max()
    .reset_index(name='recency_last_purchase')
)


In [None]:
# Step 1: mean days_since_prior_order of every (product, user) pair.
user_product_intervals = (
    order_products_merged
    .groupby(['product_id', 'user_id'])['days_since_prior_order']
    .mean()
    .reset_index()
)
# Step 2: average those per-user means for each product, so every user
# carries equal weight regardless of how often they bought it.
avg_user_product_interval = (
    user_product_intervals.groupby('product_id')['days_since_prior_order']
    .mean()
    .reset_index(name='avg_user_order_interval')
)


In [None]:
# Assemble the product-level modeling table: begin with the target frame and
# left-join each feature frame on product_id, in this order.
modeling_df = avg_demand_interval
for _feature_frame in (
    product_order_counts,
    avg_cart_position,
    product_reorder_counts[['product_id', 'reorder_probability']],
    unique_user_counts,
    recent_purchase,
    avg_user_product_interval,
):
    modeling_df = modeling_df.merge(_feature_frame, on='product_id', how='left')

# Example of encoding categorical features if needed
# modeling_df = pd.get_dummies(modeling_df, columns=['aisle', 'department'])


In [None]:
modeling_df.info()

In [None]:
products_df.info()

In [None]:
modeling_df.isnull().sum()

In [None]:
# Step 6: Handle missing values
# Drop rows without a target FIRST. If fillna(0) ran before this, the target
# could never be NaN afterwards, the dropna would be a no-op, and those
# products would be trained on with a made-up target of 0.
modeling_df = modeling_df.dropna(subset=['average_demand_interval'])
# Any gaps left are in feature columns only; fill them with 0.
modeling_df = modeling_df.fillna(0)

# Verify final DataFrame
modeling_df.head()

In [None]:
modeling_df.isnull().sum()

In [None]:
modeling_df.describe()

In [2]:
modeling_df.to_csv('/content/sample_data/modeling_df.csv', index=False)

NameError: name 'modeling_df' is not defined

# Split the data

In [3]:
modeling_df = pd.read_csv('/content/sample_data/modeling_df.csv')

In [4]:
from sklearn.model_selection import train_test_split
# Predictor columns taken from modeling_df.
features = ['total_orders', 'average_cart_position', 'reorder_probability',
                    'unique_users', 'recency_last_purchase', 'avg_user_order_interval']
# Column to predict.
target = 'average_demand_interval'

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): 'avg_user_order_interval' and 'recency_last_purchase' are
# aggregated from the same days_since_prior_order column as the target, which
# presumably inflates the scores (target leakage) -- TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(modeling_df[features], modeling_df[target], test_size=0.2, random_state=42)


In [5]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(39741, 6) (9936, 6) (39741,) (9936,)


# Scale the data

In [7]:
from sklearn.preprocessing import StandardScaler

# X_train / X_test were built from the model's feature columns only, so the
# whole frame is standardised; no separate column list is needed.
scaler = StandardScaler()  # Or choose another scaler as needed

# Learn the scaling parameters from the training split only ...
X_train_scaled = scaler.fit_transform(X_train)

# ... and reuse them on the test split, so no test-set statistics leak into
# the preprocessing.
X_test_scaled = scaler.transform(X_test)



In [8]:
from sklearn.linear_model import LinearRegression

# Baseline 1: ordinary least squares on the standardised features.
linear_model = LinearRegression().fit(X_train_scaled, y_train)
y_pred_linear = linear_model.predict(X_test_scaled)
print(y_pred_linear)

[12.43251884 10.1033668   6.39855304 ...  9.49046838 12.55811898
 10.25350696]


In [9]:
from sklearn.linear_model import Ridge

# Baseline 2: ridge regression. alpha can be changed, but the values tried
# so far all produced the same results.
ridge_model = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model.predict(X_test_scaled)
print(y_pred_ridge)

[12.4325047  10.10342754  6.39859466 ...  9.49054586 12.55808647
 10.25357024]


In [10]:
from sklearn.ensemble import RandomForestRegressor

# Baseline 3: random forest of 100 trees; random_state makes the fit reproducible.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)
print(y_pred_rf)

[12.41458626  9.91644057  6.73075517 ...  9.93590776 12.03413791
  9.96140595]


In [11]:
from sklearn.svm import SVR

# Baseline 4: support vector regression; the 'rbf' kernel can model
# non-linear relationships.
svr_model = SVR(kernel='rbf').fit(X_train_scaled, y_train)
y_pred_svr = svr_model.predict(X_test_scaled)
print(y_pred_svr)

[12.45193825 10.15166983  6.8171162  ...  9.66882448 12.24478698
 10.00594131]


In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Print the evaluation metrics (MAE, MSE, RMSE, R²) for one model.
def evaluate_model(y_test, y_pred, model_name):
    """Print MAE, MSE, RMSE and R² for one model's predictions.

    Parameters
    ----------
    y_test : true target values.
    y_pred : values predicted for the same samples.
    model_name : label shown in the report header.

    Nothing is returned. The report is written to standard output with
    every metric rounded to three decimals, followed by one blank line.
    """
    # All metrics are computed before anything is printed.
    mae, mse, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    report_lines = [
        f"--- {model_name} ---",
        f"Mean Absolute Error (MAE): {mae:.3f}",
        f"Mean Squared Error (MSE): {mse:.3f}",
        f"Root Mean Squared Error (RMSE): {np.sqrt(mse):.3f}",
        f"R-squared (R²): {r2:.3f}\n",
    ]
    print("\n".join(report_lines))

# Report the test-set metrics of each baseline model, one after the other.
evaluate_model(y_test, y_pred_linear, "Linear Regression")
evaluate_model(y_test, y_pred_ridge, "Ridge Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest Regressor")
evaluate_model(y_test, y_pred_svr, "Support Vector Regressor")



--- Linear Regression ---
Mean Absolute Error (MAE): 0.691
Mean Squared Error (MSE): 1.065
Root Mean Squared Error (RMSE): 1.032
R-squared (R²): 0.866

--- Ridge Regression ---
Mean Absolute Error (MAE): 0.691
Mean Squared Error (MSE): 1.065
Root Mean Squared Error (RMSE): 1.032
R-squared (R²): 0.866

--- Random Forest Regressor ---
Mean Absolute Error (MAE): 0.657
Mean Squared Error (MSE): 1.023
Root Mean Squared Error (RMSE): 1.012
R-squared (R²): 0.872

--- Support Vector Regressor ---
Mean Absolute Error (MAE): 0.640
Mean Squared Error (MSE): 1.034
Root Mean Squared Error (RMSE): 1.017
R-squared (R²): 0.870



# Hyperparameter Tuning

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

# Settings to try: with/without an intercept, and with/without the
# constraint that all coefficients are positive (four combinations).
param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False],
}

# Estimator whose settings are searched.
linear_model = LinearRegression()

# Exhaustive search scored by 5-fold cross-validated negative MSE.
grid_search = GridSearchCV(
    linear_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

# The scorer is negated MSE, so flip the sign to show a plain MSE.
print("Best Parameters:", grid_search.best_params_)
print("Best Score (MSE):", -grid_search.best_score_)

# Predict the test set with the best estimator found by the search.
best_linear_model = grid_search.best_estimator_
y_pred_linear_ht = best_linear_model.predict(X_test_scaled)

print("Predictions:", y_pred_linear_ht)

Best Parameters: {'fit_intercept': True, 'positive': False}
Best Score (MSE): 1.0653793417009676
Predictions: [12.43251884 10.1033668   6.39855304 ...  9.49046838 12.55811898
 10.25350696]


In [14]:

from sklearn.decomposition import PCA


# Project the standardised features onto 5 principal components. The
# projection is learned from the training split and reused on the test split.
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Ridge regression (alpha=0.01) fitted on the PCA-transformed training data.
ridge_model = Ridge(alpha=0.01).fit(X_train_pca, y_train)

# Predictions for the PCA-transformed test data.
y_pred_ridge_ht = ridge_model.predict(X_test_pca)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at a leaf node
    # Number of features to consider for the best split. 'auto' is no longer
    # accepted by recent scikit-learn releases; for a regressor it meant
    # "use all features", which is what 1.0 (100% of the features) selects.
    'max_features': [1.0, 'sqrt', 'log2'],
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Set up the RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)

# Set up GridSearchCV with 5-fold cross-validation. The grid holds
# 4 * 4 * 3 * 3 * 3 * 2 = 864 candidates (4320 fits), so spread the fits over
# all available CPU cores with n_jobs=-1; this does not change the result.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit the grid search to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and best score (scorer is negated MSE, so flip the sign)
print("Best Parameters:", grid_search.best_params_)
print("Best Score (MSE):", -grid_search.best_score_)

# Make predictions using the best model from GridSearchCV
best_rf_model = grid_search.best_estimator_
y_pred_rf_ht = best_rf_model.predict(X_test_scaled)

# Print predictions
print("Predictions:", y_pred_rf_ht)


In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for hyperparameter tuning.
# One sub-grid per kernel, listing only the parameters that kernel actually
# uses: 'degree' matters only for 'poly' and 'gamma' is not used by 'linear'.
# A single Cartesian grid would refit many identical models (288 candidates
# instead of the 144 distinct ones below).
_c_values = [0.1, 1, 10, 100]  # Regularization parameter
_epsilon_values = [0.01, 0.1, 0.2, 0.5]  # Epsilon parameter
_gamma_values = ['scale', 'auto']  # Kernel coefficient for 'rbf' and 'poly'
param_grid = [
    {'kernel': ['linear'], 'C': _c_values, 'epsilon': _epsilon_values},
    {'kernel': ['rbf'], 'C': _c_values, 'epsilon': _epsilon_values, 'gamma': _gamma_values},
    {
        'kernel': ['poly'],
        'C': _c_values,
        'epsilon': _epsilon_values,
        'degree': [2, 3, 4],  # Degree of the polynomial kernel function
        'gamma': _gamma_values,
    },
]

# Set up the SVR model; the kernel is chosen by the grid search
svr_model = SVR()

# Set up GridSearchCV with 5-fold cross-validation; n_jobs=-1 spreads the
# fits over all available CPU cores without changing the result.
grid_search = GridSearchCV(
    estimator=svr_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit the grid search to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and best score (scorer is negated MSE, so flip the sign)
print("Best Parameters:", grid_search.best_params_)
print("Best Score (MSE):", -grid_search.best_score_)

# Make predictions using the best model from GridSearchCV
best_svr_model = grid_search.best_estimator_
y_pred_svr_ht = best_svr_model.predict(X_test_scaled)

# Print predictions
print("Predictions:", y_pred_svr_ht)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Print the evaluation metrics (MAE, MSE, RMSE, R²) for one model.
def evaluate_model(y_test, y_pred, model_name):
    """Print MAE, MSE, RMSE and R² for one model's predictions.

    Parameters
    ----------
    y_test : true target values.
    y_pred : values predicted for the same samples.
    model_name : label shown in the report header.

    Nothing is returned. The report is written to standard output with
    every metric rounded to three decimals, followed by one blank line.
    """
    # All metrics are computed before anything is printed.
    mae, mse, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    report_lines = [
        f"--- {model_name} ---",
        f"Mean Absolute Error (MAE): {mae:.3f}",
        f"Mean Squared Error (MSE): {mse:.3f}",
        f"Root Mean Squared Error (RMSE): {np.sqrt(mse):.3f}",
        f"R-squared (R²): {r2:.3f}\n",
    ]
    print("\n".join(report_lines))

# Report the test-set metrics of each tuned model, one after the other.
# Each y_pred_*_ht array is produced by its own tuning cell, so that cell
# must have been run before this one.
evaluate_model(y_test, y_pred_linear_ht, "Linear Regression")
evaluate_model(y_test, y_pred_ridge_ht, "Ridge Regression")
evaluate_model(y_test, y_pred_rf_ht, "Random Forest Regressor")
evaluate_model(y_test, y_pred_svr_ht, "Support Vector Regressor")

