## 1. Project Header & Path Setup
This first cell adds the project's `code` folder to `sys.path` so that the `time_operation` decorator can be imported from `utils`.

In [1]:
import sys
import pandas as pd
from pathlib import Path

# Resolve the project layout relative to the current working directory:
# the parent of the working directory is treated as the project root.
project_root = Path.cwd().parent
code_path, results_dir = project_root / "code", project_root / "results"

# Register the code folder for imports; the membership check keeps
# re-running this cell from appending duplicate sys.path entries.
if str(code_path) not in sys.path:
    sys.path.append(str(code_path))

from utils import time_operation

## 2. Loading the Baskets
Use a timed function to load your DataFrames. This helps document the I/O overhead in your report.

In [3]:
@time_operation
def load_checkpoints():
    """Read the user-level and session-level basket checkpoints from results_dir.

    Returns the pair (user frame, session frame). The time_operation wrapper
    adds a timing value, which the caller below unpacks next to that pair.
    """
    # NOTE: unpickling can execute arbitrary code -- only load checkpoint
    # files that this project produced itself.
    checkpoint_names = ("user_baskets.pkl", "session_baskets.pkl")
    user_frame, session_frame = (
        pd.read_pickle(results_dir / name) for name in checkpoint_names
    )
    return user_frame, session_frame

# The decorated call yields ((frames), elapsed); the elapsed value is reported as ms.
(df_user, df_session), load_time = load_checkpoints()
print(f"Loaded DataFrames in {load_time:.2f} ms")


Loaded DataFrames in 284.18 ms


In [4]:
# Print the schema summary of the user-level frame, then the session-level frame
for frame in (df_user, df_session):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2352 entries, 0 to 2351
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  2352 non-null   int32 
 1   basket   2352 non-null   object
dtypes: int32(1), object(1)
memory usage: 27.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20772 entries, 0 to 20771
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   user_id  20772 non-null  int32         
 1   date     20772 non-null  datetime64[us]
 2   basket   20772 non-null  object        
dtypes: datetime64[us](1), int32(1), object(1)
memory usage: 405.8+ KB


In [None]:
# A notebook cell only renders its final expression, so a bare
# `df_user.sample(10)` on an earlier line would be computed and discarded.
# display() (provided by IPython in notebook sessions) renders both samples.
display(df_user.sample(10), df_session.sample(10))

## 3. Transaction Encoding (One-Hot Encoding)
The mlxtend algorithms require a boolean (one-hot) matrix. The encoding step is wrapped in the `time_operation` decorator to show the cost of this data transformation.

In [5]:
from mlxtend.preprocessing import TransactionEncoder

@time_operation
def encode_data(baskets):
    """One-hot encode a collection of baskets with mlxtend's TransactionEncoder.

    Args:
        baskets: the transactions to encode (here, a DataFrame 'basket' column).

    Returns:
        A DataFrame built from the encoder's transformed array, with one
        column per item label learned during fitting (``columns_``).
        The time_operation wrapper adds a timing value, which the callers
        below unpack alongside the frame.
    """
    encoder = TransactionEncoder()
    encoder.fit(baskets)
    onehot = encoder.transform(baskets)
    return pd.DataFrame(onehot, columns=encoder.columns_)

# Encode both granularities: one row per user, and one row per session
user_encoded, u_enc_time = encode_data(df_user['basket'])
session_encoded, s_enc_time = encode_data(df_session['basket'])

print(f"User matrix shape: {user_encoded.shape} (Encoded in {u_enc_time:.2f} ms)")
print(f"Session matrix shape: {session_encoded.shape} (Encoded in {s_enc_time:.2f} ms)")

User matrix shape: (2352, 112) (Encoded in 46.50 ms)
Session matrix shape: (20772, 112) (Encoded in 240.97 ms)


## 4. Apriori Parameter Fine-Tuning
We will iterate through different support and confidence thresholds. The goal is to find the **elbow point** — the settings that provide enough rules to be insightful without generating thousands of redundant ones.

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd
from utils import time_operation

# Define the tuning grid
support_levels = [0.05, 0.03, 0.02, 0.01]
confidence_levels = [0.5, 0.6, 0.7]
tuning_results = []  # reset on every run so re-executing the cell never duplicates rows

# Wrap apriori once rather than re-decorating it on every loop iteration
timed_apriori = time_operation(apriori)

print("--- Starting Apriori Fine-Tuning (Session Level) ---")

for supp in support_levels:
    # Measure Frequent Itemset generation time.
    # Itemsets depend only on the support threshold, so they are mined once
    # per support level and reused for every confidence threshold below.
    itemsets, duration = timed_apriori(session_encoded, min_support=supp, use_colnames=True)

    for conf in confidence_levels:
        if itemsets.empty:
            # association_rules raises ValueError when given an empty itemset
            # frame; record a zero-rule row instead of aborting the whole grid.
            rule_count, avg_lift = 0, 0
        else:
            rules = association_rules(itemsets, metric="confidence", min_threshold=conf)
            rule_count = len(rules)
            avg_lift = rules['lift'].mean() if not rules.empty else 0

        tuning_results.append({
            "Algorithm": "Apriori",
            "Support": supp,
            "Confidence": conf,
            # Covers only the apriori (itemset mining) step; the same value is
            # repeated for each confidence level at this support threshold.
            "Time_ms": duration,
            "Rule_Count": rule_count,
            "Avg_Lift": avg_lift
        })

# Save tuning log for the report table
df_apriori_log = pd.DataFrame(tuning_results)
df_apriori_log.to_csv(results_dir / "apriori_tuning_log.csv", index=False)
df_apriori_log