# 📊 Restaurant Data Warehouse + ML Pipeline (Colab Ready)

This notebook lets you:
1. Upload your restaurant orders dataset (CSV).
2. Build features and the label (`return_30d`: whether the customer places another completed order within 30 days).
3. Train Decision Tree & Naive Bayes.
4. Evaluate with precision, recall, F1, accuracy, ROC-AUC.

---

### ✅ Instructions
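- Prepare a CSV with columns:
  `order_id, customer_id, timestamp, total_amount, num_items, status, channel`
- `timestamp` must be parseable as a date/time; only rows with `status` equal to `completed` are used for modeling; the `channel` values `takeaway` and `delivery` are encoded as model features.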
- Upload it when prompted below.


In [None]:
# Install required packages (if missing)
!pip install pandas numpy scikit-learn joblib matplotlib sqlalchemy

In [None]:
# 📂 Upload your CSV
from google.colab import files
import pandas as pd

# Columns that the feature-engineering and modeling cells read. Checking them
# here makes a malformed file fail immediately with a clear message instead of
# a KeyError several cells later.
REQUIRED_COLUMNS = ['order_id', 'customer_id', 'timestamp', 'total_amount',
                    'num_items', 'status', 'channel']

uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload widget (presumably a cancelled dialog);
    # indexing the first key would otherwise raise a bare IndexError.
    raise ValueError('No file was uploaded - re-run this cell and choose a CSV.')

file_name = next(iter(uploaded))  # use the first uploaded file
df_orders = pd.read_csv(file_name, parse_dates=['timestamp'])

missing_cols = [col for col in REQUIRED_COLUMNS if col not in df_orders.columns]
if missing_cols:
    raise ValueError(f'CSV is missing required columns: {missing_cols}')

# read_csv can leave the column as plain strings when values cannot be parsed;
# the later cells rely on the .dt accessor, so stop here if that happened.
if not pd.api.types.is_datetime64_any_dtype(df_orders['timestamp']):
    raise ValueError("Column 'timestamp' could not be parsed as dates - check its format.")

print('Rows loaded:', len(df_orders))
df_orders.head()

In [None]:
# 🔧 Feature Engineering + Label Creation
import numpy as np


def count_prior_completed(frame, window_days=90):
    """Count, per row, the customer's earlier completed orders inside a look-back window.

    For a row with timestamp ``ts`` the count covers rows of the same
    ``customer_id`` whose ``status`` is ``'completed'`` and whose timestamp
    lies in ``[ts - window_days, ts)``. Rows with a missing timestamp or a
    missing customer_id get a count of 0 and are never counted as prior orders.

    Parameters
    ----------
    frame : DataFrame with ``customer_id``, ``timestamp`` and ``status`` columns.
    window_days : length of the look-back window in days.

    Returns
    -------
    numpy int64 array aligned positionally with ``frame``.
    """
    counts = np.zeros(len(frame), dtype='int64')
    # Work on a positional copy so the result does not depend on frame's index.
    lookup = pd.DataFrame({
        'customer_id': frame['customer_id'].to_numpy(),
        'ts': frame['timestamp'].to_numpy(dtype='datetime64[ns]'),
        'completed': (frame['status'] == 'completed').to_numpy(),
    })
    lookup = lookup[lookup['ts'].notna()]
    window = np.timedelta64(window_days, 'D')

    # Sorted completed-order timestamps for each customer.
    completed_ts = {
        cust: np.sort(ts.to_numpy())
        for cust, ts in lookup.loc[lookup['completed']].groupby('customer_id', sort=False)['ts']
    }

    for cust, ts in lookup.groupby('customer_id', sort=False)['ts']:
        prior = completed_ts.get(cust)
        if prior is None:
            continue  # customer has no completed orders at all
        stamps = ts.to_numpy()
        # side='left' counts strictly-earlier elements, so the difference is
        # the number of completed orders t with  stamps - window <= t < stamps.
        before_now = np.searchsorted(prior, stamps, side='left')
        before_window = np.searchsorted(prior, stamps - window, side='left')
        counts[ts.index.to_numpy()] = before_now - before_window
    return counts


# Sort chronologically within each customer; sort_values returns a new frame,
# so df_orders itself is left untouched.
orders = df_orders.sort_values(['customer_id','timestamp']).reset_index(drop=True)

# Label: 1 when the same customer's next completed order follows within 30 days.
# NOTE(review): a completed order with no later completed order is labelled 0,
# which includes orders placed near the end of the data where a full 30-day
# follow-up window may not be observable - confirm this is acceptable.
comp = orders[orders['status']=='completed'].sort_values(['customer_id','timestamp']).reset_index(drop=True)
comp['next_ts'] = comp.groupby('customer_id')['timestamp'].shift(-1)
comp['days_to_next'] = (comp['next_ts'] - comp['timestamp']).dt.days
comp['return_30d'] = ((comp['days_to_next'].notna()) & (comp['days_to_next'] <= 30)).astype(int)
label_map = comp.set_index('order_id')['return_30d'].to_dict()
# Orders that are not completed have no entry in label_map and default to 0.
orders['return_30d'] = orders['order_id'].map(label_map).fillna(0).astype(int)

# Features
orders['prev_ts'] = orders.groupby('customer_id')['timestamp'].shift(1)
# 999 is the sentinel for "no previous order from this customer".
orders['recency_days'] = (orders['timestamp'] - orders['prev_ts']).dt.days.fillna(999).astype(int)
# Amount per item; a zero item count is treated as 1 to avoid division by zero.
orders['avg_ticket'] = orders['total_amount'] / orders['num_items'].replace(0,1)
orders['order_hour'] = orders['timestamp'].dt.hour
orders['is_weekend'] = orders['timestamp'].dt.dayofweek.isin([5,6]).astype(int)

# 90-day frequency: completed orders by the same customer in the preceding
# 90 days. Computed per customer with a binary search instead of re-filtering
# the whole frame once per row.
orders['freq_90d'] = count_prior_completed(orders, window_days=90)

# Modeling dataset: only completed orders are modeled.
model_df = orders[orders['status']=='completed'].copy()
print('Model dataset rows:', len(model_df))
model_df.head()

In [None]:
# 🤖 Train & Evaluate Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import joblib, os

# Train/test split: time-based, so the models are evaluated on orders that
# come after everything they were trained on.
split_date = pd.to_datetime('2024-07-01')
train = model_df[model_df['timestamp'] < split_date].copy()
test = model_df[model_df['timestamp'] >= split_date].copy()
if train.empty or test.empty:
    raise ValueError(
        f'Split at {split_date.date()} leaves {len(train)} training and {len(test)} test rows; '
        'choose a split_date that falls inside the range of the uploaded data.'
    )

feature_cols = ['total_amount','num_items','recency_days','avg_ticket','order_hour','is_weekend','freq_90d']

# Encode channel as explicit indicator columns. Calling get_dummies with
# drop_first=True separately on each split drops whichever level sorts first
# in that split, so an expected column could be dropped and then re-added as a
# constant 0, and train/test could end up with different encodings. Building
# the indicators directly gives both splits the same columns; every other
# channel value is the all-zeros baseline.
for level in ['takeaway', 'delivery']:
    indicator = f'channel_{level}'
    train[indicator] = (train['channel'] == level).astype(int)
    test[indicator] = (test['channel'] == level).astype(int)
    feature_cols.append(indicator)

X_train, y_train = train[feature_cols].fillna(0), train['return_30d']
X_test, y_test = test[feature_cols].fillna(0), test['return_30d']

if y_train.nunique() < 2:
    # predict_proba(...)[:, 1] below needs both classes in the training data.
    raise ValueError('Training data contains a single class of return_30d; cannot train a classifier.')

# Models
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=5, random_state=42)
nb = GaussianNB()

dt.fit(X_train, y_train)
nb.fit(X_train, y_train)

# Predictions: hard labels plus the probability of the positive class.
y_dt, y_nb = dt.predict(X_test), nb.predict(X_test)
p_dt, p_nb = dt.predict_proba(X_test)[:,1], nb.predict_proba(X_test)[:,1]

def metrics(y_true,y_pred,y_proba):
    """Summarise binary-classification quality as a dict of scores rounded to 3 decimals.

    y_true  -- true labels
    y_pred  -- predicted labels
    y_proba -- predicted scores for the positive class (used for ROC-AUC only)

    Returns a dict with keys accuracy, precision, recall, f1 and roc_auc.
    roc_auc is NaN when y_true contains a single distinct value.
    """
    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
    }
    # Skip the ROC-AUC computation when only one class is present in y_true.
    if len(set(y_true)) > 1:
        scores['roc_auc'] = roc_auc_score(y_true, y_proba)
    else:
        scores['roc_auc'] = float('nan')
    return {name: round(value, 3) for name, value in scores.items()}

# Headline metrics, one line per model.
for heading, labels_hat, scores_hat in (('Decision Tree:', y_dt, p_dt), ('Naive Bayes:', y_nb, p_nb)):
    print(heading, metrics(y_test, labels_hat, scores_hat))

# Per-class breakdown for each model.
for heading, labels_hat in (
    ('\nClassification Report (Decision Tree):\n', y_dt),
    ('\nClassification Report (Naive Bayes):\n', y_nb),
):
    print(heading, classification_report(y_test, labels_hat, zero_division=0))

# Save artifacts: both fitted models and the modeling dataset.
os.makedirs('outputs', exist_ok=True)
for fitted, target in ((dt, 'outputs/decision_tree.joblib'), (nb, 'outputs/naive_bayes.joblib')):
    joblib.dump(fitted, target)
model_df.to_csv('outputs/model_dataset.csv', index=False)
print('Artifacts saved in outputs/')

## ⚡ Data Mining Scalability Analysis

Now we compare **Decision Tree vs. Naive Bayes** in terms of:
- ⏱️ Training Time
- 📦 Memory Usage
- 🎯 Accuracy vs. Time trade-off

This helps evaluate which model scales better for larger restaurant datasets.

In [None]:
# ⏱️ Runtime Comparison
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Build the full design matrix from the already-encoded train/test splits.
# model_df itself has no channel_* indicator columns (they are only added to
# the train/test copies), so model_df[feature_cols] raises a KeyError.
X = pd.concat([X_train, X_test]).to_numpy(dtype=float)
y = pd.concat([y_train, y_test]).to_numpy()

# Decision Tree timing (perf_counter is a monotonic, high-resolution clock
# intended for measuring short durations).
start = time.perf_counter()
dt_test = DecisionTreeClassifier(max_depth=6, min_samples_leaf=5, random_state=42)
dt_test.fit(X, y)
dt_time = time.perf_counter() - start

# Naive Bayes timing
start = time.perf_counter()
nb_test = GaussianNB()
nb_test.fit(X, y)
nb_time = time.perf_counter() - start

print(f"Decision Tree Training Time: {dt_time:.4f} sec")
print(f"Naive Bayes Training Time: {nb_time:.4f} sec")

In [None]:
# 📦 Memory Usage Comparison
!pip install -q memory_profiler
from memory_profiler import memory_usage

def train_dt():
    """Fit a decision tree (max depth 6, at least 5 samples per leaf) on the notebook-level X, y and return it."""
    # fit() returns the fitted estimator itself.
    return DecisionTreeClassifier(max_depth=6, min_samples_leaf=5).fit(X, y)

def train_nb():
    """Fit a Gaussian Naive Bayes model on the notebook-level X, y and return it."""
    # fit() returns the fitted estimator itself.
    return GaussianNB().fit(X, y)

# Sample memory while each training function runs and keep the largest sample.
# NOTE(review): memory_usage presumably reports whole-process memory, so these
# figures would include the notebook's baseline footprint - confirm.
dt_samples = memory_usage((train_dt,))
dt_mem = max(dt_samples)
nb_samples = memory_usage((train_nb,))
nb_mem = max(nb_samples)

print(f"Decision Tree Peak Memory: {dt_mem:.2f} MB")
print(f"Naive Bayes Peak Memory: {nb_mem:.2f} MB")

In [None]:
# 📊 Plot Accuracy vs. Time vs. Memory
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

models = ["Decision Tree", "Naive Bayes"]

# dt_acc / nb_acc are not assigned anywhere earlier in the notebook, so compute
# them here from the held-out predictions made in the training cell.
dt_acc = accuracy_score(y_test, y_dt)
nb_acc = accuracy_score(y_test, y_nb)

accuracy = [dt_acc, nb_acc]
time_taken = [dt_time, nb_time]
memory_used = [dt_mem, nb_mem]

fig, axes = plt.subplots(1,3, figsize=(15,4))

# One panel per measurement, same model order and colours in each.
axes[0].bar(models, accuracy, color=['skyblue','salmon'])
axes[0].set_title("Accuracy")
axes[0].set_ylabel("Test accuracy")

axes[1].bar(models, time_taken, color=['skyblue','salmon'])
axes[1].set_title("Training Time (s)")
axes[1].set_ylabel("Seconds")

axes[2].bar(models, memory_used, color=['skyblue','salmon'])
axes[2].set_title("Peak Memory (MB)")
axes[2].set_ylabel("MB")

plt.suptitle("⚡ Scalability: Decision Tree vs Naive Bayes")
plt.show()