# 📊 Restaurant Data Warehouse + ML Pipeline + Scalability (Colab Ready)

This notebook lets you:
1. Build features and the label (whether a customer places another completed order within 30 days).
2. Train Decision Tree & Naive Bayes.
3. Evaluate with precision, recall, F1, accuracy, ROC-AUC.
4. Analyze **Data Mining Scalability**:
   - Runtime vs dataset size
   - Runtime vs number of features

---
✅ All required packages come pre-installed in Colab: `pandas`, `numpy`, `scikit-learn`, `joblib`, `matplotlib`, `sqlalchemy`

In [None]:
# 📂 Load dataset directly from GitHub
import pandas as pd

# Raw orders CSV hosted on GitHub. The 'timestamp' column is parsed into
# datetimes while reading so that later cells can use the .dt accessor on it.
url = "https://raw.githubusercontent.com/avinash972/restaurant-dw-ml/main/restaurant_orders.csv"
df_orders = pd.read_csv(
    url,
    parse_dates=['timestamp'],
)
# Quick sanity check: row count, then a preview of the first rows.
print("Rows loaded:", df_orders.shape[0])
df_orders.head()

In [None]:
# 🔧 Feature Engineering + Label Creation
import numpy as np

# Work on a copy ordered by customer and time so that the per-customer
# shift() calls below refer to the chronologically previous/next order.
orders = df_orders.copy()
orders = orders.sort_values(['customer_id','timestamp']).reset_index(drop=True)

# Label: return within 30 days
# Only completed orders count as a "return": for each completed order, find the
# same customer's next completed order and flag it when the gap (in whole days)
# is at most 30. Orders without a later completed order get 0.
# NOTE(review): orders placed in the last 30 days of the data are labelled 0
# even though their follow-up window is not fully observed - confirm intended.
comp = orders[orders['status']=='completed'].sort_values(['customer_id','timestamp']).reset_index(drop=True)
comp['next_ts'] = comp.groupby('customer_id')['timestamp'].shift(-1)
comp['days_to_next'] = (comp['next_ts'] - comp['timestamp']).dt.days
comp['return_30d'] = ((comp['days_to_next'].notna()) & (comp['days_to_next'] <= 30)).astype(int)
# Map the label back by order_id; non-completed orders are absent from the map
# and therefore fall back to 0.
label_map = comp.set_index('order_id')['return_30d'].to_dict()
orders['return_30d'] = orders['order_id'].map(label_map).fillna(0).astype(int)

# Features
# Days since the customer's previous order (any status); 999 is the sentinel
# used when there is no previous order.
orders['prev_ts'] = orders.groupby('customer_id')['timestamp'].shift(1)
orders['recency_days'] = (orders['timestamp'] - orders['prev_ts']).dt.days.fillna(999).astype(int)
# Amount per item; a zero item count is replaced by 1 to avoid division by zero.
orders['avg_ticket'] = orders['total_amount'] / orders['num_items'].replace(0,1)
orders['order_hour'] = orders['timestamp'].dt.hour
# pandas numbers weekdays Monday=0, so 5 and 6 are Saturday and Sunday.
orders['is_weekend'] = orders['timestamp'].dt.dayofweek.isin([5,6]).astype(int)

# 90-day frequency
# For every order, count the same customer's completed orders whose timestamp
# lies in [ts - 90 days, ts). Instead of re-filtering the whole frame for each
# row (quadratic in the number of orders), sort each customer's completed
# timestamps once and locate both window edges with binary search.
order_ts = orders['timestamp'].to_numpy(dtype='datetime64[ns]')
is_completed = (orders['status']=='completed').to_numpy()
window = np.timedelta64(90, 'D')
freq90 = np.zeros(len(orders), dtype=np.int64)
for _, cust_orders in orders.groupby('customer_id', sort=False):
    # After reset_index(drop=True) the index labels are also row positions.
    rows = cust_orders.index.to_numpy()
    cust_ts = order_ts[rows]
    done_ts = cust_ts[is_completed[rows]]
    # Missing timestamps can never fall inside a window, so drop them.
    done_ts = np.sort(done_ts[~np.isnat(done_ts)])
    # side='left' counts strictly earlier orders, matching "< ts" and ">= ts - 90d".
    n_before = np.searchsorted(done_ts, cust_ts, side='left')
    n_before_window = np.searchsorted(done_ts, cust_ts - window, side='left')
    freq90[rows] = n_before - n_before_window
orders['freq_90d'] = freq90

# Final modeling dataset
# Only completed orders are modelled; other statuses were needed above solely
# for the recency feature.
model_df = orders[orders['status']=='completed'].copy()
print('Model dataset rows:', len(model_df))
model_df.head()

In [None]:
# 🤖 Train & Evaluate Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import joblib, os

# Time-based split: everything before the cut-off trains the models, everything
# on or after it is held out for evaluation.
split_date = pd.to_datetime('2024-07-01')
train = model_df[model_df['timestamp'] < split_date].copy()
test = model_df[model_df['timestamp'] >= split_date].copy()

feature_cols = ['total_amount','num_items','recency_days','avg_ticket','order_hour','is_weekend','freq_90d']
channel_cols = ['channel_takeaway','channel_delivery']
# One-hot encode the channel WITHOUT drop_first. With drop_first=True the
# dropped category depends on which channels happen to occur in each split, so
# train and test could be encoded against different baselines, and one of the
# two indicator columns used below could be dropped and then back-filled as a
# constant 0. Keeping all dummies and selecting the two indicators explicitly
# makes every other channel the baseline in both splits.
train = pd.get_dummies(train, columns=['channel'])
test = pd.get_dummies(test, columns=['channel'])
for c in channel_cols:
    # A channel that never occurs in a split still needs its (all-zero) column.
    if c not in train.columns: train[c]=0
    if c not in test.columns: test[c]=0
feature_cols += channel_cols

# Remaining missing feature values are treated as 0.
X_train, y_train = train[feature_cols].fillna(0), train['return_30d']
X_test, y_test = test[feature_cols].fillna(0), test['return_30d']

# A shallow, regularised tree and a Gaussian Naive Bayes baseline.
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=5, random_state=42)
nb = GaussianNB()

dt.fit(X_train, y_train)
nb.fit(X_train, y_train)

# Hard predictions plus the probability in column 1 of predict_proba, which is
# the positive class (1) when both labels occur in the training data.
y_dt, y_nb = dt.predict(X_test), nb.predict(X_test)
p_dt, p_nb = dt.predict_proba(X_test)[:,1], nb.predict_proba(X_test)[:,1]

def metrics(y_true,y_pred,y_proba):
    """Return the headline classification scores, each rounded to 3 decimals.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: Predicted scores for the positive class, used for ROC-AUC.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``
        and ``roc_auc``. Precision, recall and F1 are computed with
        ``zero_division=0``. ``roc_auc`` is NaN when ``y_true`` holds fewer
        than two distinct labels.
    """
    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
    }
    # ROC-AUC is only attempted when more than one distinct label is present.
    if len(set(y_true)) > 1:
        scores['roc_auc'] = roc_auc_score(y_true, y_proba)
    else:
        scores['roc_auc'] = float('nan')
    return {name: round(value, 3) for name, value in scores.items()}

# Headline scores for each model on the held-out test set.
for model_label, model_preds, model_probs in (
    ('Decision Tree:', y_dt, p_dt),
    ('Naive Bayes:', y_nb, p_nb),
):
    print(model_label, metrics(y_test, model_preds, model_probs))

# Per-class precision/recall/F1 breakdown, Decision Tree first.
for report_heading, model_preds in (
    ('\nClassification Report (Decision Tree):\n', y_dt),
    ('\nClassification Report (Naive Bayes):\n', y_nb),
):
    print(report_heading, classification_report(y_test, model_preds, zero_division=0))

# Persist both fitted models and the modelling dataset under outputs/.
os.makedirs('outputs', exist_ok=True)
for fitted_model, artifact_path in (
    (dt, 'outputs/decision_tree.joblib'),
    (nb, 'outputs/naive_bayes.joblib'),
):
    joblib.dump(fitted_model, artifact_path)
model_df.to_csv('outputs/model_dataset.csv', index=False)
print('Artifacts saved in outputs/')

## ⚡ Data Mining Scalability

We now measure how the training time of each model changes as:
1. **Dataset size increases** (number of training rows).
2. **Number of features increases** (number of input columns).

In [None]:
import time, matplotlib.pyplot as plt

# Row scalability: time a fresh fit of each model on growing prefixes of the
# training set.
n_train = len(X_train)
# Clamp the requested sizes to the rows actually available. Slicing past the
# end silently returns fewer rows, which would put points at x-positions that
# do not match the data that was really fitted. The set removes duplicates
# created by clamping and sorted() keeps the x-axis increasing.
sizes = sorted({min(s, n_train) for s in (1000, 2000, 4000, 8000, n_train)})
dt_times, nb_times = [], []

for s in sizes:
    # .iloc makes the "first s rows" selection explicitly positional.
    Xs, ys = X_train.iloc[:s], y_train.iloc[:s]

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump when the system clock is adjusted.
    t0 = time.perf_counter(); DecisionTreeClassifier(max_depth=6).fit(Xs, ys); dt_times.append(time.perf_counter()-t0)
    t0 = time.perf_counter(); GaussianNB().fit(Xs, ys); nb_times.append(time.perf_counter()-t0)

plt.plot(sizes, dt_times, 'o-', label='Decision Tree')
plt.plot(sizes, nb_times, 'o-', label='Naive Bayes')
plt.xlabel('Training Set Size (rows)'); plt.ylabel('Training Time (s)'); plt.title('Scalability: Rows vs Time')
plt.legend(); plt.show()

# Feature scalability: duplicate columns artificially
# The training matrix is repeated side by side 1x, 2x and 4x so the number of
# columns grows while the number of rows stays fixed.
multipliers = [1, 2, 4]
feat_counts = [len(feature_cols) * m for m in multipliers]
dt_feat_times, nb_feat_times = [], []

for m in multipliers:
    # Using the integer multiplier directly avoids recovering it through a
    # float division of the feature count.
    Xf = pd.concat([X_train]*m, axis=1)

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump when the system clock is adjusted.
    t0 = time.perf_counter(); DecisionTreeClassifier(max_depth=6).fit(Xf, y_train); dt_feat_times.append(time.perf_counter()-t0)
    t0 = time.perf_counter(); GaussianNB().fit(Xf, y_train); nb_feat_times.append(time.perf_counter()-t0)

plt.plot(feat_counts, dt_feat_times, 'o-', label='Decision Tree')
plt.plot(feat_counts, nb_feat_times, 'o-', label='Naive Bayes')
plt.xlabel('Number of Features'); plt.ylabel('Training Time (s)'); plt.title('Scalability: Features vs Time')
plt.legend(); plt.show()