## 📘 Tip Prediction Project

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, log_loss


### Load Data

In [None]:
# Orders are stored as Parquet; every other table is a CSV in the working
# directory. The CSVs are read in the order listed and unpacked into one
# DataFrame per table.
orders = pd.read_parquet("orders.parquet")
order_products, products, departments, aisles, tips = (
    pd.read_csv(csv_path)
    for csv_path in (
        "order_products.csv",
        "products.csv",
        "departments.csv",
        "aisles.csv",
        "tips.csv",
    )
)


### Merge and Aggregate

In [None]:
# Line-item table: one row per row of `order_products`, enriched via left
# joins with the matching product, order, tip, aisle and department rows.
# Left joins keep every order line even when a lookup table has no match.
# NOTE(review): `tips` is joined again at order level further down; if none of
# the columns aggregated below originate from `tips`, this join is redundant
# (and would inflate `total_items` should `tips` ever hold several rows per
# order_id) -- confirm against the table schemas before removing it.
merged = (
    order_products
    .merge(products, on="product_id", how="left")
    .merge(orders, on="order_id", how="left")
    .merge(tips, on="order_id", how="left")
    .merge(aisles, on="aisle_id", how="left")
    .merge(departments, on="department_id", how="left")
)


def _first_value(values):
    """Return the first element of a group, missing values included."""
    return values.iloc[0]


# Compute the row-wise helper columns once, vectorised, instead of re-parsing
# `order_date` (twice) and lower-casing `department` inside per-group lambdas.
# The helpers live on a temporary frame so `merged` keeps its original columns.
_order_ts = pd.to_datetime(merged["order_date"])
_line_items = merged.assign(
    _is_dairy=merged["department"].str.lower().eq("dairy eggs"),
    _order_hour=_order_ts.dt.hour,
    _order_weekday=_order_ts.dt.dayofweek,
)

# One row per order:
#   total_items        - number of non-null product_id rows in the order
#   unique_departments - distinct department names in the order
#   unique_aisles      - distinct aisle names in the order
#   has_dairy          - True if any line's department is "dairy eggs"
#                        (case-insensitive)
#   order_hour/weekday - hour / day-of-week of the first line's order_date
order_features = _line_items.groupby("order_id").agg(
    total_items=("product_id", "count"),
    unique_departments=("department", "nunique"),
    unique_aisles=("aisle", "nunique"),
    has_dairy=("_is_dairy", "any"),
    order_hour=("_order_hour", _first_value),
    order_weekday=("_order_weekday", _first_value),
).reset_index()

# Attach the tip information at order level.
df = order_features.merge(tips, on="order_id", how="left")
# label is 1 when the joined `tip` value is non-null, 0 otherwise.
# NOTE(review): this encodes "a tip value exists", not the tip value itself;
# if `tip` is a yes/no flag with nulls for unlabeled orders, the target should
# come from the value instead -- confirm against the tips.csv schema.
df["label"] = df["tip"].notna().astype(int)
# Plain copy of the label; not read anywhere else in the visible notebook.
df["probability"] = df["label"]


### Train-Test Split

In [None]:
# Columns of `df` used as model inputs.
features = [
    "total_items",
    "unique_departments",
    "unique_aisles",
    "has_dairy",
    "order_hour",
    "order_weekday",
]

# Feature matrix: a fresh frame holding only the feature columns, with the
# has_dairy flag cast to integer. `df` itself is left untouched.
X = df[features].assign(has_dairy=lambda frame: frame["has_dairy"].astype(int))
y = df["label"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Model Training

In [None]:
# Fit a random forest (seeded for reproducibility) on the training split.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions, and column 1 of the predicted class probabilities,
# for the held-out rows.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Evaluate on the held-out rows: per-class report from the hard predictions,
# ROC AUC from the probability scores.
report_text = classification_report(y_test, y_pred)
auc_value = roc_auc_score(y_test, y_proba)
print("Classification Report:\n", report_text)
print("ROC AUC:", auc_value)


### Output Submission Files

In [None]:
# Submission A: order_id plus the predicted class for every row of X.
submission_A = df[["order_id"]].assign(tip=clf.predict(X))
submission_A.to_csv("Team_X_submission_A.csv", index=False)

# Submission B: order_id plus column 1 of the predicted class probabilities
# for every row of X.
submission_B = df[["order_id"]].assign(tip=clf.predict_proba(X)[:, 1])
submission_B.to_csv("Team_X_submission_B.csv", index=False)


### Feature Importance

In [None]:
# Importance score of each feature in the fitted forest, labelled by feature
# name and drawn as a horizontal bar chart sorted in ascending order.
importances = pd.Series(clf.feature_importances_, index=features)
ranked_importances = importances.sort_values()
ranked_importances.plot.barh(title="Feature Importance")
plt.tight_layout()
plt.show()
