In [2]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import lightgbm as lgb

# --- Load Data ---
orders_df = pd.read_csv("/content/drive/MyDrive/GC-Tech/Raw data/order_data_last_six_month.xlsx - Worksheet.csv")
items_df = pd.read_csv("/content/drive/MyDrive/GC-Tech/Raw data/associated_order_item_data_last_six_month.xlsx - Worksheet.csv")

# --- Preprocessing ---
orders_df['order_date'] = pd.to_datetime(orders_df['order_date'], dayfirst=True)

# Base customer-date grid
customer_orders = orders_df[['customer_id', 'order_date']].copy()
customer_orders['order_placed'] = 1

min_date = orders_df['order_date'].min()
max_date = orders_df['order_date'].max()
date_range = pd.date_range(min_date, max_date)
customers = customer_orders['customer_id'].unique()

grid = pd.MultiIndex.from_product([customers, date_range], names=["customer_id", "order_date"])
full_df = pd.DataFrame(index=grid).reset_index()
full_df = full_df.merge(customer_orders, on=["customer_id", "order_date"], how="left")
full_df['order_placed'] = full_df['order_placed'].fillna(0).astype(int)

# --- Rolling Features ---
full_df = full_df.sort_values(by=['customer_id', 'order_date'])

# Compute days since last order safely and correctly
last_order_tracker = full_df.groupby('customer_id')['order_placed'].transform(
    lambda x: x.ne(0).cumsum().where(x == 1)
)
full_df['days_since_last_order'] = (
    last_order_tracker.groupby(full_df['customer_id']).ffill().groupby(full_df['customer_id']).cumcount()
)
full_df['days_since_last_order'] = full_df['days_since_last_order'].fillna(999)


full_df['orders_past_7d'] = full_df.groupby('customer_id')['order_placed'].transform(lambda x: x.rolling(7).sum())
full_df['orders_past_14d'] = full_df.groupby('customer_id')['order_placed'].transform(lambda x: x.rolling(14).sum())
full_df.fillna({'orders_past_7d': 0, 'orders_past_14d': 0}, inplace=True)

# --- Date Features ---
full_df['day_of_week'] = full_df['order_date'].dt.dayofweek
full_df['is_weekend'] = full_df['day_of_week'].isin([5, 6]).astype(int)
full_df['month'] = full_df['order_date'].dt.month
full_df['day'] = full_df['order_date'].dt.day

# --- Cumulative Behavior ---
full_df['cumulative_orders'] = full_df.groupby('customer_id')['order_placed'].cumsum()
order_counts = full_df.groupby('customer_id')['order_placed'].sum()
active_days = full_df.groupby('customer_id').size()
full_df['avg_order_frequency'] = full_df['customer_id'].map((order_counts / active_days).to_dict())

# --- Merge Items with Orders ---
orders_items_merged = items_df.merge(orders_df[['order_id', 'customer_id', 'order_date']], on='order_id', how='left')
orders_items_merged['order_date'] = pd.to_datetime(orders_items_merged['order_date'], dayfirst=True)

# Most common item per customer
most_common_item = (
    orders_items_merged.groupby(['customer_id', 'item_name'])
    .size()
    .reset_index(name='count')
    .sort_values(['customer_id', 'count'], ascending=[True, False])
    .drop_duplicates('customer_id')
    .set_index('customer_id')['item_name']
    .to_dict()
)
full_df['top_item'] = full_df['customer_id'].map(most_common_item)
le = LabelEncoder()
full_df['top_item_encoded'] = le.fit_transform(full_df['top_item'].fillna('Unknown'))

# Average days between orders
order_dates = orders_df.sort_values(by='order_date')[['customer_id', 'order_date']]
avg_days_between = (
    order_dates.groupby('customer_id')['order_date']
    .apply(lambda x: x.diff().dt.days.dropna().mean())
    .to_dict()
)
full_df['avg_days_between_orders'] = full_df['customer_id'].map(avg_days_between)

# Distinct items ordered
distinct_items = (
    orders_items_merged.groupby('customer_id')['item_name']
    .nunique()
    .to_dict()
)
full_df['distinct_items_ordered'] = full_df['customer_id'].map(distinct_items)

# Fill final NaNs
full_df.fillna({
    'avg_days_between_orders': 999,
    'distinct_items_ordered': 0
}, inplace=True)

# --- Prepare for Modeling ---
feature_cols = [
    'days_since_last_order', 'orders_past_7d', 'orders_past_14d',
    'day_of_week', 'is_weekend', 'month', 'day',
    'cumulative_orders', 'avg_order_frequency',
    'top_item_encoded', 'avg_days_between_orders',
    'distinct_items_ordered'
]

X = full_df[feature_cols]
y = full_df['order_placed']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# --- Train Model LightGBM Classifier ---
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

# --- Evaluate ---
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Optional: Predict probabilities for next 14 days (you can extend this as needed)


[LightGBM] [Info] Number of positive: 48344, number of negative: 557816
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1260
[LightGBM] [Info] Number of data points in the train set: 606160, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079755 -> initscore=-2.445687
[LightGBM] [Info] Start training from score -2.445687
              precision    recall  f1-score   support

           0       0.94      0.99      0.96    185939
           1       0.70      0.28      0.40     16115

    accuracy                           0.93    202054
   macro avg       0.82      0.63      0.68    202054
weighted avg       0.92      0.93      0.92    202054



**Saves the Model Params**

In [5]:
import joblib

# Save the Model
joblib.dump(clf, '/content/drive/MyDrive/GC-Tech/Models/Order_identification.pkl')

['/content/drive/MyDrive/GC-Tech/Models/Order_identification.pkl']

**Report for various Classification Models**

Gives classification report for various classification model.



1.   Random Forest
2.   Logistic Regression
3.   KNN
4.   XGBoost
5.   LightGBM  

The Best one among these 5 is ***LightGBM*** which gives an accuracy of 0.933. So, we are using LightGBM.



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
import lightgbm as lgb

models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    'Logistic Regression': LogisticRegression(max_iter=10000, class_weight='balanced'),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': lgb.LGBMClassifier()
}

for name, model in models.items():
    print(f"\n🔍 Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"✅ Results for {name}:")
    print(classification_report(y_test, y_pred, digits=3))
    print("-" * 60)


🔍 Training Random Forest...
✅ Results for Random Forest:
              precision    recall  f1-score   support

           0      0.938     0.984     0.961    185939
           1      0.580     0.248     0.347     16115

    accuracy                          0.926    202054
   macro avg      0.759     0.616     0.654    202054
weighted avg      0.909     0.926     0.912    202054

------------------------------------------------------------

🔍 Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


✅ Results for Logistic Regression:
              precision    recall  f1-score   support

           0      0.975     0.875     0.923    185939
           1      0.341     0.744     0.468     16115

    accuracy                          0.865    202054
   macro avg      0.658     0.810     0.695    202054
weighted avg      0.925     0.865     0.886    202054

------------------------------------------------------------

🔍 Training KNN...
✅ Results for KNN:
              precision    recall  f1-score   support

           0      0.936     0.980     0.957    185939
           1      0.492     0.221     0.305     16115

    accuracy                          0.920    202054
   macro avg      0.714     0.601     0.631    202054
weighted avg      0.900     0.920     0.905    202054

------------------------------------------------------------

🔍 Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



✅ Results for XGBoost:
              precision    recall  f1-score   support

           0      0.941     0.988     0.964    185939
           1      0.681     0.291     0.408     16115

    accuracy                          0.933    202054
   macro avg      0.811     0.640     0.686    202054
weighted avg      0.921     0.933     0.920    202054

------------------------------------------------------------

🔍 Training LightGBM...
[LightGBM] [Info] Number of positive: 48344, number of negative: 557816
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052372 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1260
[LightGBM] [Info] Number of data points in the train set: 606160, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.079755 -> initscore=-2.445687
[LightGBM] [Info] Start training from score -2.445687
✅ Res

**Generates Probability for next 14 Days**

Gives a CSV file including each customer_id and their respective probabilities of ordering the products in next 14 days.

This CSV is saved as "order_probability_next_14_days.csv"



In [3]:
# Get latest known features per customer
latest_features = full_df.sort_values("order_date").groupby("customer_id").tail(1).set_index("customer_id")

# Generate next 14 days
from datetime import timedelta
future_dates = pd.date_range(full_df['order_date'].max() + timedelta(days=1), periods=14)
customers = latest_features.index.unique()

# Step 3: Create customer × date grid
future_grid = pd.MultiIndex.from_product([customers, future_dates], names=["customer_id", "order_date"]).to_frame(index=False)

# Step 4: Copy static features from latest record
for feature in [
    'days_since_last_order', 'orders_past_7d', 'orders_past_14d',
    'cumulative_orders', 'avg_order_frequency',
    'top_item_encoded', 'avg_days_between_orders',
    'distinct_items_ordered'
]:
    future_grid[feature] = future_grid['customer_id'].map(latest_features[feature])

# Step 5: Add time-based features based on order_date
future_grid['day_of_week'] = future_grid['order_date'].dt.dayofweek
future_grid['is_weekend'] = future_grid['day_of_week'].isin([5, 6]).astype(int)
future_grid['month'] = future_grid['order_date'].dt.month
future_grid['day'] = future_grid['order_date'].dt.day

# Step 6: Predict
X_future = future_grid[feature_cols]  # same features used for training
future_grid['order_probability'] = clf.predict_proba(X_future)[:, 1]

# Step 7: Save only desired output
final_output = future_grid[['customer_id', 'order_date', 'order_probability']]
final_output.to_csv("order_probability_next_14_days.csv", index=False)

# Display sample
final_output.head(10)


Unnamed: 0,customer_id,order_date,order_probability
0,6065,2025-04-11,0.081989
1,6065,2025-04-12,0.055436
2,6065,2025-04-13,0.085386
3,6065,2025-04-14,0.10904
4,6065,2025-04-15,0.126007
5,6065,2025-04-16,0.243663
6,6065,2025-04-17,0.215187
7,6065,2025-04-18,0.211836
8,6065,2025-04-19,0.178101
9,6065,2025-04-20,0.152142
