<a href="https://colab.research.google.com/github/boiBASH/Tolaram_Project/blob/main/ML_Model_for_Customer_Profilling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Load and clean data
df = pd.read_csv('/content/drive/MyDrive/Data Analysis - Sample File.csv')
df = df.rename(columns={
    'Delivered_date': 'delivered_date',
    'Delivered Qty': 'delivered_qty',
    'Redistribution Value': 'spend'
})
# Updated datetime parsing with dayfirst=True
df['delivered_date'] = pd.to_datetime(df['delivered_date'], errors='coerce', dayfirst=True)
df['spend'] = df['spend'].str.replace(',', '').astype(float)
df = df[df['delivered_qty'] > 0].sort_values(['Customer_Phone', 'SKU_Code', 'delivered_date'])

# Feature engineering
df['prev_date'] = df.groupby(['Customer_Phone', 'SKU_Code'])['delivered_date'].shift(1)
df['recency_days'] = (df['delivered_date'] - df['prev_date']).dt.days.fillna(-1)
df['cum_freq'] = df.groupby(['Customer_Phone', 'SKU_Code']).cumcount()
df['cum_spend'] = df.groupby(['Customer_Phone', 'SKU_Code'])['spend'].cumsum() - df['spend']
df['avg_spend_past'] = df['cum_spend'] / df['cum_freq'].replace(0, 1)
df['cum_qty'] = df.groupby(['Customer_Phone', 'SKU_Code'])['delivered_qty'].cumsum() - df['delivered_qty']
df['avg_qty_past'] = df['cum_qty'] / df['cum_freq'].replace(0, 1)

# Compute targets
df['next_date'] = df.groupby(['Customer_Phone', 'SKU_Code'])['delivered_date'].shift(-1)
df['next_days'] = (df['next_date'] - df['delivered_date']).dt.days
df['next_spend'] = df.groupby(['Customer_Phone', 'SKU_Code'])['spend'].shift(-1)
df['next_qty'] = df.groupby(['Customer_Phone', 'SKU_Code'])['delivered_qty'].shift(-1)
df['y_buy'] = df['next_days'].notnull().astype(int)

# Encode SKU
le_sku = LabelEncoder().fit(df['SKU_Code'])
df['sku_enc'] = le_sku.transform(df['SKU_Code'])

# Define features
features = ['recency_days', 'cum_freq', 'avg_spend_past', 'avg_qty_past', 'sku_enc']

# Train classifier
X_clf = df[features]
y_clf = df['y_buy']
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=20, random_state=42).fit(Xc_train, yc_train)

# Train regressors on positive instances
df_reg = df[df['y_buy'] == 1].dropna(subset=['next_days', 'next_spend', 'next_qty'])
X_reg = df_reg[features]
y_days = df_reg['next_days']
y_spend = df_reg['next_spend']
y_qty = df_reg['next_qty']

reg_days = RandomForestRegressor(n_estimators=20, random_state=42).fit(X_reg, y_days)
reg_spend = RandomForestRegressor(n_estimators=20, random_state=42).fit(X_reg, y_spend)
reg_qty = RandomForestRegressor(n_estimators=20, random_state=42).fit(X_reg, y_qty)

# Predict on latest snapshot
latest = df.groupby(['Customer_Phone', 'SKU_Code']).last().reset_index()
latest['last_purchase_date'] = latest['delivered_date']
latest['recency_days'] = (pd.Timestamp.now() - latest['delivered_date']).dt.days
latest['sku_enc'] = le_sku.transform(latest['SKU_Code'])
latest['cum_freq'] = latest['cum_freq']
latest['avg_spend_past'] = latest['avg_spend_past']
latest['avg_qty_past'] = latest['avg_qty_past']

X_pred = latest[features]
latest['probability'] = clf.predict_proba(X_pred)[:, 1]
latest['pred_days'] = reg_days.predict(X_pred)
latest['pred_next_date'] = latest['last_purchase_date'] + pd.to_timedelta(latest['pred_days'], unit='D')
latest['pred_spend'] = reg_spend.predict(X_pred)
latest['pred_qty'] = reg_qty.predict(X_pred)

# Extract top-3 SKUs per customer based on true probabilities
results = (latest.sort_values(['Customer_Phone', 'probability'], ascending=[True, False])
                  .groupby('Customer_Phone').head(3).reset_index(drop=True))

# Display interactive results
pred_df = results[['Customer_Phone', 'SKU_Code', 'last_purchase_date',
                   'pred_next_date', 'pred_spend', 'pred_qty', 'probability']]
pred_df

  df = pd.read_csv('/content/drive/MyDrive/Data Analysis - Sample File.csv')


Unnamed: 0,Customer_Phone,SKU_Code,last_purchase_date,pred_next_date,pred_spend,pred_qty,probability
0,7010009941,10002871,2024-11-28,2024-12-14 14:43:34.718568473,20050.000000,1.023395,0.100000
1,7010009941,10002849,2024-10-03,2024-10-12 07:12:00.000000000,20607.500000,1.000000,0.000000
2,7010009941,10002857,2025-01-02,2025-01-23 22:48:00.000000000,24190.000000,1.100000,0.000000
3,7010147753,10002957,2024-11-28,2024-12-13 16:37:41.149746635,183629.656963,1.617399,0.304423
4,7010147753,10002835,2024-09-09,2024-10-08 04:18:16.049154156,23242.500000,1.159429,0.251098
...,...,...,...,...,...,...,...
11382,9167401267,10003376,2025-01-07,2025-01-31 03:24:24.302564821,19850.000000,1.122391,0.273278
11383,9167401267,10002861,2024-10-14,2024-10-30 01:06:11.361857771,20000.000000,1.020956,0.173211
11384,9169810564,10000533,2024-11-21,2024-12-20 14:35:09.733016040,90317.778957,5.902993,0.428003
11385,9169810564,10002844,2024-12-19,2025-01-09 21:47:19.415802937,24130.000000,1.310417,0.374679


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score, classification_report,
    mean_absolute_error, mean_squared_error, r2_score
)

# 1) Load & clean
df = pd.read_csv('/content/drive/MyDrive/Data Analysis - Sample File.csv')
df = df.rename(columns={
    'Delivered_date': 'delivered_date',
    'Delivered Qty': 'delivered_qty',
    'Redistribution Value': 'spend'
})
df['delivered_date'] = pd.to_datetime(df['delivered_date'], errors='coerce', dayfirst=True)
df['spend'] = df['spend'].str.replace(',', '').astype(float)
df = df[df['delivered_qty'] > 0] \
       .sort_values(['Customer_Phone','SKU_Code','delivered_date'])

# 2) Features & targets
df['prev_date'] = df.groupby(['Customer_Phone','SKU_Code'])['delivered_date'].shift(1)
df['recency_days'] = (df['delivered_date'] - df['prev_date']).dt.days.fillna(-1)
df['cum_freq']      = df.groupby(['Customer_Phone','SKU_Code']).cumcount()
df['cum_spend']     = df.groupby(['Customer_Phone','SKU_Code'])['spend'].cumsum() - df['spend']
df['avg_spend_past']= df['cum_spend'] / df['cum_freq'].replace(0,1)
df['cum_qty']       = df.groupby(['Customer_Phone','SKU_Code'])['delivered_qty'].cumsum() - df['delivered_qty']
df['avg_qty_past']  = df['cum_qty'] / df['cum_freq'].replace(0,1)

df['next_date']  = df.groupby(['Customer_Phone','SKU_Code'])['delivered_date'].shift(-1)
df['next_days']  = (df['next_date'] - df['delivered_date']).dt.days
df['next_spend']= df.groupby(['Customer_Phone','SKU_Code'])['spend'].shift(-1)
df['next_qty']  = df.groupby(['Customer_Phone','SKU_Code'])['delivered_qty'].shift(-1)
df['y_buy']     = df['next_days'].notnull().astype(int)

le_sku = LabelEncoder().fit(df['SKU_Code'])
df['sku_enc'] = le_sku.transform(df['SKU_Code'])

features = ['recency_days','cum_freq','avg_spend_past','avg_qty_past','sku_enc']

# 3) Train/test split for classifier
X_clf = df[features]
y_clf = df['y_buy']
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=20, random_state=42)
clf.fit(Xc_train, yc_train)

# 4) Eval classifier
probs = clf.predict_proba(Xc_test)[:,1]
preds = clf.predict(Xc_test)

print("=== CLASSIFIER METRICS ===")
print("ROC-AUC:", roc_auc_score(yc_test, probs))
print(classification_report(yc_test, preds))

# 5) Prepare train/test for regressors (only positive samples)
df_reg = df[df['y_buy']==1].dropna(subset=['next_days','next_spend','next_qty'])
Xr = df_reg[features]
yd = df_reg['next_days']
ys = df_reg['next_spend']
yq = df_reg['next_qty']

Xr_train, Xr_test, yd_train, yd_test, ys_train, ys_test, yq_train, yq_test = train_test_split(
    Xr, yd, ys, yq, test_size=0.2, random_state=42
)

# 6) Train regressors
reg_days  = RandomForestRegressor(n_estimators=20, random_state=42).fit(Xr_train, yd_train)
reg_spend = RandomForestRegressor(n_estimators=20, random_state=42).fit(Xr_train, ys_train)
reg_qty   = RandomForestRegressor(n_estimators=20, random_state=42).fit(Xr_train, yq_train)

# 7) Eval regressors
print("\n=== REGRESSOR METRICS ===")
for name, y_true, model in [
    ("Next-Days", yd_test, reg_days),
    ("Spend",     ys_test, reg_spend),
    ("Quantity",  yq_test, reg_qty)
]:
    y_pred = model.predict(Xr_test)
    print(f"\n-- {name} --")
    print(" MAE :", mean_absolute_error(y_true, y_pred))
    print(" RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))
    print(" R2  :", r2_score(y_true, y_pred))

# 8) (Optional) Continue with your prediction & top-3 extraction…

pred_df = results[['Customer_Phone', 'SKU_Code', 'last_purchase_date',
                   'pred_next_date', 'pred_spend', 'pred_qty', 'probability']]
pred_df


  df = pd.read_csv('/content/drive/MyDrive/Data Analysis - Sample File.csv')


=== CLASSIFIER METRICS ===
ROC-AUC: 0.7606125148882928
              precision    recall  f1-score   support

           0       0.68      0.67      0.67      9135
           1       0.72      0.73      0.72     10649

    accuracy                           0.70     19784
   macro avg       0.70      0.70      0.70     19784
weighted avg       0.70      0.70      0.70     19784


=== REGRESSOR METRICS ===

-- Next-Days --
 MAE : 17.539359457817106
 RMSE: 24.46585274359179
 R2  : 0.060901208560988795

-- Spend --
 MAE : 34471.54450148744
 RMSE: 93288.74551944884
 R2  : 0.13613760968158928

-- Quantity --
 MAE : 2.4408128650588745
 RMSE: 6.661168013613155
 R2  : 0.14571210337317997


Unnamed: 0,Customer_Phone,SKU_Code,last_purchase_date,pred_next_date,pred_spend,pred_qty,probability
0,7010009941,10002871,2024-11-28,2024-12-14 14:43:34.718568473,20050.000000,1.023395,0.100000
1,7010009941,10002849,2024-10-03,2024-10-12 07:12:00.000000000,20607.500000,1.000000,0.000000
2,7010009941,10002857,2025-01-02,2025-01-23 22:48:00.000000000,24190.000000,1.100000,0.000000
3,7010147753,10002957,2024-11-28,2024-12-13 16:37:41.149746635,183629.656963,1.617399,0.304423
4,7010147753,10002835,2024-09-09,2024-10-08 04:18:16.049154156,23242.500000,1.159429,0.251098
...,...,...,...,...,...,...,...
11382,9167401267,10003376,2025-01-07,2025-01-31 03:24:24.302564821,19850.000000,1.122391,0.273278
11383,9167401267,10002861,2024-10-14,2024-10-30 01:06:11.361857771,20000.000000,1.020956,0.173211
11384,9169810564,10000533,2024-11-21,2024-12-20 14:35:09.733016040,90317.778957,5.902993,0.428003
11385,9169810564,10002844,2024-12-19,2025-01-09 21:47:19.415802937,24130.000000,1.310417,0.374679


In [None]:
!pip install xgboost



In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score, classification_report,
    mean_absolute_error, mean_squared_error, r2_score
)

# 1) Load & clean
df = pd.read_csv('/content/drive/MyDrive/Data Analysis - Sample File.csv')
df = df.rename(columns={
    'Delivered_date': 'delivered_date',
    'Delivered Qty': 'delivered_qty',
    'Redistribution Value': 'spend'
})
df['delivered_date'] = pd.to_datetime(
    df['delivered_date'], errors='coerce', dayfirst=True
)
df['spend'] = df['spend'].str.replace(',', '').astype(float)
df = df[df['delivered_qty'] > 0].sort_values(
    ['Customer_Phone','SKU_Code','delivered_date']
)

# 2) Feature engineering & targets
df['prev_date'] = df.groupby(['Customer_Phone','SKU_Code'])['delivered_date'].shift(1)
df['recency_days'] = (df['delivered_date'] - df['prev_date']).dt.days.fillna(-1)
df['cum_freq']   = df.groupby(['Customer_Phone','SKU_Code']).cumcount()
df['cum_spend']  = df.groupby(['Customer_Phone','SKU_Code'])['spend'].cumsum() - df['spend']
df['avg_spend_past'] = df['cum_spend'] / df['cum_freq'].replace(0,1)
df['cum_qty']    = df.groupby(['Customer_Phone','SKU_Code'])['delivered_qty'].cumsum() - df['delivered_qty']
df['avg_qty_past']= df['cum_qty'] / df['cum_freq'].replace(0,1)

df['next_date']   = df.groupby(['Customer_Phone','SKU_Code'])['delivered_date'].shift(-1)
df['next_days']   = (df['next_date'] - df['delivered_date']).dt.days
df['next_spend']  = df.groupby(['Customer_Phone','SKU_Code'])['spend'].shift(-1)
df['next_qty']    = df.groupby(['Customer_Phone','SKU_Code'])['delivered_qty'].shift(-1)
df['y_buy']       = df['next_days'].notnull().astype(int)

le = LabelEncoder().fit(df['SKU_Code'])
df['sku_enc'] = le.transform(df['SKU_Code'])

features = ['recency_days','cum_freq','avg_spend_past','avg_qty_past','sku_enc']

# 3) Classifier train/test
Xc = df[features]
yc = df['y_buy']
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, test_size=0.2, random_state=42
)

clf = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
clf.fit(Xc_train, yc_train)

# 4) Classifier eval
probs = clf.predict_proba(Xc_test)[:,1]
preds = clf.predict(Xc_test)
print("ROC-AUC:", roc_auc_score(yc_test, probs))
print(classification_report(yc_test, preds))

# 5) Regressor train/test on positive samples
df_reg = df[df['y_buy']==1].dropna(subset=['next_days','next_spend','next_qty'])
Xr = df_reg[features]
yr_days  = df_reg['next_days']
yr_spend = df_reg['next_spend']
yr_qty   = df_reg['next_qty']

Xr_train, Xr_test, yd_train, yd_test, ys_train, ys_test, yq_train, yq_test = train_test_split(
    Xr, yr_days, yr_spend, yr_qty, test_size=0.2, random_state=42
)

reg_days = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
reg_spend= XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
reg_qty  = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)

reg_days.fit(Xr_train, yd_train)
reg_spend.fit(Xr_train, ys_train)
reg_qty.fit(Xr_train, yq_train)

# 6) Regressor eval
for name, y_true, model in [
    ("Next-Days", yd_test, reg_days),
    ("Spend",     ys_test, reg_spend),
    ("Quantity",  yq_test, reg_qty)
]:
    y_pred = model.predict(Xr_test)
    print(f"\n-- {name} --")
    print(" MAE :", mean_absolute_error(y_true, y_pred))
    print(" RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))
    print(" R2  :", r2_score(y_true, y_pred))

# 7) Final top-3 extraction remains the same…

pred_df = results[['Customer_Phone', 'SKU_Code', 'last_purchase_date',
                   'pred_next_date', 'pred_spend', 'pred_qty', 'probability']]
pred_df

  df = pd.read_csv('/content/drive/MyDrive/Data Analysis - Sample File.csv')
Parameters: { "use_label_encoder" } are not used.



ROC-AUC: 0.7749623851038586
              precision    recall  f1-score   support

           0       0.70      0.65      0.68      9135
           1       0.72      0.76      0.74     10649

    accuracy                           0.71     19784
   macro avg       0.71      0.71      0.71     19784
weighted avg       0.71      0.71      0.71     19784


-- Next-Days --
 MAE : 16.67879671712265
 RMSE: 23.363590977426714
 R2  : 0.14361361188055288

-- Spend --
 MAE : 33056.48375879503
 RMSE: 93532.5650345117
 R2  : 0.13161612630881137

-- Quantity --
 MAE : 2.3263789571514644
 RMSE: 6.8436803690160115
 R2  : 0.0982567228440826


Unnamed: 0,Customer_Phone,SKU_Code,last_purchase_date,pred_next_date,pred_spend,pred_qty,probability
0,7010009941,10002871,2024-11-28,2024-12-14 14:43:34.718568473,20050.000000,1.023395,0.100000
1,7010009941,10002849,2024-10-03,2024-10-12 07:12:00.000000000,20607.500000,1.000000,0.000000
2,7010009941,10002857,2025-01-02,2025-01-23 22:48:00.000000000,24190.000000,1.100000,0.000000
3,7010147753,10002957,2024-11-28,2024-12-13 16:37:41.149746635,183629.656963,1.617399,0.304423
4,7010147753,10002835,2024-09-09,2024-10-08 04:18:16.049154156,23242.500000,1.159429,0.251098
...,...,...,...,...,...,...,...
11382,9167401267,10003376,2025-01-07,2025-01-31 03:24:24.302564821,19850.000000,1.122391,0.273278
11383,9167401267,10002861,2024-10-14,2024-10-30 01:06:11.361857771,20000.000000,1.020956,0.173211
11384,9169810564,10000533,2024-11-21,2024-12-20 14:35:09.733016040,90317.778957,5.902993,0.428003
11385,9169810564,10002844,2024-12-19,2025-01-09 21:47:19.415802937,24130.000000,1.310417,0.374679


In [None]:
!pip install prophet



In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score, classification_report,
    mean_absolute_error, mean_squared_error, r2_score
)
from prophet import Prophet
from IPython.display import display

# 3) Load & clean
df = pd.read_csv('/content/drive/MyDrive/Data Analysis - Sample File.csv')
df = df.rename(columns={
    'Delivered_date': 'delivered_date',
    'Delivered Qty': 'delivered_qty',
    'Redistribution Value': 'spend'
})
df['delivered_date'] = pd.to_datetime(df['delivered_date'], errors='coerce', dayfirst=True)
df['spend'] = df['spend'].str.replace(',', '').astype(float)
df = df[df['delivered_qty'] > 0].sort_values(['Customer_Phone','SKU_Code','delivered_date'])

# 4) Feature engineering & targets
df['prev_date'] = df.groupby(['Customer_Phone','SKU_Code'])['delivered_date'].shift(1)
df['recency_days'] = (df['delivered_date'] - df['prev_date']).dt.days.fillna(-1)
df['cum_freq']   = df.groupby(['Customer_Phone','SKU_Code']).cumcount()
df['cum_spend']  = df.groupby(['Customer_Phone','SKU_Code'])['spend'].cumsum() - df['spend']
df['avg_spend_past'] = df['cum_spend'] / df['cum_freq'].replace(0,1)
df['cum_qty']    = df.groupby(['Customer_Phone','SKU_Code'])['delivered_qty'].cumsum() - df['delivered_qty']
df['avg_qty_past']= df['cum_qty'] / df['cum_freq'].replace(0,1)

df['next_date']  = df.groupby(['Customer_Phone','SKU_Code'])['delivered_date'].shift(-1)
df['next_days']  = (df['next_date'] - df['delivered_date']).dt.days
df['next_spend'] = df.groupby(['Customer_Phone','SKU_Code'])['spend'].shift(-1)
df['next_qty']   = df.groupby(['Customer_Phone','SKU_Code'])['delivered_qty'].shift(-1)
df['y_buy']      = df['next_days'].notnull().astype(int)

le = LabelEncoder().fit(df['SKU_Code'])
df['sku_enc'] = le.transform(df['SKU_Code'])

features = ['recency_days','cum_freq','avg_spend_past','avg_qty_past','sku_enc']

# 5) Classifier
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    df[features], df['y_buy'], test_size=0.2, random_state=42
)
clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, eval_metric='logloss', random_state=42)
clf.fit(Xc_train, yc_train)
probs = clf.predict_proba(Xc_test)[:,1]
preds = clf.predict(Xc_test)
print("=== CLASSIFIER METRICS ===")
print("ROC-AUC:", roc_auc_score(yc_test, probs))
print(classification_report(yc_test, preds))

# 6) Time-series for next purchase date
ts_preds = []
for (cust, sku), grp in df.groupby(['Customer_Phone','SKU_Code']):
    ts = grp[['delivered_date']].rename(columns={'delivered_date':'ds'})
    ts['y'] = 1
    ts = ts.set_index('ds').resample('D').sum().reset_index()
    if len(ts) < 3: continue
    m = Prophet(daily_seasonality=False, weekly_seasonality=True, yearly_seasonality=False)
    m.fit(ts)
    future = m.make_future_dataframe(periods=60)
    fcst = m.predict(future)
    next_day = fcst.loc[fcst['yhat']>0.5,'ds']
    if not next_day.empty:
        ts_preds.append({'Customer_Phone':cust,'SKU_Code':sku,'ts_pred_next_date':next_day.min()})
ts_df = pd.DataFrame(ts_preds)

# 7) Merge TS preds
latest = df.groupby(['Customer_Phone','SKU_Code']).last().reset_index()
latest = latest.merge(ts_df, on=['Customer_Phone','SKU_Code'], how='left')
latest['last_purchase_date'] = latest['delivered_date']

# 8) Regression split with non-null targets
df_reg = df[df['y_buy']==1].dropna(subset=['next_spend','next_qty'])
Xr = df_reg[features]
ys = df_reg['next_spend']
yq = df_reg['next_qty']
Xr_train, Xr_test, ys_train, ys_test, yq_train, yq_test = train_test_split(
    Xr, ys, yq, test_size=0.2, random_state=42
)

reg_spend = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
reg_qty   = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
reg_spend.fit(Xr_train, ys_train)
reg_qty.fit(Xr_train, yq_train)

# 9) Reg metrics
print("=== REGRESSOR METRICS ===")
print("-- Spend -- MAE:", mean_absolute_error(ys_test, reg_spend.predict(Xr_test)),
      "RMSE:", np.sqrt(mean_squared_error(ys_test, reg_spend.predict(Xr_test))),
      "R2:", r2_score(ys_test, reg_spend.predict(Xr_test)))
print("-- Qty   -- MAE:", mean_absolute_error(yq_test, reg_qty.predict(Xr_test)),
      "RMSE:", np.sqrt(mean_squared_error(yq_test, reg_qty.predict(Xr_test))),
      "R2:", r2_score(yq_test, reg_qty.predict(Xr_test)))

# 10) Top-3 extraction
latest['probability']   = clf.predict_proba(latest[features])[:,1]
latest['pred_spend']    = reg_spend.predict(latest[features])
latest['pred_qty']      = reg_qty.predict(latest[features])
latest['pred_next_date']= latest['ts_pred_next_date'].fillna(
    latest['last_purchase_date'] + pd.to_timedelta(latest['next_days'].fillna(0),'D')
)
results = (latest.sort_values(['Customer_Phone','probability'],ascending=[True,False])
            .groupby('Customer_Phone').head(3).reset_index(drop=True))
display(results[['Customer_Phone','SKU_Code','last_purchase_date','pred_next_date','pred_spend','pred_qty','probability']].head(10))

  df = pd.read_csv('/content/drive/MyDrive/Data Analysis - Sample File.csv')


=== CLASSIFIER METRICS ===
ROC-AUC: 0.7749623851038586
              precision    recall  f1-score   support

           0       0.70      0.65      0.68      9135
           1       0.72      0.76      0.74     10649

    accuracy                           0.71     19784
   macro avg       0.71      0.71      0.71     19784
weighted avg       0.71      0.71      0.71     19784



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO:prophet:n_changepoints greater than number of observations. Using 4.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpajsmnivi/qwmigjij.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpajsmnivi/kbxyo7qc.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=12357', 'data', 'file=/tmp/tmpajsmnivi/qwmigjij.json', 'init=/tmp/tmpajsmnivi/kbxyo7qc.json', 'output', 'file=/tmp/tmpajsmnivi/prophet_modelp6chau4h/prophet_model-20250505094755.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
09:47:55 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
09:47:55 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:n_changepoints greater than number of observations. Using 24.
DEBUG:cmdstanpy:input tempf

=== REGRESSOR METRICS ===
-- Spend -- MAE: 33056.48375879503 RMSE: 93532.5650345117 R2: 0.13161612630881137
-- Qty   -- MAE: 2.3263789571514644 RMSE: 6.8436803690160115 R2: 0.0982567228440826


Unnamed: 0,Customer_Phone,SKU_Code,last_purchase_date,pred_next_date,pred_spend,pred_qty,probability
0,7010009941,10002849,2024-10-03,2024-11-21,27030.351562,1.361134,0.786555
1,7010009941,10002866,2024-12-26,2025-03-20,23784.46875,1.203211,0.462454
2,7010009941,10002857,2025-01-02,2025-04-03,23784.46875,1.180941,0.407207
3,7010147753,10002849,2024-10-16,2024-11-20,26422.642578,1.3216,0.823764
4,7010147753,10002832,2024-10-03,2024-11-09,34583.308594,1.710547,0.770259
5,7010147753,10002866,2024-09-11,2024-09-11,25418.720703,1.346955,0.538113
6,7010301833,10002832,2024-10-22,2024-12-01,76847.632812,5.201844,0.555498
7,7010301833,10002830,2024-10-22,2024-10-22,30868.351562,1.604616,0.491128
8,7010301833,10003376,2025-01-06,2024-12-24,52129.347656,2.648791,0.485204
9,7010572000,10000539,2024-10-23,2024-10-23,86765.75,6.205529,0.711128


In [2]:
# Save the final predictions to CSV
results[['Customer_Phone','SKU_Code','last_purchase_date',
         'pred_next_date','pred_spend','pred_qty','probability']] \
    .to_csv('sku_predictions.csv', index=False)