<a href="https://colab.research.google.com/github/boiBASH/Tolaram_Project/blob/main/ML_Model_for_Customer_Profilling_unmasked.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (roc_auc_score, classification_report,
                            mean_absolute_error, mean_squared_error, r2_score)

# Load your dataset
df = pd.read_csv('/content/drive/MyDrive/TOLARAM/Data sample analysis.csv', encoding='latin-1')

# Rename columns to be consistent
df = df.rename(columns={
    'Delivered_date': 'delivered_date',
    'Delivered Qty': 'delivered_qty',
    'Redistribution Value': 'spend'
})

# Convert 'delivered_date' to datetime objects
df['delivered_date'] = pd.to_datetime(df['delivered_date'], errors='coerce', dayfirst=True)

# Clean 'spend' column
df['spend'] = df['spend'].astype(str).str.replace(',', '', regex=False).astype(float)

# Filter out rows with zero or negative delivered quantity
df = df[df['delivered_qty'] > 0].sort_values(['Customer_Phone', 'SKU_Code', 'Brand', 'delivered_date'])

# Feature Engineering (grouping by Customer_Phone) for Brand Prediction
df['prev_brand'] = df.groupby('Customer_Phone')['Brand'].shift(1)
df['last_brand'] = df.groupby('Customer_Phone')['Brand'].transform('last')
df['brand_change'] = (df['Brand'] != df['prev_brand']).astype(int)
df['brand_buy_count'] = df.groupby(['Customer_Phone', 'Brand'])['SKU_Code'].transform('count')

# Feature Engineering (grouping by Customer_Phone, SKU_Code, and Brand) for Item Prediction
df['prev_date'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_date'].shift(1)
df['recency_days'] = (df['delivered_date'] - df['prev_date']).dt.days.fillna(-1)
df['cum_freq'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand']).cumcount()
df['cum_spend'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['spend'].cumsum() - df['spend']
df['avg_spend_past'] = df['cum_spend'] / df['cum_freq'].replace(0, 1)
df['cum_qty'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_qty'].cumsum() - df['delivered_qty']
df['avg_qty_past'] = df['cum_qty'] / df['cum_freq'].replace(0, 1)

# Compute Targets for Item Prediction
df['next_date'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_date'].shift(-1)
df['next_days'] = (df['next_date'] - df['delivered_date']).dt.days
df['next_spend'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['spend'].shift(-1)
df['next_qty'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_qty'].shift(-1)
df['y_buy'] = df['next_days'].notnull().astype(int)

# Encode SKU and Brand
le_sku = LabelEncoder().fit(df['SKU_Code'])
df['sku_enc'] = le_sku.transform(df['SKU_Code'])
le_brand = LabelEncoder().fit(df['Brand'])
df['brand_enc'] = le_brand.transform(df['Brand'])

# Prepare data for Brand Prediction
brand_features = ['recency_days', 'brand_buy_count', 'brand_change'] # Using some existing and new features
X_brand = df.groupby('Customer_Phone')[brand_features].last().dropna()
y_brand = df.groupby('Customer_Phone')['Brand'].last().loc[X_brand.index]
le_brand_prediction = LabelEncoder().fit(y_brand)
y_brand_enc = le_brand_prediction.transform(y_brand)
X_brand_train, X_brand_test, y_brand_train, y_brand_test = train_test_split(X_brand, y_brand_enc, test_size=0.2, random_state=42)
brand_clf = RandomForestClassifier(n_estimators=20, random_state=42).fit(X_brand_train, y_brand_train)
print("=== BRAND PREDICTION CLASSIFIER METRICS ===")
print("Brand Prediction Accuracy:", brand_clf.score(X_brand_test, y_brand_test))

# Prepare data and train models for Item Prediction
item_features = ['recency_days', 'cum_freq', 'avg_spend_past', 'avg_qty_past', 'sku_enc', 'brand_enc']
X_clf = df[item_features].dropna()
y_clf = df['y_buy'].loc[X_clf.index]
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=20, random_state=42).fit(Xc_train, yc_train)

# Evaluate Item Prediction Classifier
probs = clf.predict_proba(Xc_test)[:, 1]
preds = clf.predict(Xc_test)
print("\n=== ITEM PREDICTION CLASSIFIER METRICS ===")
print("ROC-AUC:", roc_auc_score(yc_test, probs))
print(classification_report(yc_test, preds))

df_reg = df[df['y_buy'] == 1].dropna(subset=['next_days', 'next_spend', 'next_qty'] + item_features)
X_reg = df_reg[item_features]
y_days = df_reg['next_days']
y_spend = df_reg['next_spend']
y_qty = df_reg['next_qty']

Xr_train, Xr_test, yd_train, yd_test, ys_train, ys_test, yq_train, yq_test = train_test_split(
    X_reg, y_days, y_spend, y_qty, test_size=0.2, random_state=42
)

reg_days = RandomForestRegressor(n_estimators=20, random_state=42).fit(Xr_train, yd_train)
reg_spend = RandomForestRegressor(n_estimators=20, random_state=42).fit(Xr_train, ys_train)
reg_qty = RandomForestRegressor(n_estimators=20, random_state=42).fit(Xr_train, yq_train)

# Evaluate Item Prediction Regressors
print("\n=== ITEM PREDICTION REGRESSOR METRICS ===")
for name, y_true, model in [
    ("Next-Days", yd_test, reg_days),
    ("Spend", ys_test, reg_spend),
    ("Quantity", yq_test, reg_qty)
]:
    y_pred = model.predict(Xr_test)
    print(f"\n-- {name} --")
    print(" MAE :", mean_absolute_error(y_true, y_pred))
    print(" RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))
    print(" R2  :", r2_score(y_true, y_pred))

# Predict on latest snapshot for Item Prediction
latest_item = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand']).last().reset_index()
latest_item['last_purchase_date'] = latest_item['delivered_date']
latest_item['recency_days'] = (pd.Timestamp.now() - latest_item['delivered_date']).dt.days
latest_item['sku_enc'] = le_sku.transform(latest_item['SKU_Code'])
latest_item['brand_enc'] = le_brand.transform(latest_item['Brand'])
latest_item['cum_freq'] = latest_item['cum_freq']
latest_item['avg_spend_past'] = latest_item['avg_spend_past']
latest_item['avg_qty_past'] = latest_item['avg_qty_past']

X_pred_item = latest_item[item_features].dropna()
latest_item = latest_item.loc[X_pred_item.index].copy() # Keep only rows with valid features
latest_item['probability'] = clf.predict_proba(X_pred_item)[:, 1]
latest_item['pred_days'] = reg_days.predict(X_pred_item)
latest_item['pred_next_date'] = latest_item['last_purchase_date'] + pd.to_timedelta(latest_item['pred_days'], unit='D')
latest_item['pred_spend'] = reg_spend.predict(X_pred_item)
latest_item['pred_qty'] = reg_qty.predict(X_pred_item)

# Predict Likely Next Brand for each Customer
latest_brand = df.groupby('Customer_Phone')[brand_features].last().dropna()
pred_brand_encoded = brand_clf.predict(latest_brand)
pred_brand = le_brand_prediction.inverse_transform(pred_brand_encoded)
pred_brand_df = pd.DataFrame({'Customer_Phone': latest_brand.index, 'pred_next_brand': pred_brand})

# Merge Brand Predictions with Item Predictions
results = (latest_item.sort_values(['Customer_Phone', 'probability'], ascending=[True, False])
                       .groupby('Customer_Phone').head(3).reset_index(drop=True))

pred_df = pd.merge(results, pred_brand_df, on='Customer_Phone', how='left')


# Display the relevant prediction columns (Including Predicted Next Brand)
print("\n=== TOP 3 LIKELY NEXT PURCHASES WITH PREDICTED NEXT BRAND ===")

pred_df[['Customer_Phone', 'SKU_Code', 'Brand', 'last_purchase_date',
               'pred_next_date', 'pred_next_brand', 'pred_spend', 'pred_qty', 'probability']]

  df = pd.read_csv('/content/drive/MyDrive/TOLARAM/Data sample analysis.csv', encoding='latin-1')


=== BRAND PREDICTION CLASSIFIER METRICS ===
Brand Prediction Accuracy: 0.4312977099236641

=== ITEM PREDICTION CLASSIFIER METRICS ===
ROC-AUC: 0.7595757018329582
              precision    recall  f1-score   support

           0       0.68      0.67      0.67      9136
           1       0.72      0.73      0.72     10648

    accuracy                           0.70     19784
   macro avg       0.70      0.70      0.70     19784
weighted avg       0.70      0.70      0.70     19784


=== ITEM PREDICTION REGRESSOR METRICS ===

-- Next-Days --
 MAE : 17.42809909595842
 RMSE: 24.22985137658936
 R2  : 0.06876770341719929

-- Spend --
 MAE : 35453.55440328573
 RMSE: 113395.89584650511
 R2  : 0.09641035167389167

-- Quantity --
 MAE : 2.5521819120937557
 RMSE: 8.523112907949033
 R2  : 0.13561280774841722

=== TOP 3 LIKELY NEXT PURCHASES WITH PREDICTED NEXT BRAND ===


Unnamed: 0,Customer_Phone,SKU_Code,Brand,last_purchase_date,pred_next_date,pred_next_brand,pred_spend,pred_qty,probability
0,7010009941,10002849,MUNCH IT,2024-10-03,2024-10-22 19:12:00.000000000,KELLOGGS,20600.000000,1.000000,0.000000
1,7010009941,10002857,MUNCH IT,2025-01-02,2025-01-25 03:36:00.000000000,KELLOGGS,20275.000000,1.000000,0.000000
2,7010009941,10002866,MUNCH IT,2024-12-26,2025-01-13 12:00:00.000000000,KELLOGGS,20600.000000,1.150000,0.000000
3,7010147753,10002835,KELLOGGS,2024-09-09,2024-10-11 05:25:49.493318445,KELLOGGS,51593.113180,1.450000,0.293582
4,7010147753,10002957,COLGATE,2024-11-28,2024-12-20 20:06:02.798096071,KELLOGGS,173414.943117,1.263247,0.167054
...,...,...,...,...,...,...,...,...,...
11382,9167401267,10002830,KELLOGGS,2024-10-22,2024-11-25 06:07:08.809630898,KELLOGGS,52634.390814,1.370834,0.167916
11383,9167401267,10003376,KELLOGGS,2025-01-07,2025-01-31 11:02:48.642134167,KELLOGGS,21400.000000,1.167040,0.153322
11384,9169810564,10000533,POWER OIL,2024-11-21,2024-12-22 04:32:10.458397547,KELLOGGS,104734.923952,5.987655,0.256871
11385,9169810564,10003395,KELLOGGS,2025-01-21,2025-02-14 21:29:13.541234979,KELLOGGS,27236.750862,1.266003,0.215214


In [10]:
!pip install xgboost



In [13]:
import pandas as pd
import numpy as np
from datetime import datetime
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score, classification_report,
    mean_absolute_error, mean_squared_error, r2_score
)

# Load your dataset
df = pd.read_csv('/content/drive/MyDrive/TOLARAM/Data sample analysis.csv', encoding='latin-1')

# Rename columns to be consistent
df = df.rename(columns={
    'Delivered_date': 'delivered_date',
    'Delivered Qty': 'delivered_qty',
    'Redistribution Value': 'spend'
})

# Convert 'delivered_date' to datetime objects
df['delivered_date'] = pd.to_datetime(df['delivered_date'], errors='coerce', dayfirst=True)

# Clean 'spend' column
df['spend'] = df['spend'].astype(str).str.replace(',', '', regex=False).astype(float)

# Filter out rows with zero or negative delivered quantity
df = df[df['delivered_qty'] > 0].sort_values(['Customer_Phone', 'SKU_Code', 'Brand', 'delivered_date'])

# Feature Engineering (grouping by Customer_Phone) for Brand Prediction
df['prev_brand'] = df.groupby('Customer_Phone')['Brand'].shift(1)
df['last_brand'] = df.groupby('Customer_Phone')['Brand'].transform('last')
df['brand_change'] = (df['Brand'] != df['prev_brand']).astype(int)
df['brand_buy_count'] = df.groupby(['Customer_Phone', 'Brand'])['SKU_Code'].transform('count')

# Feature Engineering (grouping by Customer_Phone, SKU_Code, and Brand) for Item Prediction
df['prev_date'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_date'].shift(1)
df['recency_days'] = (df['delivered_date'] - df['prev_date']).dt.days.fillna(-1)
df['cum_freq'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand']).cumcount()
df['cum_spend'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['spend'].cumsum() - df['spend']
df['avg_spend_past'] = df['cum_spend'] / df['cum_freq'].replace(0, 1)
df['cum_qty'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_qty'].cumsum() - df['delivered_qty']
df['avg_qty_past'] = df['cum_qty'] / df['cum_freq'].replace(0, 1)

# Compute Targets for Item Prediction
df['next_date'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_date'].shift(-1)
df['next_days'] = (df['next_date'] - df['delivered_date']).dt.days
df['next_spend'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['spend'].shift(-1)
df['next_qty'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_qty'].shift(-1)
df['y_buy'] = df['next_days'].notnull().astype(int)

# Encode SKU and Brand
le_sku = LabelEncoder().fit(df['SKU_Code'])
df['sku_enc'] = le_sku.transform(df['SKU_Code'])
le_brand = LabelEncoder().fit(df['Brand'])
df['brand_enc'] = le_brand.transform(df['Brand'])

# Prepare data for Brand Prediction
brand_features = ['recency_days', 'brand_buy_count', 'brand_change']
X_brand = df.groupby('Customer_Phone')[brand_features].last().dropna()
y_brand = df.groupby('Customer_Phone')['Brand'].last().loc[X_brand.index]
le_brand_prediction = LabelEncoder().fit(y_brand)
y_brand_enc = le_brand_prediction.transform(y_brand)
X_brand_train, X_brand_test, y_brand_train, y_brand_test = train_test_split(X_brand, y_brand_enc, test_size=0.2, random_state=42)
brand_clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss', random_state=42)
brand_clf.fit(X_brand_train, y_brand_train)
print("=== BRAND PREDICTION CLASSIFIER METRICS (XGBoost) ===")
print("Brand Prediction Accuracy:", brand_clf.score(X_brand_test, y_brand_test))

# Prepare data and train models for Item Prediction
item_features = ['recency_days', 'cum_freq', 'avg_spend_past', 'avg_qty_past', 'sku_enc', 'brand_enc']
X_clf = df[item_features].dropna()
y_clf = df['y_buy'].loc[X_clf.index]
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)
clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(Xc_train, yc_train)

# Evaluate Item Prediction Classifier
probs = clf.predict_proba(Xc_test)[:, 1]
preds = clf.predict(Xc_test)
print("\n=== ITEM PREDICTION CLASSIFIER METRICS (XGBoost) ===")
print("ROC-AUC:", roc_auc_score(yc_test, probs))
print(classification_report(yc_test, preds))

df_reg = df[df['y_buy'] == 1].dropna(subset=['next_days', 'next_spend', 'next_qty'] + item_features)
X_reg = df_reg[item_features]
y_days = df_reg['next_days']
y_spend = df_reg['next_spend']
y_qty = df_reg['next_qty']

Xr_train, Xr_test, yd_train, yd_test, ys_train, ys_test, yq_train, yq_test = train_test_split(
    X_reg, y_days, y_spend, y_qty, test_size=0.2, random_state=42
)

reg_days = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
reg_spend = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
reg_qty = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)

reg_days.fit(Xr_train, yd_train)
reg_spend.fit(Xr_train, ys_train)
reg_qty.fit(Xr_train, yq_train)

# Evaluate Item Prediction Regressors
print("\n=== ITEM PREDICTION REGRESSOR METRICS (XGBoost) ===")
for name, y_true, model in [
    ("Next-Days", yd_test, reg_days),
    ("Spend", ys_test, reg_spend),
    ("Quantity", yq_test, reg_qty)
]:
    y_pred = model.predict(Xr_test)
    print(f"\n-- {name} --")
    print(" MAE :", mean_absolute_error(y_true, y_pred))
    print(" RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))
    print(" R2  :", r2_score(y_true, y_pred))

# Predict Likely Next Brand for each Customer
latest_brand = df.groupby('Customer_Phone')[brand_features].last().dropna()
pred_brand_encoded = brand_clf.predict(latest_brand)
pred_brand = le_brand_prediction.inverse_transform(pred_brand_encoded)
pred_brand_df = pd.DataFrame({'Customer_Phone': latest_brand.index, 'pred_next_brand': pred_brand})

# Predict on latest snapshot for Item Prediction
latest_item = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand']).last().reset_index()
latest_item['last_purchase_date'] = latest_item['delivered_date']
latest_item['recency_days'] = (pd.Timestamp.now() - latest_item['delivered_date']).dt.days
latest_item['sku_enc'] = le_sku.transform(latest_item['SKU_Code'])
latest_item['brand_enc'] = le_brand.transform(latest_item['Brand'])
latest_item['cum_freq'] = latest_item['cum_freq']
latest_item['avg_spend_past'] = latest_item['avg_spend_past']
latest_item['avg_qty_past'] = latest_item['avg_qty_past']

X_pred_item = latest_item[item_features].dropna()
latest_item = latest_item.loc[X_pred_item.index].copy()
latest_item['probability'] = clf.predict_proba(X_pred_item)[:, 1]
latest_item['pred_days'] = reg_days.predict(X_pred_item)
latest_item['pred_next_date'] = latest_item['last_purchase_date'] + pd.to_timedelta(latest_item['pred_days'], unit='D')
latest_item['pred_spend'] = reg_spend.predict(X_pred_item)
latest_item['pred_qty'] = reg_qty.predict(X_pred_item)

# Merge Brand Predictions with Item Predictions
results = (latest_item.sort_values(['Customer_Phone', 'probability'], ascending=[True, False])
                       .groupby('Customer_Phone').head(3).reset_index(drop=True))

pred_df = pd.merge(results, pred_brand_df, on='Customer_Phone', how='left')

# Display the relevant prediction columns (Including Predicted Next Brand)
print("\n=== TOP 3 LIKELY NEXT PURCHASES WITH PREDICTED NEXT BRAND (XGBoost) ===")
pred_df[['Customer_Phone', 'SKU_Code', 'Brand', 'pred_next_brand', 'last_purchase_date',
               'pred_next_date', 'pred_spend', 'pred_qty', 'probability']]

  df = pd.read_csv('/content/drive/MyDrive/TOLARAM/Data sample analysis.csv', encoding='latin-1')
Parameters: { "use_label_encoder" } are not used.



=== BRAND PREDICTION CLASSIFIER METRICS (XGBoost) ===
Brand Prediction Accuracy: 0.4491094147582697


Parameters: { "use_label_encoder" } are not used.




=== ITEM PREDICTION CLASSIFIER METRICS (XGBoost) ===
ROC-AUC: 0.7744015972100695
              precision    recall  f1-score   support

           0       0.70      0.65      0.67      9136
           1       0.72      0.76      0.74     10648

    accuracy                           0.71     19784
   macro avg       0.71      0.71      0.71     19784
weighted avg       0.71      0.71      0.71     19784


=== ITEM PREDICTION REGRESSOR METRICS (XGBoost) ===

-- Next-Days --
 MAE : 16.634910432687818
 RMSE: 23.19802217461935
 R2  : 0.14639205710206304

-- Spend --
 MAE : 33041.39654981993
 RMSE: 107637.19214113342
 R2  : 0.1858558729226465

-- Quantity --
 MAE : 2.356125326892106
 RMSE: 8.17611827919185
 R2  : 0.20456229951951221

=== TOP 3 LIKELY NEXT PURCHASES WITH PREDICTED NEXT BRAND (XGBoost) ===


Unnamed: 0,Customer_Phone,SKU_Code,Brand,pred_next_brand,last_purchase_date,pred_next_date,pred_spend,pred_qty,probability
0,7010009941,10002866,MUNCH IT,KELLOGGS,2024-12-26,2025-01-13 15:36:59.622802738,29060.457031,1.178499,0.099829
1,7010009941,10002849,MUNCH IT,KELLOGGS,2024-10-03,2024-10-25 23:55:47.369384765,20781.744141,0.950910,0.078411
2,7010009941,10002871,MUNCH IT,KELLOGGS,2024-11-28,2024-12-13 20:55:49.511718747,19633.320312,1.565267,0.063043
3,7010147753,10002832,KELLOGGS,KELLOGGS,2024-10-03,2024-10-12 09:14:18.499145504,27159.564453,1.356787,0.114594
4,7010147753,10002835,KELLOGGS,KELLOGGS,2024-09-09,2024-09-25 04:30:48.449707027,17230.509766,1.480236,0.081987
...,...,...,...,...,...,...,...,...,...
11382,9167401267,10002861,MUNCH IT,KELLOGGS,2024-10-14,2024-10-30 08:07:40.491943360,19633.320312,1.565267,0.076860
11383,9167401267,10002849,MUNCH IT,KELLOGGS,2024-12-02,2024-12-11 00:00:04.696655270,24130.693359,1.218952,0.060114
11384,9169810564,10000533,POWER OIL,KELLOGGS,2024-11-21,2024-12-15 02:56:45.706787108,81822.796875,3.238137,0.117004
11385,9169810564,10000539,POWER OIL,KELLOGGS,2025-01-29,2025-02-14 16:05:49.969482423,78058.984375,4.604372,0.111577


In [14]:
!pip install prophet



In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score, classification_report,
    mean_absolute_error, mean_squared_error, r2_score
)
from prophet import Prophet
from IPython.display import display

# Load your dataset
df = pd.read_csv('/content/drive/MyDrive/TOLARAM/Data sample analysis.csv', encoding='latin-1')

# Rename columns to be consistent
df = df.rename(columns={
    'Delivered_date': 'delivered_date',
    'Delivered Qty': 'delivered_qty',
    'Redistribution Value': 'spend'
})

# Convert 'delivered_date' to datetime objects
df['delivered_date'] = pd.to_datetime(df['delivered_date'], errors='coerce', dayfirst=True)

# Clean 'spend' column
df['spend'] = df['spend'].astype(str).str.replace(',', '', regex=False).astype(float)

# Filter out rows with zero or negative delivered quantity
df = df[df['delivered_qty'] > 0].sort_values(['Customer_Phone', 'SKU_Code', 'Brand', 'delivered_date'])

# Feature Engineering (grouping by Customer_Phone) for Brand Prediction
df['prev_brand'] = df.groupby('Customer_Phone')['Brand'].shift(1)
df['last_brand'] = df.groupby('Customer_Phone')['Brand'].transform('last')
df['brand_change'] = (df['Brand'] != df['prev_brand']).astype(int)
df['brand_buy_count'] = df.groupby(['Customer_Phone', 'Brand'])['SKU_Code'].transform('count')

# Feature Engineering (grouping by Customer_Phone, SKU_Code, and Brand) for Item Prediction
df['prev_date'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_date'].shift(1)
df['recency_days'] = (df['delivered_date'] - df['prev_date']).dt.days.fillna(-1)
df['cum_freq'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand']).cumcount()
df['cum_spend'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['spend'].cumsum() - df['spend']
df['avg_spend_past'] = df['cum_spend'] / df['cum_freq'].replace(0, 1)
df['cum_qty'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_qty'].cumsum() - df['delivered_qty']
df['avg_qty_past'] = df['cum_qty'] / df['cum_freq'].replace(0, 1)

# Compute Targets for Item Prediction
df['next_date'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_date'].shift(-1)
df['next_days'] = (df['next_date'] - df['delivered_date']).dt.days
df['next_spend'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['spend'].shift(-1)
df['next_qty'] = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])['delivered_qty'].shift(-1)
df['y_buy'] = df['next_days'].notnull().astype(int)

# Encode SKU and Brand
le_sku = LabelEncoder().fit(df['SKU_Code'])
df['sku_enc'] = le_sku.transform(df['SKU_Code'])
le_brand = LabelEncoder().fit(df['Brand'])
df['brand_enc'] = le_brand.transform(df['Brand'])

# Prepare data for Brand Prediction
brand_features = ['recency_days', 'brand_buy_count', 'brand_change']
X_brand = df.groupby('Customer_Phone')[brand_features].last().dropna()
y_brand = df.groupby('Customer_Phone')['Brand'].last().loc[X_brand.index]
le_brand_prediction = LabelEncoder().fit(y_brand)
y_brand_enc = le_brand_prediction.transform(y_brand)
X_brand_train, X_brand_test, y_brand_train, y_brand_test = train_test_split(X_brand, y_brand_enc, test_size=0.2, random_state=42)
brand_clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss', random_state=42)
brand_clf.fit(X_brand_train, y_brand_train)
print("=== BRAND PREDICTION CLASSIFIER METRICS (XGBoost) ===")
print("Brand Prediction Accuracy:", brand_clf.score(X_brand_test, y_brand_test))

# Prepare data and train models for Item Prediction
item_features = ['recency_days', 'cum_freq', 'avg_spend_past', 'avg_qty_past', 'sku_enc', 'brand_enc']
X_clf = df[item_features].dropna()
y_clf = df['y_buy'].loc[X_clf.index]
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)
clf = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(Xc_train, yc_train)

# Evaluate Item Prediction Classifier
probs = clf.predict_proba(Xc_test)[:, 1]
preds = clf.predict(Xc_test)
print("\n=== ITEM PREDICTION CLASSIFIER METRICS (XGBoost) ===")
print("ROC-AUC:", roc_auc_score(yc_test, probs))
print(classification_report(yc_test, preds))

df_reg = df[df['y_buy'] == 1].dropna(subset=['next_days', 'next_spend', 'next_qty'] + item_features)
X_reg = df_reg[item_features]
y_days = df_reg['next_days']
y_spend = df_reg['next_spend']
y_qty = df_reg['next_qty']

Xr_train, Xr_test, yd_train, yd_test, ys_train, ys_test, yq_train, yq_test = train_test_split(
    X_reg, y_days, y_spend, y_qty, test_size=0.2, random_state=42
)

reg_days = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
reg_spend = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
reg_qty = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)

reg_days.fit(Xr_train, yd_train)
reg_spend.fit(Xr_train, ys_train)
reg_qty.fit(Xr_train, yq_train)

# Evaluate Item Prediction Regressors
print("\n=== ITEM PREDICTION REGRESSOR METRICS (XGBoost) ===")
for name, y_true, model in [
    ("Next-Days", yd_test, reg_days),
    ("Spend", ys_test, reg_spend),
    ("Quantity", yq_test, reg_qty)
]:
    y_pred = model.predict(Xr_test)
    print(f"\n-- {name} --")
    print(" MAE :", mean_absolute_error(y_true, y_pred))
    print(" RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))
    print(" R2  :", r2_score(y_true, y_pred))

# Predict Likely Next Brand for each Customer
latest_brand = df.groupby('Customer_Phone')[brand_features].last().dropna()
pred_brand_encoded = brand_clf.predict(latest_brand)
pred_brand = le_brand_prediction.inverse_transform(pred_brand_encoded)
pred_brand_df = pd.DataFrame({'Customer_Phone': latest_brand.index, 'pred_next_brand': pred_brand})


# Predict on latest snapshot for Item Prediction
latest_item = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand']).last().reset_index()
latest_item['last_purchase_date'] = latest_item['delivered_date']
latest_item['recency_days'] = (pd.Timestamp.now() - latest_item['delivered_date']).dt.days
latest_item['sku_enc'] = le_sku.transform(latest_item['SKU_Code'])
latest_item['brand_enc'] = le_brand.transform(latest_item['Brand'])
latest_item['cum_freq'] = latest_item['cum_freq']
latest_item['avg_spend_past'] = latest_item['avg_spend_past']
latest_item['avg_qty_past'] = latest_item['avg_qty_past']

X_pred_item = latest_item[item_features].dropna()
latest_item['probability'] = clf.predict_proba(X_pred_item)[:, 1]
latest_item['pred_days'] = reg_days.predict(X_pred_item)


# 6) Time-series for next purchase date (Prophet)
ts_preds = []
grouped = df.groupby(['Customer_Phone', 'SKU_Code', 'Brand'])

for name, group in grouped:
    if len(group) >= 3:
        ts = pd.DataFrame()
        ts['ds'] = group['delivered_date']
        ts['y'] = 1
        ts = ts.set_index('ds') # Make 'ds' the index for resampling
        ts = ts.resample('D').sum().fillna(0)
        ts = ts[ts['y'] > 0].reset_index()

        model = Prophet(daily_seasonality=False, weekly_seasonality=True)
        try:
            model.fit(ts)
            future = model.make_future_dataframe(periods=60)
            forecast = model.predict(future)
            next_purchase_date = forecast[forecast['yhat'] > 0.5]['ds'].min()
            if pd.notna(next_purchase_date):
                ts_preds.append({
                    'Customer_Phone': name[0],
                    'SKU_Code': name[1],
                    'Brand': name[2],  # Include Brand in the Prophet predictions
                    'ts_pred_next_date': next_purchase_date
                })
        except Exception as e:
            print(f"Prophet error for {name}: {e}")

ts_df = pd.DataFrame(ts_preds)

# Merge Prophet predictions
latest_item = pd.merge(latest_item, ts_df, on=['Customer_Phone', 'SKU_Code', 'Brand'], how='left')
latest_item['pred_next_date'] = latest_item['ts_pred_next_date'].fillna(
    latest_item['last_purchase_date'] + pd.to_timedelta(latest_item['pred_days'], unit='D')
)
latest_item['pred_spend'] = reg_spend.predict(X_pred_item)
latest_item['pred_qty'] = reg_qty.predict(X_pred_item)

# Merge Brand Predictions with Item Predictions
results = (latest_item.sort_values(['Customer_Phone', 'probability'], ascending=[True, False])
                       .groupby('Customer_Phone').head(3).reset_index(drop=True))

pred_df = pd.merge(results, pred_brand_df, on='Customer_Phone', how='left')


# Display the relevant prediction columns (Including Predicted Next Brand)
print("\n=== TOP 3 LIKELY NEXT PURCHASES WITH PREDICTED NEXT BRAND (XGBoost + Prophet) ===")
display(pred_df[['Customer_Phone', 'SKU_Code', 'Brand', 'pred_next_brand', 'last_purchase_date',
               'pred_next_date', 'pred_spend', 'pred_qty', 'probability']].head(10))


  df = pd.read_csv('/content/drive/MyDrive/TOLARAM/Data sample analysis.csv', encoding='latin-1')
Parameters: { "use_label_encoder" } are not used.



=== BRAND PREDICTION CLASSIFIER METRICS (XGBoost) ===
Brand Prediction Accuracy: 0.4491094147582697


Parameters: { "use_label_encoder" } are not used.




=== ITEM PREDICTION CLASSIFIER METRICS (XGBoost) ===
ROC-AUC: 0.7744015972100695
              precision    recall  f1-score   support

           0       0.70      0.65      0.67      9136
           1       0.72      0.76      0.74     10648

    accuracy                           0.71     19784
   macro avg       0.71      0.71      0.71     19784
weighted avg       0.71      0.71      0.71     19784


=== ITEM PREDICTION REGRESSOR METRICS (XGBoost) ===

-- Next-Days --
 MAE : 16.634910432687818
 RMSE: 23.19802217461935
 R2  : 0.14639205710206304

-- Spend --
 MAE : 33041.39654981993
 RMSE: 107637.19214113342
 R2  : 0.1858558729226465

-- Quantity --
 MAE : 2.356125326892106
 RMSE: 8.17611827919185
 R2  : 0.20456229951951221


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO:prophet:n_changepoints greater than number of observations. Using 2.
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:n_changepoints greater than number of observations. Using 2.
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:n_changepoints greater than number of observations. Using 1.
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:n_changepoints greater than number of observations. Using 0.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpkkx6mcow/m44vbkkf.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpkkx6mcow/k3hkvb92.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=48953',

In [None]:
# Define the major columns you want to save
major_columns = [
    'Customer_Phone',
    'SKU_Code',
    'Brand',
    'pred_next_brand',
    'last_purchase_date',
    'pred_next_date',
    'pred_spend',
    'pred_qty',
    'probability'
]

# Select only the major columns from the prediction DataFrame
pred_df_major = pred_df[major_columns]

# Save the results to a CSV file with only the major columns
pred_df_major.to_csv('purchase_predictions_major.csv', index=False)
print(f"\nResults with major columns saved to purchase_predictions_major.csv")
