# Model Development

## Load and Inspect the Dataset

In [1]:
from collections import Counter
from pathlib import Path

import joblib
import pandas as pd
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [2]:
import warnings
# Silence every warning whose message mentions `force_all_finite`, keeping the
# cell outputs below readable.
# NOTE(review): presumably a deprecation notice raised through one of the
# third-party estimators — confirm, and drop this filter once it stops firing.
warnings.filterwarnings("ignore", message=".*force_all_finite.*")

In [3]:
# Read the cleaned dataset from disk; the path is relative to the working directory.
cleaned_dataset = pd.read_csv(filepath_or_buffer="data/cleaned_dataset.csv")

In [4]:
# Work on an independent deep copy so `cleaned_dataset` stays exactly as loaded
dataset = cleaned_dataset.copy(deep=True)

## Model development for Returning Customer (Classification Model)

In [5]:
# Convert 'date' to datetime once, then derive the calendar features from it
parsed_dates = pd.to_datetime(dataset['date'], format='%Y-%m-%d')
weekday = parsed_dates.dt.dayofweek  # 0 = Monday, 6 = Sunday

dataset['date'] = parsed_dates
dataset['day_of_week'] = weekday
dataset['month'] = parsed_dates.dt.month
# Saturday (5) and Sunday (6) are flagged as weekend: 1 if weekend, else 0
dataset['is_weekend'] = weekday.isin([5, 6]).astype(int)

In [6]:
# Preview the first rows to confirm the date-derived columns are present
dataset.head()

Unnamed: 0,cus.id,date,cus._location,age,gender,sell_price,does_he_she_come_from_facebook_page,does_he_she_followed_our_page,did_he_she_buy_any_mobile_before,did_he_she_hear_of_our_shop_before,...,mobile_name_redmi_note_12_pro_8_128,mobile_name_vivo_t3x_5g_8_128,mobile_name_vivo_y200_5g_6_128,mobile_name_iphone_16_pro_256gb,mobile_name_iphone_16_pro_max_1tb,mobile_name_iqoo_neo_9_pro_5g_12_256,mobile_name_iqoo_z7_5g_6_128,day_of_week,month,is_weekend
0,1,2024-05-27,Rangamati Sadar,49,0,17073.0,0,1,0,1,...,False,False,False,False,False,False,False,0,5,0
1,2,2024-05-27,Inside Rangamati,44,1,15546.0,1,0,0,1,...,True,False,False,False,False,False,False,0,5,0
2,3,2024-05-27,Rangamati Sadar,45,1,26516.0,1,0,0,1,...,False,False,False,False,False,False,False,0,5,0
3,4,2024-05-27,Rangamati Sadar,46,1,21927.0,0,0,0,1,...,False,False,False,False,False,False,False,0,5,0
4,5,2024-05-27,Outside Rangamati,27,0,16718.0,1,0,0,1,...,False,True,False,False,False,False,False,0,5,0


In [7]:
# Separate the target from the predictors.
# Identifier, raw date and free-text location are dropped along with the target.
target_column = 'did_he_she_buy_any_mobile_before'
non_feature_columns = ['cus.id', 'date', 'cus._location']

y = dataset[target_column]
X = dataset.drop(columns=non_feature_columns + [target_column])

In [8]:
# Inspect the feature matrix (identifier, raw date, location and target removed)
X

Unnamed: 0,age,gender,sell_price,does_he_she_come_from_facebook_page,does_he_she_followed_our_page,did_he_she_hear_of_our_shop_before,is_local,mobile_name_galaxy_m35_5g_8_128,mobile_name_galaxy_s24_ultra_12_256,mobile_name_moto_g85_5g_8_128,...,mobile_name_redmi_note_12_pro_8_128,mobile_name_vivo_t3x_5g_8_128,mobile_name_vivo_y200_5g_6_128,mobile_name_iphone_16_pro_256gb,mobile_name_iphone_16_pro_max_1tb,mobile_name_iqoo_neo_9_pro_5g_12_256,mobile_name_iqoo_z7_5g_6_128,day_of_week,month,is_weekend
0,49,0,17073.0,0,1,1,0,False,False,False,...,False,False,False,False,False,False,False,0,5,0
1,44,1,15546.0,1,0,1,0,False,False,False,...,True,False,False,False,False,False,False,0,5,0
2,45,1,26516.0,1,0,1,0,False,False,False,...,False,False,False,False,False,False,False,0,5,0
3,46,1,21927.0,0,0,1,0,False,False,False,...,False,False,False,False,False,False,False,0,5,0
4,27,0,16718.0,1,0,1,1,False,False,False,...,False,True,False,False,False,False,False,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8866,34,1,21682.0,1,1,1,1,False,False,False,...,False,False,False,False,False,True,False,5,3,1
8867,48,1,26493.0,0,0,0,0,False,False,True,...,False,False,False,False,False,False,False,5,3,1
8868,30,0,16184.0,0,0,0,0,True,False,False,...,False,False,False,False,False,False,False,5,3,1
8869,22,0,16663.0,1,0,1,0,False,False,False,...,False,False,False,False,False,False,False,5,3,1


In [9]:
# Inspect the target vector
y

0       0
1       0
2       0
3       0
4       0
       ..
8866    1
8867    0
8868    0
8869    0
8870    1
Name: did_he_she_buy_any_mobile_before, Length: 8871, dtype: int64

In [10]:
# Tabulate each feature next to its dtype for a quick sanity check
dtype_table = X.dtypes.reset_index()
dtype_table = dtype_table.rename(columns={'index': 'Feature', 0: 'DataType'})
dtype_table

Unnamed: 0,Feature,DataType
0,age,int64
1,gender,int64
2,sell_price,float64
3,does_he_she_come_from_facebook_page,int64
4,does_he_she_followed_our_page,int64
5,did_he_she_hear_of_our_shop_before,int64
6,is_local,int64
7,mobile_name_galaxy_m35_5g_8_128,bool
8,mobile_name_galaxy_s24_ultra_12_256,bool
9,mobile_name_moto_g85_5g_8_128,bool


### Train/test split

In [11]:
# Hold out 20% for testing; stratify on y so both splits keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Standardize features (scaler fit on the training split only)

In [12]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so nothing from the test set leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Save scaler
# joblib.dump does not create missing parent folders, so make sure the output
# directory exists first; otherwise a fresh checkout fails here.
scaler_path = "models/returning_customer_scaler.pkl"
Path(scaler_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to {scaler_path}")

Scaler saved to models/returning_customer_scaler.pkl


In [14]:
# Oversample the minority class on the scaled training split only;
# the test split is never resampled.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_smote, y_train_smote = resampled
print("After SMOTE:", Counter(y_train_smote))

After SMOTE: Counter({0: 5341, 1: 5341})


In [15]:
# Inspect the oversampled training features (a NumPy array of standardized values).
# NOTE(review): the output contains a fractional value (0.3098…) in the second
# column, which otherwise takes only two values in the rows shown, so SMOTE
# appears to interpolate a binary feature. SMOTENC may fit better — confirm.
X_train_smote

array([[ 0.82942455,  1.00169253, -0.18326668, ..., -0.98707738,
        -1.65899025, -0.62712431],
       [-0.63455016, -0.99831033, -0.34417647, ...,  1.52180618,
         0.82920292,  1.59458019],
       [ 1.0385638 ,  1.00169253, -0.10128864, ...,  1.02002946,
         1.38213474,  1.59458019],
       ...,
       [-0.32430527, -0.99831033, -0.05941291, ..., -0.48530067,
        -1.65899025, -0.62712431],
       [ 0.519665  ,  0.30984664, -0.24559882, ...,  1.52180618,
        -1.47816011,  1.59458019],
       [-0.38645839, -0.99831033,  0.07989988, ...,  1.26510192,
         0.26986229,  1.59458019]])

In [16]:
# Inspect the oversampled training labels
y_train_smote

0        0
1        0
2        1
3        0
4        1
        ..
10677    1
10678    1
10679    1
10680    1
10681    1
Name: did_he_she_buy_any_mobile_before, Length: 10682, dtype: int64

### Define and Train Models

In [17]:
# Negative-to-positive ratio of the (non-resampled) training labels, computed
# once and shared by the estimators that take an explicit class weight.
class_counts = y_train.value_counts()
neg_pos_ratio = class_counts[0] / class_counts[1]

# Candidate classifiers keyed by display name. The text in parentheses names the
# imbalance strategy; the training loop fits any model whose key contains
# "SMOTE" on the oversampled data and every other model on the scaled split.
models = {
    "Logistic Regression (SMOTE)": LogisticRegression(random_state=42),
    "Random Forest (class_weight)": RandomForestClassifier(
        class_weight="balanced",
        random_state=42,
    ),
    "XGBoost (scale_pos_weight)": XGBClassifier(
        eval_metric="logloss",
        scale_pos_weight=neg_pos_ratio,
        random_state=42,
    ),
    "LightGBM (is_unbalance)": LGBMClassifier(is_unbalance=True, random_state=42),
    "CatBoost (class_weights)": CatBoostClassifier(
        verbose=0,
        class_weights=[1.0, neg_pos_ratio],
        # Seeded explicitly like the other estimators, so the run does not rely
        # on the library's default seed.
        random_state=42,
    ),
}

### Train, predict and evaluate

In [18]:
# joblib.dump does not create missing parent folders, so make sure the output
# directory exists before any model is persisted.
model_path = "models/returning_customer.pkl"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

for name, model in models.items():
    print(f"\n==== {name} ====")

    # Only the estimators labelled "SMOTE" train on the oversampled set; the
    # others are fit on the scaled, non-resampled training split.
    uses_smote = "SMOTE" in name
    if uses_smote:
        fit_X, fit_y = X_train_smote, y_train_smote
    else:
        fit_X, fit_y = X_train_scaled, y_train
    model.fit(fit_X, fit_y)

    # Evaluation always uses the scaled, non-resampled test split
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    # NOTE(review): CatBoost is persisted unconditionally, not because it scored
    # best in this comparison — confirm that this is the intended model.
    if not uses_smote and "CatBoost" in name:
        joblib.dump(model, model_path)
        print(f"✅ CatBoost saved to {model_path}")

    print(classification_report(y_test, y_pred))
    print("ROC-AUC:", round(roc_auc_score(y_test, y_prob), 4))


==== Logistic Regression (SMOTE) ====
              precision    recall  f1-score   support

           0       0.76      0.54      0.63      1336
           1       0.26      0.49      0.34       439

    accuracy                           0.53      1775
   macro avg       0.51      0.52      0.49      1775
weighted avg       0.64      0.53      0.56      1775

ROC-AUC: 0.5131

==== Random Forest (class_weight) ====
              precision    recall  f1-score   support

           0       0.75      0.98      0.85      1336
           1       0.20      0.01      0.03       439

    accuracy                           0.74      1775
   macro avg       0.48      0.50      0.44      1775
weighted avg       0.62      0.74      0.65      1775

ROC-AUC: 0.5226

==== XGBoost (scale_pos_weight) ====
              precision    recall  f1-score   support

           0       0.75      0.71      0.73      1336
           1       0.25      0.29      0.27       439

    accuracy                     