# Model Development

## Load and Inspect the Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from collections import Counter

In [2]:
import warnings
warnings.filterwarnings("ignore", message=".*force_all_finite.*")

In [3]:
# Load the cleaned CSV (path is relative to the notebook's working directory).
cleaned_dataset_path = "data/cleaned_dataset.csv"
cleaned_dataset = pd.read_csv(cleaned_dataset_path)

In [4]:
# Work on an independent (deep) copy so `cleaned_dataset` stays exactly as loaded
# while feature columns are added to `dataset` below.
dataset = cleaned_dataset.copy(deep=True)

## Model development for Returning Customer (Classification Model)

In [5]:
# Derive calendar features from the 'date' column.
# The explicit format means values are parsed strictly as YYYY-MM-DD.
parsed_dates = pd.to_datetime(dataset['date'], format='%Y-%m-%d')
weekday_index = parsed_dates.dt.dayofweek  # Monday = 0 ... Sunday = 6

dataset['date'] = parsed_dates
dataset['day_of_week'] = weekday_index
dataset['month'] = parsed_dates.dt.month
# Flag Saturday (5) and Sunday (6) as weekend: 1 if weekend, else 0
dataset['is_weekend'] = weekday_index.isin([5, 6]).astype(int)

In [6]:
# Preview the first rows to check that day_of_week, month and is_weekend were added
dataset.head()

Unnamed: 0,cus.id,date,cus._location,age,gender,sell_price,does_he/she_come_from_facebook_page,does_he/she_followed_our_page,did_he/she_buy_any_mobile_before,did_he/she_hear_of_our_shop_before,...,mobile_name_Redmi Note 12 Pro 8/128,mobile_name_Vivo T3x 5G 8/128,mobile_name_Vivo Y200 5G 6/128,mobile_name_iPhone 16 Pro 256GB,mobile_name_iPhone 16 Pro Max 1TB,mobile_name_iQOO Neo 9 Pro 5G 12/256,mobile_name_iQOO Z7 5G 6/128,day_of_week,month,is_weekend
0,1,2024-05-27,Rangamati Sadar,49,0,17073.0,0,1,0,1,...,False,False,False,False,False,False,False,0,5,0
1,2,2024-05-27,Inside Rangamati,44,1,15546.0,1,0,0,1,...,True,False,False,False,False,False,False,0,5,0
2,3,2024-05-27,Rangamati Sadar,45,1,26516.0,1,0,0,1,...,False,False,False,False,False,False,False,0,5,0
3,4,2024-05-27,Rangamati Sadar,46,1,21927.0,0,0,0,1,...,False,False,False,False,False,False,False,0,5,0
4,5,2024-05-27,Outside Rangamati,27,0,16718.0,1,0,0,1,...,False,True,False,False,False,False,False,0,5,0


In [7]:
# Target: whether the customer came from the Facebook page.
# NOTE(review): the section heading says "Returning Customer", but the label
# used here is the Facebook-page flag — confirm which target is intended.
target_col = 'does_he/she_come_from_facebook_page'

# Identifier, raw date and raw location columns are excluded from the features.
non_feature_cols = ['cus.id', 'date', 'cus._location']

y = dataset[target_col]
X = dataset.drop(columns=non_feature_cols + [target_col])

In [8]:
# Display the feature matrix (the rendering is truncated for large frames)
X

Unnamed: 0,age,gender,sell_price,does_he/she_followed_our_page,did_he/she_buy_any_mobile_before,did_he/she_hear_of_our_shop_before,is_local,mobile_name_Galaxy M35 5G 8/128,mobile_name_Galaxy S24 Ultra 12/256,mobile_name_Moto G85 5G 8/128,...,mobile_name_Redmi Note 12 Pro 8/128,mobile_name_Vivo T3x 5G 8/128,mobile_name_Vivo Y200 5G 6/128,mobile_name_iPhone 16 Pro 256GB,mobile_name_iPhone 16 Pro Max 1TB,mobile_name_iQOO Neo 9 Pro 5G 12/256,mobile_name_iQOO Z7 5G 6/128,day_of_week,month,is_weekend
0,49,0,17073.0,1,0,1,0,False,False,False,...,False,False,False,False,False,False,False,0,5,0
1,44,1,15546.0,0,0,1,0,False,False,False,...,True,False,False,False,False,False,False,0,5,0
2,45,1,26516.0,0,0,1,0,False,False,False,...,False,False,False,False,False,False,False,0,5,0
3,46,1,21927.0,0,0,1,0,False,False,False,...,False,False,False,False,False,False,False,0,5,0
4,27,0,16718.0,0,0,1,1,False,False,False,...,False,True,False,False,False,False,False,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8866,34,1,21682.0,1,1,1,1,False,False,False,...,False,False,False,False,False,True,False,5,3,1
8867,48,1,26493.0,0,0,0,0,False,False,True,...,False,False,False,False,False,False,False,5,3,1
8868,30,0,16184.0,0,0,0,0,True,False,False,...,False,False,False,False,False,False,False,5,3,1
8869,22,0,16663.0,0,0,1,0,False,False,False,...,False,False,False,False,False,False,False,5,3,1


In [9]:
# Display the target series
y

0       0
1       1
2       1
3       0
4       1
       ..
8866    1
8867    0
8868    0
8869    1
8870    1
Name: does_he/she_come_from_facebook_page, Length: 8871, dtype: int64

### Train/test split

In [10]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Standardize features (scaler fit on the training split only)

In [11]:
# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits so the test data never influences the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Oversample with SMOTE on the scaled training data only; the test split is
# left untouched so evaluation reflects the original class balance.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_smote, y_train_smote = resampled

# Report the per-class counts after resampling
class_counts = Counter(y_train_smote)
print("After SMOTE:", class_counts)

After SMOTE: Counter({0: 4624, 1: 4624})


In [13]:
# Display the resampled, scaled training features
X_train_smote

array([[-1.47080707,  1.00197489, -0.02074348, ..., -1.49943593,
         1.38792448, -0.6269062 ],
       [ 1.13548562,  1.00197489,  2.10910314, ...,  0.51695623,
         1.38792448, -0.6269062 ],
       [ 1.23973733,  1.00197489, -0.07308004, ..., -0.49123985,
        -1.65889706, -0.6269062 ],
       ...,
       [ 0.00736813,  0.48904777, -0.09866215, ...,  1.52515231,
         1.10378006,  1.59513496],
       [ 0.26864476, -0.998029  , -0.52453109, ..., -0.16625779,
        -0.87155461, -0.6269062 ],
       [ 0.61422708, -0.998029  , -0.53325256, ..., -1.33509705,
        -1.4722116 , -0.6269062 ]])

In [14]:
# Display the resampled training labels
y_train_smote

0       0
1       0
2       1
3       1
4       0
       ..
9243    1
9244    1
9245    1
9246    1
9247    1
Name: does_he/she_come_from_facebook_page, Length: 9248, dtype: int64

### Define and Train Models

In [15]:
# Ratio of negative (0) to positive (1) labels in the original training split,
# used to up-weight the positive class in XGBoost and CatBoost.
label_counts = y_train.value_counts()
neg_to_pos_ratio = label_counts[0] / label_counts[1]

# Map display name -> unfitted estimator. The text in parentheses names the
# imbalance-handling strategy; the training loop checks for "SMOTE" in the
# name to decide which training set to use, so keep the keys in sync with it.
models = {}
models["Logistic Regression (SMOTE)"] = LogisticRegression(random_state=42)
models["Random Forest (class_weight)"] = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
)
models["XGBoost (scale_pos_weight)"] = XGBClassifier(
    eval_metric="logloss",
    scale_pos_weight=neg_to_pos_ratio,
    random_state=42,
)
models["LightGBM (is_unbalance)"] = LGBMClassifier(is_unbalance=True, random_state=42)
# NOTE(review): unlike the other models, no explicit random seed is passed
# here — confirm whether that is intentional.
models["CatBoost (class_weights)"] = CatBoostClassifier(
    verbose=0,
    class_weights=[1.0, neg_to_pos_ratio],
)

### Train, predict and evaluate

In [16]:
for name, model in models.items():
    print(f"\n==== {name} ====")

    # Models tagged "SMOTE" train on the oversampled data; all others train on
    # the original (scaled) training split and rely on their own class weighting.
    uses_smote = "SMOTE" in name
    train_features = X_train_smote if uses_smote else X_train_scaled
    train_labels = y_train_smote if uses_smote else y_train
    model.fit(train_features, train_labels)

    # Every model is evaluated on the same scaled, un-resampled test split.
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]  # second probability column (label 1)

    print(classification_report(y_test, y_pred))
    print("ROC-AUC:", round(roc_auc_score(y_test, y_prob), 4))


==== Logistic Regression (SMOTE) ====
              precision    recall  f1-score   support

           0       0.66      0.52      0.58      1157
           1       0.36      0.51      0.42       618

    accuracy                           0.51      1775
   macro avg       0.51      0.51      0.50      1775
weighted avg       0.56      0.51      0.52      1775

ROC-AUC: 0.5062

==== Random Forest (class_weight) ====
              precision    recall  f1-score   support

           0       0.65      0.93      0.77      1157
           1       0.39      0.08      0.14       618

    accuracy                           0.63      1775
   macro avg       0.52      0.51      0.45      1775
weighted avg       0.56      0.63      0.55      1775

ROC-AUC: 0.5044

==== XGBoost (scale_pos_weight) ====
              precision    recall  f1-score   support

           0       0.67      0.65      0.66      1157
           1       0.38      0.39      0.39       618

    accuracy                     