# Model Development

## Load and Inspect the Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
import warnings
warnings.filterwarnings("ignore", message=".*force_all_finite.*")

In [3]:
# Read the cleaned dataset from the project's data folder (path is relative to the working directory)
cleaned_dataset = pd.read_csv(filepath_or_buffer="data/cleaned_dataset.csv")

In [4]:
# Work on an independent (deep) copy so `cleaned_dataset` keeps the data exactly as loaded
dataset = cleaned_dataset.copy(deep=True)

## Model development for Returning Customer (Classification Model)

In [5]:
# Split the frame into a feature matrix and a target vector.
# NOTE(review): the section heading mentions "Returning Customer", but the target
# used here is the Facebook-page column — confirm which one is intended.
target_column = 'does_he/she_come_from_facebook_page'

# Identifier, date and location columns are excluded from the features,
# and the target itself must not leak into X.
excluded_columns = ['cus.id', 'date', 'cus._location', target_column]

X = dataset.drop(columns=excluded_columns)
y = dataset[target_column]

In [6]:
# Display the feature matrix to check which columns remain after the drop
X

Unnamed: 0,age,gender,sell_price,does_he/she_followed_our_page,did_he/she_buy_any_mobile_before,did_he/she_hear_of_our_shop_before,is_local,mobile_name_Galaxy M35 5G 8/128,mobile_name_Galaxy S24 Ultra 12/256,mobile_name_Moto G85 5G 8/128,...,mobile_name_Pixel 7a 8/128,mobile_name_Pixel 8 Pro 12/256,mobile_name_R-70 Turbo 5G 6/128,mobile_name_Redmi Note 12 Pro 8/128,mobile_name_Vivo T3x 5G 8/128,mobile_name_Vivo Y200 5G 6/128,mobile_name_iPhone 16 Pro 256GB,mobile_name_iPhone 16 Pro Max 1TB,mobile_name_iQOO Neo 9 Pro 5G 12/256,mobile_name_iQOO Z7 5G 6/128
0,49,0,17073.0,1,0,1,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,44,1,15546.0,0,0,1,0,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2,45,1,26516.0,0,0,1,0,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,46,1,21927.0,0,0,1,0,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,27,0,16718.0,0,0,1,1,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8866,34,1,21682.0,1,1,1,1,False,False,False,...,False,False,False,False,False,False,False,False,True,False
8867,48,1,26493.0,0,0,0,0,False,False,True,...,False,False,False,False,False,False,False,False,False,False
8868,30,0,16184.0,0,0,0,0,True,False,False,...,False,False,False,False,False,False,False,False,False,False
8869,22,0,16663.0,0,0,1,0,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Display the target series (binary 0/1 labels)
y

0       0
1       1
2       1
3       0
4       1
       ..
8866    1
8867    0
8868    0
8869    1
8870    1
Name: does_he/she_come_from_facebook_page, Length: 8871, dtype: int64

### Train/test split

In [8]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Standardize features

In [9]:
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so no test-set information reaches the scaler.
# Note: the whole frame is passed, so every column is scaled — including the
# 0/1 indicator and one-hot columns — and the results are NumPy arrays.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Display the scaled training array
X_train_scaled

array([[-1.47080707,  1.00197489, -0.02074348, ..., -0.24148435,
        -0.24181099, -0.24667015],
       [ 1.13548562,  1.00197489,  2.10910314, ..., -0.24148435,
        -0.24181099, -0.24667015],
       [ 1.23973733,  1.00197489, -0.07308004, ..., -0.24148435,
        -0.24181099, -0.24667015],
       ...,
       [ 1.13548562,  1.00197489, -0.18835158, ..., -0.24148435,
        -0.24181099, -0.24667015],
       [ 0.40572367, -0.998029  , -0.35923755, ..., -0.24148435,
        -0.24181099, -0.24667015],
       [-1.05380024,  1.00197489, -0.2151208 , ...,  4.14105513,
        -0.24181099, -0.24667015]])

In [11]:
# Display the scaled test array (transformed with the scaler fitted on the training split)
X_test_scaled

array([[-1.47080707, -0.998029  , -0.03106875, ..., -0.24148435,
        -0.24181099, -0.24667015],
       [ 0.09296855,  1.00197489, -0.66331404, ...,  4.14105513,
        -0.24181099, -0.24667015],
       [ 0.71847879,  1.00197489, -0.16529727, ..., -0.24148435,
        -0.24181099, -0.24667015],
       ...,
       [-0.11553487,  1.00197489,  1.41168299, ..., -0.24148435,
        -0.24181099, -0.24667015],
       [ 1.65674416,  1.00197489, -0.46347362, ..., -0.24148435,
        -0.24181099, -0.24667015],
       [ 0.71847879, -0.998029  , -0.28570414, ..., -0.24148435,
        -0.24181099, -0.24667015]])

### Define Models

In [12]:
# Candidate classifiers, each configured to compensate for class imbalance.
#
# The ratio of negative (0) to positive (1) training labels is computed once and
# shared by the models that take an explicit positive-class weight, instead of
# re-running value_counts() for every use.
class_counts = y_train.value_counts()
neg_pos_ratio = float(class_counts.loc[0] / class_counts.loc[1])

models = {
    # scikit-learn estimators reweight classes internally via class_weight="balanced"
    "Logistic Regression": LogisticRegression(class_weight="balanced", random_state=42),
    "Random Forest": RandomForestClassifier(class_weight="balanced", random_state=42),
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        scale_pos_weight=neg_pos_ratio,
        random_state=42
    ),
    "LightGBM": LGBMClassifier(
        is_unbalance=True,
        random_state=42
    ),
    "CatBoost": CatBoostClassifier(
        verbose=0,
        # weight 1.0 for class 0, negative/positive ratio for class 1
        class_weights=[1.0, neg_pos_ratio],
        # seeded explicitly so every model in the comparison uses the same seed
        random_state=42
    )
}

### Train, predict and evaluate

In [13]:
# Fit every candidate on the scaled training split and score it on the held-out split.
for name, model in models.items():
    print(f"\n==== {name} ====")

    model.fit(X_train_scaled, y_train)

    # Hard labels feed the classification report; the second predict_proba
    # column (probability of class 1) feeds ROC-AUC.
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    report_text = classification_report(y_test, y_pred)
    print(report_text)

    roc_auc_value = round(roc_auc_score(y_test, y_prob), 4)
    print("ROC-AUC:", roc_auc_value)


==== Logistic Regression ====
              precision    recall  f1-score   support

           0       0.64      0.48      0.55      1157
           1       0.34      0.50      0.40       618

    accuracy                           0.49      1775
   macro avg       0.49      0.49      0.48      1775
weighted avg       0.54      0.49      0.50      1775

ROC-AUC: 0.4948

==== Random Forest ====
              precision    recall  f1-score   support

           0       0.65      0.81      0.72      1157
           1       0.35      0.19      0.25       618

    accuracy                           0.59      1775
   macro avg       0.50      0.50      0.49      1775
weighted avg       0.55      0.59      0.56      1775

ROC-AUC: 0.5054

==== XGBoost ====
              precision    recall  f1-score   support

           0       0.64      0.59      0.62      1157
           1       0.33      0.38      0.36       618

    accuracy                           0.52      1775
   macro avg       0.