Dataset link: https://www.kaggle.com/datasets/marius2303/ad-click-prediction-dataset

In [1]:
!gdown --id 1SHoeE6p0RQADbTPHXI7G7be26XeFayco

Downloading...
From: https://drive.google.com/uc?id=1SHoeE6p0RQADbTPHXI7G7be26XeFayco
To: /content/ad_click_dataset.csv
100% 465k/465k [00:00<00:00, 44.1MB/s]


In [2]:
import pandas as pd
df = pd.read_csv('/content/ad_click_dataset.csv')

In [3]:
df.head()

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,,Desktop,Top,Shopping,Afternoon,1
1,3044,User3044,,Male,Desktop,Top,,,1
2,5912,User5912,41.0,Non-Binary,,Side,Education,Night,1
3,5418,User5418,34.0,Male,,,Entertainment,Evening,1
4,9452,User9452,39.0,Non-Binary,,,Social Media,Morning,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10000 non-null  int64  
 1   full_name         10000 non-null  object 
 2   age               5234 non-null   float64
 3   gender            5307 non-null   object 
 4   device_type       8000 non-null   object 
 5   ad_position       8000 non-null   object 
 6   browsing_history  5218 non-null   object 
 7   time_of_day       8000 non-null   object 
 8   click             10000 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 703.2+ KB


Turn 1

# Here's how you can preprocess your dataset and use PyCaret for an AutoML approach to predict ad clicks:

Step 1: Install PyCaret If you haven't installed PyCaret, you can do so using pip

In [5]:
pip install pycaret

Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m966.3 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn>1.4.0 (from pycaret)
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-2.0.2.tar.gz (165 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.8/165.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata 

Step 2: Load and Preprocess the Data

In [6]:
import pandas as pd
import numpy as np
from pycaret.classification import *

# Load the dataset
df = pd.read_csv('/content/ad_click_dataset.csv')

# Preprocessing
# Drop unnecessary columns
df = df.drop(columns=['id', 'full_name'])

# Handle missing values
# For categorical variables, we'll use 'Unknown' as a placeholder
categorical_features = ['gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']
for feature in categorical_features:
    df[feature] = df[feature].fillna('Unknown')

# For numerical variables, we'll use median
df['age'] = df['age'].fillna(df['age'].median())

# Encode categorical variables
# PyCaret will handle encoding, but for manual approach:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for column in categorical_features:
    df[column] = le.fit_transform(df[column])

# Normalize numerical features if needed (PyCaret will handle this, but for manual approach):
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df['age'] = scaler.fit_transform(df[['age']])

# Setup PyCaret environment
clf_setup = setup(data=df, target='click', session_id=123,
                  normalize=True,
                  html=False, # Use html=False to avoid interactive prompts
                  categorical_features=categorical_features)

                    Description             Value
0                    Session id               123
1                        Target             click
2                   Target type            Binary
3           Original data shape        (10000, 7)
4        Transformed data shape       (10000, 25)
5   Transformed train set shape        (7000, 25)
6    Transformed test set shape        (3000, 25)
7              Numeric features                 1
8          Categorical features                 5
9                    Preprocess              True
10              Imputation type            simple
11           Numeric imputation              mean
12       Categorical imputation              mode
13     Maximum one-hot encoding                25
14              Encoding method              None
15                    Normalize              True
16             Normalize method            zscore
17               Fold Generator   StratifiedKFold
18                  Fold Number                10


Step 3: Compare Models

In [7]:
# Compare models
best_model = compare_models()

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
xgboost         Extreme Gradient Boosting    0.7346  0.7659  0.9147  0.7391   
lightgbm  Light Gradient Boosting Machine    0.7134  0.7233  0.9400  0.7118   
dt               Decision Tree Classifier    0.7089  0.6922  0.8048  0.7611   
rf               Random Forest Classifier    0.7017  0.7252  0.8464  0.7350   
et                 Extra Trees Classifier    0.6801  0.7216  0.7982  0.7333   
gbc          Gradient Boosting Classifier    0.6769  0.6387  0.9789  0.6728   
ada                  Ada Boost Classifier    0.6610  0.5800  0.9824  0.6610   
lr                    Logistic Regression    0.6503  0.5569  0.9967  0.6508   
ridge                    Ridge Classifier    0.6501  0.5568  0.9976  0.6506   
dummy                    Dummy Classifier    0.6500  0.5000  1.0000  0.6500   
lda          Linear Discriminant Analysis    0.6499  0.5568  0.9960  0.6507   
knn                K Neighbors Classifier    0.6470 



Step 4: Finalize and Evaluate the Model

In [8]:
# Finalize the model
final_best = finalize_model(best_model)

# Evaluate the model
evaluate_model(final_best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Step 5: Save and Use the Model

In [9]:
# Save the model
save_model(final_best, 'ad_click_prediction_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None, include=['age'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['gender', 'device_type',
                                              'ad_position'...
                                importance_type=None,
                                interaction_constraints=None, learning_rate=Non