# CTR prediction on the Criteo Ad Dataset using DeepFM, WideDeep and DCN

In [1]:
!pip install torch_rechub

Collecting torch_rechub
  Downloading torch-rechub-0.0.2.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting annoy>=1.17.0 (from torch_rechub)
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.7.0->torch_rechub)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.7.0->torch_rechub)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.7.0->torch_rechub)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.7.0->torch_rechub)
  Usin

In [2]:
import warnings

# Suppress all warnings so the cell outputs stay readable.
# NOTE(review): this is a global filter with no category restriction, so it also hides
# warnings raised by the libraries used further down.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import torch
from torch_rechub.models.ranking import WideDeep, DeepFM, DCN
from torch_rechub.trainers import CTRTrainer
from torch_rechub.basic.features import DenseFeature, SparseFeature
from torch_rechub.utils.data import DataGenerator
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
# Seed PyTorch's global random number generator with a fixed value (2024).
torch.manual_seed(2024)

<torch._C.Generator at 0x7cda01f3ae70>

### dataset
This dataset is an online advertising dataset released by Criteo Labs. It contains millions of click feedback records from displayed ads, which can be used as a benchmark for Click-Through Rate (CTR) prediction.

The dataset has 40 columns: the first column is the label (1 = the ad was clicked, 0 = it was not), followed by 39 features — 13 dense (numerical) features `I1`–`I13` and 26 sparse (categorical) features `C1`–`C26`.

In [4]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Location of the Criteo sample CSV inside the mounted Google Drive.
data_path='/content/drive/MyDrive/Colab Notebooks/criteo_sample.csv'


In [6]:
# Load the sample CSV into a DataFrame and preview its first rows.
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,...,e5ba7672,25c88e42,21ddcdc9,b1252a9d,0e8585d2,,32c7478e,0d4a6d1a,001f3601,92c878de
1,0,0.0,-1,63.0,40.0,1470.0,61.0,4.0,37.0,46.0,...,e5ba7672,d3303ea5,21ddcdc9,b1252a9d,7633c7c8,,32c7478e,17f458f7,001f3601,71236095
2,0,0.0,370,4.0,1.0,1787.0,65.0,14.0,25.0,489.0,...,3486227d,642f2610,55dd3565,b1252a9d,5c8dc711,,423fab69,45ab94c8,2bf691b1,c84c4aec
3,1,19.0,10,30.0,10.0,1.0,3.0,33.0,47.0,126.0,...,e5ba7672,a78bd508,21ddcdc9,5840adea,c2a93b37,,32c7478e,1793a828,e8b83407,2fede552
4,0,0.0,0,36.0,22.0,4684.0,217.0,9.0,35.0,135.0,...,e5ba7672,7ce63c71,,,af5dc647,,dbb486d7,1793a828,,


In [7]:
data.shape

(115, 40)

### feature engineering

- Dense Features: Numerical features, such as salary and age. Two operations are performed on dense features:
  - MinMaxScaler normalization: This scales the values to be within the range [0,1].
  - Discretization: each dense feature is additionally bucketed into an integer and stored as a new sparse feature named `<feature>_sparse`.
  
- Sparse Features: Categorical features, such as gender and education level. Sparse features are directly encoded using a LabelEncoder, which maps the original categorical strings to numerical values. In the model, an embedding vector is generated for each value.

In [8]:
# Partition the columns by name prefix: "I*" are dense features, "C*" are sparse features.
dense_cols = [name for name in data.columns if name.startswith("I")]
sparse_cols = [name for name in data.columns if name.startswith("C")]

# Replace missing values: 0 for dense columns, the placeholder string '-996' for sparse columns.
data[dense_cols] = data[dense_cols].fillna(0)
data[sparse_cols] = data[sparse_cols].fillna('-996')

In [9]:
len(dense_cols),len(sparse_cols)

(13, 26)

In [10]:
def convert_numeric_feature(val):
    """Discretize a numeric value into an integer bucket.

    The value is first truncated to an int ``v``. Values greater than 2 map to
    ``int(log(v) ** 2)``; every other value maps to ``v - 2``.
    """
    truncated = int(val)
    # Small (and non-positive) values cannot go through the log, so they are only shifted.
    if truncated <= 2:
        return truncated - 2
    return int(np.log(truncated) ** 2)

# Add a discretized "<col>_sparse" companion column for every dense feature.
for col in tqdm(dense_cols):
    sparse_name = col + "_sparse"
    # Register the new column only once so that re-running this cell does not
    # leave duplicate names in sparse_cols.
    if sparse_name not in sparse_cols:
        sparse_cols.append(sparse_name)
    data[sparse_name] = data[col].apply(convert_numeric_feature)



100%|██████████| 13/13 [00:00<00:00, 1026.29it/s]


In [11]:
# for dense features: rescale every dense column with a MinMaxScaler
# NOTE(review): the scaler is fit on the whole table, before the train/val/test split
# performed later in the notebook, so validation/test rows influence the scaling.
scaler = MinMaxScaler()
data[dense_cols] = scaler.fit_transform(data[dense_cols])

In [12]:
# for sparse features: fit a fresh LabelEncoder per column and overwrite the column with its integer codes
for col in tqdm(sparse_cols):
    data[col] = LabelEncoder().fit_transform(data[col])

100%|██████████| 39/39 [00:00<00:00, 3144.10it/s]


In [13]:
data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,I4_sparse,I5_sparse,I6_sparse,I7_sparse,I8_sparse,I9_sparse,I10_sparse,I11_sparse,I12_sparse,I13_sparse
0,0,0.0,0.000349,0.036945,0.310345,0.003922,0.137464,0.023256,0.653061,0.046077,...,12,34,24,3,14,15,0,1,0,12
1,0,0.0,0.0,0.02238,0.45977,0.002898,0.059051,0.023256,0.755102,0.057285,...,15,30,17,3,15,16,0,1,0,15
2,0,0.0,0.129449,0.001421,0.011494,0.003522,0.062924,0.081395,0.510204,0.608966,...,1,33,18,8,12,34,0,5,0,12
3,1,0.513514,0.003838,0.010657,0.114943,2e-06,0.002904,0.19186,0.959184,0.156912,...,7,1,3,14,16,24,3,4,0,2
4,0,0.0,0.000349,0.012789,0.252874,0.009233,0.210068,0.052326,0.714286,0.16812,...,11,43,27,6,14,25,0,1,0,16


In [14]:
len(dense_cols), len(sparse_cols)

(13, 39)

In [15]:
# One DenseFeature per dense column; one SparseFeature (embed_dim=16) per sparse column,
# with the vocabulary size taken from the number of distinct encoded values in that column.
dense_features = [DenseFeature(feature_name) for feature_name in dense_cols]
sparse_features = [SparseFeature(feature_name, vocab_size=data[feature_name].nunique(), embed_dim=16) for feature_name in sparse_cols]

# Separate the target from the inputs without mutating `data`, so that re-running this
# cell does not fail on a missing "label" column.
y = data["label"]
x = data.drop(columns=["label"])

In [16]:
x.shape,y.shape

((115, 52), (115,))

In [17]:
# split_ratio=[0.7, 0.1]: fractions of the rows used for train and validation;
# the remaining rows form the test set (see the sample counts printed below).
dg = DataGenerator(x, y)
train_dataloader, val_dataloader, test_dataloader = dg.generate_dataloader(split_ratio=[0.7, 0.1], batch_size=256, num_workers=8)

the samples of train : val : test are  80 : 11 : 24


### Train and evaluate DeepFM

In [18]:
from torch_rechub.models.ranking import DeepFM
from torch_rechub.trainers import CTRTrainer

# Library DeepFM: the deep part receives dense + sparse features, the FM part only the sparse ones.
model = DeepFM(
        deep_features=dense_features+sparse_features,
        fm_features=sparse_features,
        mlp_params={"dims": [256, 128], "dropout": 0.2, "activation": "relu"},
    )

# NOTE(review): n_epoch=1 with earlystop_patience=3 — presumably early stopping can never
# trigger within a single epoch; confirm the intended number of epochs.
# NOTE(review): every trainer in this notebook uses model_path='./' — check whether the
# saved weights of the different models overwrite each other.
ctr_trainer = CTRTrainer(
    model,
    optimizer_params={"lr": 1e-4, "weight_decay": 1e-5},
    n_epoch=1,
    earlystop_patience=3,
    device='cpu',
    model_path='./',
)


In [19]:
# Fit on the training loader (with the validation loader passed for per-epoch validation),
# then compute and print the AUC on the test loader.
ctr_trainer.fit(train_dataloader, val_dataloader)

auc = ctr_trainer.evaluate(ctr_trainer.model, test_dataloader)
print(f'test auc: {auc}')

epoch: 0


train: 100%|██████████| 1/1 [00:00<00:00,  1.41it/s]
validation: 100%|██████████| 1/1 [00:00<00:00,  1.76it/s]


epoch: 0 validation: auc: 0.6


validation: 100%|██████████| 1/1 [00:00<00:00,  1.83it/s]

test auc: 0.42105263157894735





### Train and evaluate WideDeep

In [20]:

# Library WideDeep: the wide part receives the dense features, the deep part the sparse features.
model = WideDeep(wide_features=dense_features, deep_features=sparse_features, mlp_params={"dims": [256, 128], "dropout": 0.2, "activation": "relu"})

# Same trainer settings as the other models in this notebook; `model` and `ctr_trainer`
# are rebound here, replacing the previously trained objects.
ctr_trainer = CTRTrainer(
    model,
    optimizer_params={"lr": 1e-4, "weight_decay": 1e-5},
    n_epoch=1,
    earlystop_patience=3,
    device='cpu',
    model_path='./',
)


In [21]:
# Fit the WideDeep trainer, then compute and print the AUC on the test loader.
ctr_trainer.fit(train_dataloader, val_dataloader)

auc = ctr_trainer.evaluate(ctr_trainer.model, test_dataloader)
print(f'test auc: {auc}')

epoch: 0


train: 100%|██████████| 1/1 [00:00<00:00,  1.70it/s]
validation: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]


epoch: 0 validation: auc: 0.4666666666666667


validation: 100%|██████████| 1/1 [00:00<00:00,  1.88it/s]

test auc: 0.24210526315789477





### Train and evaluate DCN

In [22]:
# Library DCN over all features (dense + sparse) with 3 cross layers and a [256, 128] MLP.
model = DCN(features=dense_features + sparse_features, n_cross_layers=3, mlp_params={"dims": [256, 128]})

# Same trainer settings as the other models in this notebook; `model` and `ctr_trainer`
# are rebound here, replacing the previously trained objects.
ctr_trainer = CTRTrainer(
    model,
    optimizer_params={"lr": 1e-4, "weight_decay": 1e-5},
    n_epoch=1,
    earlystop_patience=3,
    device='cpu',
    model_path='./',
)


In [23]:
# Fit the DCN trainer, then compute and print the AUC on the test loader.
ctr_trainer.fit(train_dataloader, val_dataloader)

auc = ctr_trainer.evaluate(ctr_trainer.model, test_dataloader)
print(f'test auc: {auc}')

epoch: 0


train: 100%|██████████| 1/1 [00:00<00:00,  1.89it/s]
validation: 100%|██████████| 1/1 [00:00<00:00,  2.96it/s]


epoch: 0 validation: auc: 0.5333333333333333


validation: 100%|██████████| 1/1 [00:00<00:00,  2.83it/s]

test auc: 0.49473684210526314





### Self-designed model: a hand-written DeepFM

In [24]:
from torch_rechub.basic.layers import FM, MLP, LR, EmbeddingLayer

In [25]:
class MyDeepFM(torch.nn.Module):
  """Hand-written DeepFM: linear term + FM term + MLP term, passed through a sigmoid.

  Args:
    deep_features: features whose embeddings are fed to the MLP.
    fm_features: features used by the linear (first-order) and FM (second-order) parts.
    mlp_params: keyword arguments forwarded to ``MLP`` after the input width.
  """

  def __init__(self, deep_features, fm_features, mlp_params):
    super().__init__()
    self.deep_features = deep_features
    self.fm_features = fm_features
    # Input widths are the sums of the per-feature embedding sizes.
    self.deep_dims = sum(feature.embed_dim for feature in deep_features)
    self.fm_dims = sum(feature.embed_dim for feature in fm_features)
    self.linear = LR(self.fm_dims)  # first-order feature interaction
    self.fm = FM(reduce_sum=True)  # second-order feature interaction
    # One embedding layer serves both the deep and the FM inputs.
    self.embedding = EmbeddingLayer(deep_features + fm_features)
    self.mlp = MLP(self.deep_dims, **mlp_params)

  def forward(self, x):
    """Return the sigmoid of the summed linear, FM and MLP outputs for the batch ``x``."""
    deep_input = self.embedding(x, self.deep_features, squeeze_dim=True)  # [batch_size, deep_dims]
    fm_input = self.embedding(x, self.fm_features, squeeze_dim=False)  # [batch_size, num_fields, embed_dim]

    linear_term = self.linear(fm_input.flatten(start_dim=1))
    fm_term = self.fm(fm_input)
    deep_term = self.mlp(deep_input)  # [batch_size, 1]

    logits = (linear_term + fm_term + deep_term).squeeze(1)
    return torch.sigmoid(logits)

In [26]:
# Instantiate the hand-written MyDeepFM with the same arguments as the library DeepFM cell:
# dense + sparse features for the deep part, sparse features for the FM part.
model = MyDeepFM(
        deep_features=dense_features+sparse_features,
        fm_features=sparse_features,
        mlp_params={"dims": [256, 128], "dropout": 0.2, "activation": "relu"},
    )

# Same trainer settings as the other models in this notebook; `model` and `ctr_trainer`
# are rebound here, replacing the previously trained objects.
ctr_trainer = CTRTrainer(
    model,
    optimizer_params={"lr": 1e-4, "weight_decay": 1e-5},
    n_epoch=1,
    earlystop_patience=3,
    device='cpu',
    model_path='./',
)


In [27]:
# Fit the MyDeepFM trainer, then compute and print the AUC on the test loader.
ctr_trainer.fit(train_dataloader, val_dataloader)

auc = ctr_trainer.evaluate(ctr_trainer.model, test_dataloader)
print(f'test auc: {auc}')

epoch: 0


train: 100%|██████████| 1/1 [00:00<00:00,  2.56it/s]
validation: 100%|██████████| 1/1 [00:00<00:00,  2.61it/s]


epoch: 0 validation: auc: 0.7000000000000001


validation: 100%|██████████| 1/1 [00:00<00:00,  2.76it/s]

test auc: 0.43157894736842106



