# CTR prediction for Amazon-Electronics using DIN

In [1]:
!pip install torch_rechub

Collecting torch_rechub
  Downloading torch-rechub-0.0.2.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting annoy>=1.17.0 (from torch_rechub)
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.7.0->torch_rechub)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.7.0->torch_rechub)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.7.0->torch_rechub)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.7.0->torch_rechub)
  Usin

In [2]:
import warnings
# Suppress all warnings
warnings.filterwarnings('ignore')

In [3]:
import random

import torch
import torch_rechub
import pandas as pd
import numpy as np
import tqdm
import sklearn

# Report the torch build and whether a CUDA device is visible to this runtime.
print(torch.__version__, torch.cuda.is_available())

# Seed every random number generator available here, not only torch's.
# torch.manual_seed alone leaves Python's `random` module and NumPy unseeded.
# NOTE(review): torch_rechub's data helpers presumably draw from one of those for
# shuffling / negative sampling — confirm; if so, seeding them is what makes the
# generated splits repeatable across "Restart & Run All".
SEED = 2024
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

2.3.1+cu121 False


<torch._C.Generator at 0x7c7d54185450>

In [4]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the CSV read in the next cell lives under this mount.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [5]:
# Absolute path of the interaction CSV on the mounted Google Drive.
# NOTE(review): hard-coded, user-specific location — adjust when running elsewhere.
file_path = '/content/drive/MyDrive/Colab Notebooks/amazon_electronics_sample.csv'

# Load the whole file into a DataFrame with pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=file_path)

In [6]:
# Preview the first rows of the table loaded from the CSV.
data.head()

Unnamed: 0,user_id,item_id,time,cate_id
0,0,13179,1400457600,584
1,0,29247,1400457600,339
2,0,28326,1400457600,587
3,0,17993,1400457600,513
4,0,62275,1400457600,115


In [7]:
# (rows, columns) of the loaded table.
data.shape

(100, 4)

### feature engineering
- Dense Features: Numerical features, such as salary and age. In DIN, we do not use this type of feature.
  
- Sparse Features: Categorical features, such as gender and education level. Sparse features are directly encoded using a LabelEncoder, which maps the original categorical strings to numerical values. In the model, an embedding vector is generated for each value.
  
- Sequence Features: Sequential features, such as the sequence of item IDs a user has clicked on in the past or a sequence of stores visited.

In [8]:
from torch_rechub.utils.data import create_seq_features

# Turn the interaction log into train / validation / test sample tables whose rows carry
# a history sequence for each column listed in `sequence_columns`.
# NOTE(review): the later feature-definition cell sizes its vocabularies from `data[...].max()`,
# which only matches the small ids seen in `train` if this call re-encodes `data` in place —
# confirm against the torch_rechub source before reordering cells.
# NOTE(review): drop_short=0 presumably keeps users with very short histories — confirm.
sequence_columns = ['item_id', 'cate_id']
train, val, test = create_seq_features(
    data,
    seq_feature_col=sequence_columns,
    drop_short=0,
)


In [9]:
# (rows, columns) of each generated split.
train.shape, val.shape, test.shape

((134, 6), (32, 6), (32, 6))

In [10]:
# Preview the first generated training samples.
train.head()

Unnamed: 0,user_id,history_item,history_cate,target_item,target_cate,label
0,15,"[86, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",16,26,0
1,16,"[16, 23, 65, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[26, 26, 32, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",33,26,1
2,3,"[54, 61, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[24, 67, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",67,51,1
3,14,"[29, 51, 52, 71, 92, 95, 0, 0, 0, 0, 0, 0, 0, ...","[36, 39, 34, 11, 28, 38, 0, 0, 0, 0, 0, 0, 0, ...",78,56,0
4,3,"[54, 61, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[24, 67, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",74,29,0


In [11]:
# Length of the item-history sequence stored in the first training row.
len(train.iloc[0]['history_item'])

50

In [12]:
# Length of the category-history sequence stored in the first training row.
len(train.iloc[0]['history_cate'])

50

### categorical features and sequence features in DIN model
- For categorical features, we want the model to input them into the embedding layer.
- For sequence features, we not only want the model to input them into the embedding layer but also to calculate target-attention scores.

We use `user_id`, the target item (`target_item`), and the target category (`target_cate`) as the categorical features, and the user's historical sequences of item IDs (`history_item`) and category IDs (`history_cate`) as the sequence features.

In [13]:
from torch_rechub.basic.features import DenseFeature, SparseFeature, SequenceFeature

# Largest id present in each column of `data`; the vocabulary sizes below are derived from these.
n_users = data["user_id"].max()
n_items = data["item_id"].max()
n_cates = data["cate_id"].max()

# Every embedding table in this notebook uses the same width.
EMBED_DIM = 64

# Categorical inputs: one SparseFeature per column, each with vocab_size = max id + 2.
features = [
    SparseFeature(column, vocab_size=max_id + 2, embed_dim=EMBED_DIM)
    for column, max_id in (
        ("target_item", n_items),
        ("target_cate", n_cates),
        ("user_id", n_users),
    )
]
# The model is given this very same list object as its target features (an alias, not a copy).
target_features = features

# Sequence inputs: each history column is declared as sharing its embedding with the matching target column.
history_features = [
    SequenceFeature(column, vocab_size=max_id + 2, embed_dim=EMBED_DIM, pooling="concat", shared_with=target_column)
    for column, max_id, target_column in (
        ("history_item", n_items, "target_item"),
        ("history_cate", n_cates, "target_cate"),
    )
]

In [14]:
from torch_rechub.utils.data import df_to_dict, DataGenerator

# Pass each split through df_to_dict and rebind the same three names to the results.
# The following cell indexes these results by the "label" key.
# NOTE: the names are overwritten, so re-running this cell feeds the converted objects
# back into df_to_dict; re-create the splits first if it has to be executed again.
train, val, test = map(df_to_dict, (train, val, test))

In [15]:
# Separate the labels from the model inputs for each split.
# The input dicts are built as new objects instead of deleting "label" from the originals,
# so `train` / `val` / `test` are left untouched and this cell can be re-run without a KeyError.
LABEL_KEY = "label"

train_y, val_y, test_y = train[LABEL_KEY], val[LABEL_KEY], test[LABEL_KEY]

train_x, val_x, test_x = (
    {name: values for name, values in split.items() if name != LABEL_KEY}
    for split in (train, val, test)
)

In [17]:
# Wrap the training inputs and labels, then request the train / validation / test loaders in one call.
dg = DataGenerator(train_x, train_y)
dataloaders = dg.generate_dataloader(
    x_val=val_x,
    y_val=val_y,
    x_test=test_x,
    y_test=test_y,
    batch_size=16,
)
train_dataloader, val_dataloader, test_dataloader = dataloaders

### train data with DIN

In [18]:
from torch_rechub.models.ranking import DIN
from torch_rechub.trainers import CTRTrainer

# Layer widths handed to the model's MLP and to its attention MLP (kept as two separate objects).
prediction_mlp_params = {"dims": [256, 128]}
attention_params = {"dims": [256, 128]}

# Assemble DIN from the feature definitions declared earlier in the notebook.
model = DIN(
    features=features,
    history_features=history_features,
    target_features=target_features,
    mlp_params=prediction_mlp_params,
    attention_mlp_params=attention_params,
)



In [19]:
# Configure the trainer and fit on the training loader, validating after each epoch.
# NOTE(review): earlystop_patience (4) is larger than n_epoch (3), so early stopping
# presumably can never trigger in this run — confirm and adjust one of the two.
# NOTE(review): the device is pinned to 'cpu' regardless of what torch.cuda.is_available() reports.
ctr_trainer = CTRTrainer(
    model,
    optimizer_params={"lr": 1e-3, "weight_decay": 1e-3},
    n_epoch=3,
    earlystop_patience=4,
    device='cpu',
    model_path='./',
)
ctr_trainer.fit(train_dataloader, val_dataloader)

# Score the trainer's model on the held-out test loader and report the metric.
auc = ctr_trainer.evaluate(ctr_trainer.model, test_dataloader)
print(f'test auc: {auc}')

epoch: 0


train: 100%|██████████| 9/9 [00:00<00:00, 15.48it/s]
validation: 100%|██████████| 2/2 [00:00<00:00,  6.69it/s]


epoch: 0 validation: auc: 0.3828125
epoch: 1


train: 100%|██████████| 9/9 [00:00<00:00, 19.09it/s]
validation: 100%|██████████| 2/2 [00:00<00:00,  7.28it/s]


epoch: 1 validation: auc: 0.38671875
epoch: 2


train: 100%|██████████| 9/9 [00:00<00:00, 18.82it/s]
validation: 100%|██████████| 2/2 [00:00<00:00,  6.55it/s]


epoch: 2 validation: auc: 0.3671875


validation: 100%|██████████| 2/2 [00:00<00:00,  6.65it/s]

test auc: 0.82421875



