In [1]:
import torch
import pandas as pd
import torch.nn as nn
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder, CountEncoder

# Data Preparation

In [None]:
# Read the training sample and, in the same expression, parse the `hour`
# column (two-digit year, month, day, hour) into pandas timestamps.
df = pd.read_parquet('dataset/train-500k.parquet').assign(
  hour=lambda frame: pd.to_datetime(frame['hour'], format='%y%m%d%H')
)
df.head()

In [None]:
# Categorical columns that the encoding loop walks over, in processing order.
columns = [
  # site_* fields
  'site_id',
  'site_domain',
  'site_category',
  # app_* fields
  'app_id',
  'app_domain',
  'app_category',
  # device field
  'device_model',
]

# Distinct-value cut-offs that pick the encoding strategy per column:
#   fewer than `low_cardinality_threshold` values       -> one-hot encoding
#   fewer than `moderate_cardinality_threshold` values  -> count encoding
#   anything larger                                     -> target encoding
low_cardinality_threshold = 100
moderate_cardinality_threshold = 1000

In [None]:
# Build the model-ready frame from a copy so the raw `df` stays untouched.
df_processed = df.copy()
df_processed['time_of_day'] = df_processed['hour'].dt.hour

# One-hot blocks are collected here and attached with a single concat after
# the loop, instead of re-copying the whole frame once per encoded column.
one_hot_frames = []
one_hot_columns = []

# NOTE(review): every encoder below is fit on the full frame, before any
# train/test split. For the target encoder this lets `click` statistics from
# rows later used for evaluation leak into the features -- confirm whether
# the encoders should be fit on the training split only.
for col in columns:
  unique_values_count = df[col].nunique()

  if unique_values_count < low_cardinality_threshold:
    # Low cardinality: one-hot encode; the source column is dropped below.
    one_hot_frames.append(pd.get_dummies(df[col], prefix=col))
    one_hot_columns.append(col)
  elif unique_values_count < moderate_cardinality_threshold:
    # Moderate cardinality: replace each category by its frequency count.
    encoder = CountEncoder()
    df_processed[col] = encoder.fit_transform(df[col])
  else:
    # High cardinality: replace each category by a statistic of the target.
    encoder = TargetEncoder()
    df_processed[col] = encoder.fit_transform(df[col], df['click'])

# Drop the one-hot source columns and append their dummy blocks in loop
# order, which yields the same column layout as concatenating per column.
df_processed = pd.concat(
  [df_processed.drop(columns=one_hot_columns), *one_hot_frames],
  axis=1,
)

df_processed.head()

In [None]:
# Columns that are still object-typed after encoding are left out of the
# numeric feature matrix.
object_columns = df_processed.select_dtypes(include=['object']).columns

# Build the exclusion list in a single expression. `list.append` mutates the
# list in place and returns None, so assigning its result would leave
# `excluded_columns` as None and make `Index.difference` fail; unpacking also
# adds each object column individually rather than the Index as one element.
excluded_columns = ['id', 'click', 'hour', *object_columns]

# `click` is the prediction target; `id` and the raw `hour` timestamp are
# not used as features. `difference` returns the remaining labels sorted.
feature_columns = df_processed.columns.difference(excluded_columns)

In [None]:
# Scale the selected feature columns. Despite its name, `processed_df` holds
# the transformer's output rather than the `df_processed` DataFrame.
# NOTE(review): scikit-learn's Normalizer rescales each sample (row) to unit
# norm; it does not standardise each feature column -- confirm this is the
# intended preprocessing.
normalizer = Normalizer()
processed_df = normalizer.fit_transform(df_processed.loc[:, feature_columns])
# TODO: train/test split is currently disabled:
# X_train, X_test, y_train, y_test = train_test_split(processed_df, df['click'], test_size=0.2, random_state=42)

# Model Design

In [None]:
class CNN(nn.Module):
  """1-D convolutional classifier over embedded integer index sequences.

  Pipeline: embedding -> [Conv1d(k=3) -> MaxPool1d(2) -> ReLU] x 2 ->
  flatten -> Linear(128) -> ReLU -> Linear(num_classes).

  Args:
    input_dim: Number of rows in the embedding table; every index passed to
      `forward` must lie in `[0, input_dim)`.
    embedding_dim: Size of each embedding vector, which is also the number of
      input channels of the first convolution.
    num_classes: Number of output logits.
    seq_len: Length of the index sequences passed to `forward`. Defaults to
      `input_dim`, matching the original sizing assumption that the sequence
      length equals `input_dim`.

  Raises:
    ValueError: If `seq_len` is too short to survive both conv/pool stages
      (fewer than 10 positions).
  """

  def __init__(self, input_dim, embedding_dim, num_classes, seq_len=None):
    super().__init__()

    if seq_len is None:
      seq_len = input_dim

    kernel_size = 3
    pool_size = 2

    # Track the temporal length through the network so the first linear layer
    # is sized to what `flatten` really produces. An unpadded stride-1
    # convolution shortens the sequence by `kernel_size - 1`; each max-pool
    # floor-divides it by `pool_size`.
    conv1_len = seq_len - kernel_size + 1
    pool1_len = conv1_len // pool_size
    conv2_len = pool1_len - kernel_size + 1
    pool2_len = conv2_len // pool_size
    if pool2_len < 1:
      raise ValueError(
        f"seq_len={seq_len} is too short for two kernel-3 convolutions "
        "each followed by a size-2 max-pool; need seq_len >= 10"
      )

    self.embedding = nn.Embedding(input_dim, embedding_dim)

    self.conv1 = nn.Conv1d(
      in_channels=embedding_dim,
      out_channels=32,
      kernel_size=kernel_size
    )

    self.pool1 = nn.MaxPool1d(kernel_size=pool_size)

    self.conv2 = nn.Conv1d(
      in_channels=32,
      out_channels=64,
      kernel_size=kernel_size
    )

    self.pool2 = nn.MaxPool1d(kernel_size=pool_size)

    self.flatten = nn.Flatten()

    self.fc1 = nn.Linear(in_features=64 * pool2_len, out_features=128)
    self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

    self.relu = nn.ReLU()

  def forward(self, x):
    """Return class logits for a batch of index sequences.

    Args:
      x: Integer tensor of shape `(batch, seq_len)` holding embedding indices.

    Returns:
      Tensor of shape `(batch, num_classes)` with unnormalised scores.
    """
    x = self.embedding(x)       # (batch, seq_len, embedding_dim)
    # Conv1d expects channels before length, so move the embedding axis.
    x = x.permute(0, 2, 1)      # (batch, embedding_dim, seq_len)
    x = self.relu(self.pool1(self.conv1(x)))
    x = self.relu(self.pool2(self.conv2(x)))

    x = self.flatten(x)
    x = self.relu(self.fc1(x))
    x = self.fc2(x)
    return x