In [1]:
import torch
import pandas as pd
import torch.nn as nn
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder, CountEncoder

# Data Preparation

In [2]:
df = pd.read_parquet('dataset/train-500k.parquet')
df['hour'] = pd.to_datetime(df['hour'], format='%y%m%d%H')
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,2014-10-21,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,2014-10-21,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,2014-10-21,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,2014-10-21,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,2014-10-21,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [3]:
columns = [
  'site_id', 'site_domain', 'site_category', 
  'app_id', 'app_domain', 'app_category', 
  'device_model'
]

low_cardinality_threshold = 100
moderate_cardinality_threshold = 1000

In [4]:
df_processed = df.copy()
df_processed['time_of_day'] = df_processed['hour'].dt.hour

for col in columns:
  unique_values_count = df[col].nunique()
  
  if unique_values_count < low_cardinality_threshold:
    # Low cardinality: Apply one-hot encoding
    df_processed = pd.concat([df_processed, pd.get_dummies(df[col], prefix=col)], axis=1)
    df_processed.drop(columns=[col], inplace=True)
  elif low_cardinality_threshold <= unique_values_count < moderate_cardinality_threshold:
    # Moderate cardinality: Apply frequency encoding
    encoder = CountEncoder()
    df_processed[col] = encoder.fit_transform(df[col])
  else:
    # High cardinality: Apply target encoding
    encoder = TargetEncoder()
    df_processed[col] = encoder.fit_transform(df[col], df['click'])

df_processed.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,app_id,app_domain,device_id,...,app_category_8ded1f7a,app_category_8df2e842,app_category_a3c42688,app_category_a7fd01ec,app_category_a86a3e89,app_category_cef3e649,app_category_d1327cf5,app_category_dc97ec06,app_category_f95efa07,app_category_fc6fa53d
0,1.000009e+18,0,2014-10-21,1005,0,0.211825,0.211825,0.196176,366566,a99f214a,...,False,False,False,False,False,False,False,False,False,False
1,1.000017e+19,0,2014-10-21,1005,0,0.211825,0.211825,0.196176,366566,a99f214a,...,False,False,False,False,False,False,False,False,False,False
2,1.000037e+19,0,2014-10-21,1005,0,0.211825,0.211825,0.196176,366566,a99f214a,...,False,False,False,False,False,False,False,False,False,False
3,1.000064e+19,0,2014-10-21,1005,0,0.211825,0.211825,0.196176,366566,a99f214a,...,False,False,False,False,False,False,False,False,False,False
4,1.000068e+19,0,2014-10-21,1005,1,0.035931,0.035931,0.196176,366566,a99f214a,...,False,False,False,False,False,False,False,False,False,False


In [5]:
df_processed.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'app_id', 'app_domain', 'device_id', 'device_ip', 'device_model',
       'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18',
       'C19', 'C20', 'C21', 'time_of_day', 'site_category_0569f928',
       'site_category_110ab22d', 'site_category_28905ebd',
       'site_category_335d28a8', 'site_category_3e814130',
       'site_category_42a36e14', 'site_category_50e219e0',
       'site_category_5378d028', 'site_category_70fb0e29',
       'site_category_72722551', 'site_category_75fa27f6',
       'site_category_76b2941d', 'site_category_8fd0aea4',
       'site_category_9ccfa2ea', 'site_category_a818d37a',
       'site_category_bcf865d9', 'site_category_c0dd3be3',
       'site_category_dedf689d', 'site_category_e787de0e',
       'site_category_f028772b', 'site_category_f66779e6',
       'app_category_07d7df22', 'app_category_09481d60',
       'app_category_0f2161f8', 'app_category_0f9a328c',
 

In [6]:
ignore_columns = ['id', 'click', 'hour']
object_columns = df_processed.select_dtypes(include=['object']).columns
boolean_columns = df_processed.select_dtypes(include=['bool']).columns
feature_columns = df_processed.columns.difference(ignore_columns).difference(object_columns).difference(boolean_columns)

In [7]:
normalizer = Normalizer()
processed_df = normalizer.fit_transform(df_processed[feature_columns])

In [8]:
X = torch.tensor(processed_df, dtype=torch.float32)
X = torch.cat([X, torch.tensor(df_processed[boolean_columns].values)], dim=1)
y = torch.tensor(df['click'].values, dtype=torch.float32)

# Convolutional Neural Network

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class CNN(nn.Module):
    def __init__(self, num_classes):
        super(CNN, self).__init__()

        self.conv1 = nn.Conv1d(
            in_channels=1,
            out_channels=32,
            kernel_size=3
        )

        self.pool1 = nn.MaxPool1d(kernel_size=2)

        self.conv2 = nn.Conv1d(
            in_channels=32,
            out_channels=64,
            kernel_size=3
        )

        self.pool2 = nn.MaxPool1d(kernel_size=2)

        self.flatten = nn.Flatten()

        self.fc1 = nn.Linear(in_features=832, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)

        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.pool1(self.conv1(x)))
        x = self.relu(self.pool2(self.conv2(x)))

        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

def train(model, criterion, optimizer, X_train, y_train, epochs):
    model.to(device)
    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(X_train)
        loss = criterion(output, y_train)
        loss.backward()
        optimizer.step()
        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item()}')

def predict(model, data):
    model.eval()
    with torch.no_grad():
        output = model(data)
        predicted_clicks = torch.round(torch.sigmoid(output)).int()
    return predicted_clicks

num_classes = 2

model = CNN(num_classes)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert y_train to torch.long data type
y_train = torch.tensor(y_train, dtype=torch.long)

X_train = X_train.to(device)
y_train = y_train.to(device)

train(
    model,
    nn.CrossEntropyLoss(),
    torch.optim.Adam(model.parameters(), lr=0.01),
    X_train.unsqueeze(1),
    y_train,
    epochs=20
)

predicted_clicks = predict(model, X_train.unsqueeze(1))

  y_train = torch.tensor(y_train, dtype=torch.long)
  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/20], Loss: 0.7097817063331604
Epoch [2/20], Loss: 0.7454588413238525
Epoch [3/20], Loss: 0.44786369800567627
Epoch [4/20], Loss: 0.5220491886138916
Epoch [5/20], Loss: 0.5174136161804199
Epoch [6/20], Loss: 0.4738650321960449
Epoch [7/20], Loss: 0.4460206925868988
Epoch [8/20], Loss: 0.46646779775619507
Epoch [9/20], Loss: 0.4787917137145996
Epoch [10/20], Loss: 0.4601932466030121
Epoch [11/20], Loss: 0.44556131958961487
Epoch [12/20], Loss: 0.45154547691345215
Epoch [13/20], Loss: 0.46180257201194763
Epoch [14/20], Loss: 0.46156105399131775
Epoch [15/20], Loss: 0.45236846804618835
Epoch [16/20], Loss: 0.44484180212020874
Epoch [17/20], Loss: 0.4475390613079071
Epoch [18/20], Loss: 0.4537905156612396
Epoch [19/20], Loss: 0.45264220237731934
Epoch [20/20], Loss: 0.44640564918518066
