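# Introduction

This notebook trains a small feed-forward neural network in PyTorch to predict ad clicks (the binary `click` column) from a 100,000-row sample stored in `avazu.csv`.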

# Import libraries and datasets

In [52]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn 
import torch.nn.functional as function
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt
from torchmetrics import Accuracy
from sklearn.model_selection import train_test_split

In [53]:
# Read the click-through sample into a DataFrame, then report its (rows, columns).
csv_path = 'avazu.csv'
ctr_data = pd.read_csv(csv_path)
print(ctr_data.shape)

(100000, 16)


In [54]:
# List the column labels of the freshly loaded frame.
print(ctr_data.columns)

Index(['Unnamed: 0', 'click', 'hour', 'C1', 'banner_pos', 'device_type',
       'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20',
       'C21', 'device_model_int'],
      dtype='object')


In [55]:
# Drop column 'Unnamed: 0' (presumably a row index that was saved into the CSV — it carries no signal).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
ctr_data = ctr_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [56]:
# Count missing (null / NaN) values per column.
print(ctr_data.isnull().sum())

click               0
hour                0
C1                  0
banner_pos          0
device_type         0
device_conn_type    0
C14                 0
C15                 0
C16                 0
C17                 0
C18                 0
C19                 0
C20                 0
C21                 0
device_model_int    0
dtype: int64


In [57]:
# Preview the first five rows.
print(ctr_data.head(5))

   click      hour    C1  banner_pos  device_type  device_conn_type    C14  \
0      0  14102100  1005           0            1                 2  15706   
1      0  14102100  1005           0            1                 0  15704   
2      0  14102100  1005           0            1                 0  15704   
3      0  14102100  1005           0            1                 0  15706   
4      0  14102100  1005           1            1                 0  18993   

   C15  C16   C17  C18  C19     C20  C21     device_model_int  
0  320   50  1722    0   35      -1   79 -4536565594672005814  
1  320   50  1722    0   35  100084   79   -80052322344914806  
2  320   50  1722    0   35  100084   79 -3130634972019121531  
3  320   50  1722    0   35  100084   79 -8587292268327570678  
4  320   50  2161    0   35      -1  157 -7699311560514132401  


In [58]:
# Column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that only appends a stray "None" line).
ctr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   click             100000 non-null  int64
 1   hour              100000 non-null  int64
 2   C1                100000 non-null  int64
 3   banner_pos        100000 non-null  int64
 4   device_type       100000 non-null  int64
 5   device_conn_type  100000 non-null  int64
 6   C14               100000 non-null  int64
 7   C15               100000 non-null  int64
 8   C16               100000 non-null  int64
 9   C17               100000 non-null  int64
 10  C18               100000 non-null  int64
 11  C19               100000 non-null  int64
 12  C20               100000 non-null  int64
 13  C21               100000 non-null  int64
 14  device_model_int  100000 non-null  int64
dtypes: int64(15)
memory usage: 11.4 MB
None


In [59]:
# Cardinality of every column (number of distinct values, counted column-wise).
n_unique_values = ctr_data.nunique(axis=0)
print(n_unique_values)

click                  2
hour                   1
C1                     6
banner_pos             5
device_type            4
device_conn_type       4
C14                  420
C15                    5
C16                    6
C17                  128
C18                    4
C19                   37
C20                  137
C21                   29
device_model_int    2473
dtype: int64


In [60]:
# Class balance of the target column: number of rows per `click` value.
print(ctr_data['click'].value_counts())

click
0    82510
1    17490
Name: count, dtype: int64


# Split, scale, and load data

In [61]:
# Separate the binary label from the predictor columns.
target = ctr_data['click']
features = ctr_data.drop(columns='click')

In [62]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# click rate the same in both partitions; random_state pins the split.
split_parts = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Shapes of (features, labels) for the train partition, then the test partition.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(split_X.shape, split_y.shape)

(80000, 14) (80000,)
(20000, 14) (20000,)


In [63]:
#2. Standardise the features
# The scaler statistics are learned from the training split only, then applied to it;
# the same fitted scaler is reused (transform only) for the test split later on.
scaler = StandardScaler()
scaled_X_train = scaler.fit(X_train).transform(X_train)

In [64]:
#3. Wrap the training split in a PyTorch TensorDataset

# Features become a float32 matrix; labels become a float32 column of shape (n, 1).
train_inputs = torch.tensor(scaled_X_train, dtype=torch.float32)
train_labels = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
dataset = TensorDataset(train_inputs, train_labels)

# Peek at the first (features, label) pair.
input_sample, label_sample = dataset[0]
print('input sample:', input_sample)
print('label sample:', label_sample)

input sample: tensor([ 0.0000, -0.0324, -0.4937, -0.0962, -0.3146,  0.8222, -1.5181,  5.2832,
         0.9291,  0.9894, -0.3796,  1.2805, -1.4432,  0.8140])
label sample: tensor([1.])


In [71]:
#4. DataLoader for the training split
batch_size = 32
shuffle = True  # reshuffle the training samples at the start of every epoch

# Mini-batch iterator consumed by the training loop.
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [72]:
#5. Scale the test split and wrap it in a DataLoader

# Reuse the statistics learned on the training split: transform only, never
# re-fit on test data (that would leak test information into preprocessing).
scaled_test_features = scaler.transform(X_test)

# Same tensor layout as the training dataset: float32 features, (n, 1) float32 labels.
test_dataset = TensorDataset(torch.tensor(scaled_test_features, dtype=torch.float32), torch.tensor(y_test.values, dtype=torch.float32).reshape(-1,1))

# Evaluation batch size.
batch_size = 32

# Sample order does not matter for evaluation, so the loader is not shuffled;
# this keeps every evaluation pass deterministic.
val_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the Neural Network Model

In [73]:
# Create binary classification model
# Seed torch's global RNG first so that weight initialisation (and the DataLoader
# shuffling that follows) is repeatable under Restart Kernel -> Run All.
torch.manual_seed(42)

# Input width is taken from the training frame rather than hard-coded.
num_features = X_train.shape[1]

# Two hidden ReLU layers (32 and 16 units); the final Sigmoid turns the single
# output into a probability.
model = nn.Sequential(
    nn.Linear(num_features, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 1),
    nn.Sigmoid()
)

In [74]:
# Show the layer stack to confirm the input/output sizes.
print(model)

Sequential(
  (0): Linear(in_features=14, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=16, bias=True)
  (3): ReLU()
  (4): Linear(in_features=16, out_features=1, bias=True)
  (5): Sigmoid()
)


In [75]:
# Optimizer and loss
# Plain SGD with L2 weight decay; binary cross-entropy on probabilities
# (the model already ends in a Sigmoid, so BCELoss receives values in [0, 1]).
optimizer = optim.SGD(params=model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.BCELoss()


# Train the model

In [76]:
#1. Train the model
num_epochs = 10  # Number of epochs

for epoch in range(num_epochs):
    model.train()  # Training mode
    training_loss = 0.0  # sum of per-sample losses over the epoch
    samples_seen = 0

    # The batch variables are deliberately NOT named `target`: that name holds the
    # pandas label Series used by the train/test split cell, and overwriting it
    # with a tensor would break any later re-run of that cell.
    for batch_features, batch_labels in data_loader:
        # Zero gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        pred = model(batch_features)

        # Compute loss (mean over the batch)
        loss = criterion(pred, batch_labels)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        # Accumulate loss weighted by batch size, so a smaller final batch
        # does not get the same weight as a full one
        n_batch = batch_labels.size(0)
        training_loss += loss.item() * n_batch
        samples_seen += n_batch

    # Per-sample average loss for the epoch
    epoch_loss = training_loss / samples_seen
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

Epoch 1/10, Loss: 0.5584
Epoch 2/10, Loss: 0.4767
Epoch 3/10, Loss: 0.4621
Epoch 4/10, Loss: 0.4551
Epoch 5/10, Loss: 0.4503
Epoch 6/10, Loss: 0.4469
Epoch 7/10, Loss: 0.4443
Epoch 8/10, Loss: 0.4424
Epoch 9/10, Loss: 0.4409
Epoch 10/10, Loss: 0.4398


# Evaluate the model

In [77]:
#2. Calculating validation loss

validation_loss = 0.0  # sum of per-sample losses over the validation set
validation_samples = 0
model.eval() #Put model to evaluation mode

# No gradients are needed for evaluation
with torch.no_grad():
    for inputs, labels in val_loader:
        #Run the forward pass
        outputs = model(inputs)
        #Calculate the loss (mean over the batch)
        loss = criterion(outputs, labels)
        # Weight by batch size so a smaller final batch is not over-counted
        n_batch = labels.size(0)
        validation_loss += loss.item() * n_batch
        validation_samples += n_batch

# Per-sample average; kept in `epoch_loss` because the next cell prints that name
epoch_loss = validation_loss / validation_samples
# Switch back to training mode
model.train()

Sequential(
  (0): Linear(in_features=14, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=16, bias=True)
  (3): ReLU()
  (4): Linear(in_features=16, out_features=1, bias=True)
  (5): Sigmoid()
)

In [78]:
# Report the average validation loss stored in `epoch_loss` by the previous cell.
print(f"Validation Loss: {epoch_loss:.4f}")

Validation Loss: 0.4386


# Evaluation metrics

In [80]:
# Validation accuracy with torchmetrics
# (`Accuracy` is already imported in the imports cell at the top of the notebook.)
#
# NOTE(review): about 82.5% of the rows are non-clicks, so a model that always
# predicts 0 already reaches ~0.825 accuracy. Compare the result against that
# baseline and consider precision/recall or AUROC before drawing conclusions.

acc = Accuracy(task='binary')

model.eval()

with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        predictions = (outputs > 0.5).float()  # Convert probabilities to binary predictions
        # Accumulate batch statistics; the metric is finalised by compute() below
        acc.update(predictions, labels.int())

accuracy = acc.compute()
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.8251
