In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import pandas as pd
import numpy as np
import random

Setting Parameters which are used in the model.
These Parameters can be changed.

In [2]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 100
n_embd = 576
n_head = 6
n_layer = 6
vocab_size = 3953
dropout = 0.3
# ------------

Importing Dataset.
This dataset is a modified version of MovieLens-1M dataset.
Dataset contains sequence of movies watched in order of timestamp for all users.

In [3]:
data = pd.read_csv("/kaggle/input/dataset2/ratings1.csv")
data = data.drop("user_id",axis=1)

data

Unnamed: 0,interactions
0,"[3186, 1721, 1022, 1270, 2340, 1836, 3408, 120..."
1,"[1198, 1217, 1210, 2717, 1293, 2943, 1225, 119..."
2,"[593, 2858, 3534, 1968, 1961, 1431, 1266, 1378..."
3,"[1210, 1097, 3468, 3527, 480, 260, 1196, 1198,..."
4,"[2717, 919, 908, 356, 1250, 2188, 2858, 1127, ..."
...,...
6035,"[1721, 2376, 3438, 2428, 1883, 2492, 2827, 268..."
6036,"[1882, 2028, 1267, 702, 3508, 562, 3148, 858, ..."
6037,"[920, 3396, 1210, 2146, 356, 1387, 1079, 1148,..."
6038,"[111, 282, 2067, 930, 1230, 3133, 3022, 947, 3..."


Splitting Data into Train and Test Sets and converting them into numpy arrays because arrays are currently stored as strings in pandas dataframe which cannot be used in model.
All sequences are merged together into one long sequence too.

In [4]:

total_rows = len(data)
split_index = int(0.9 * total_rows)

train_data = data.iloc[:split_index]
test_data = data.iloc[split_index:]

# Optionally, reset the index for both DataFrames
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)
array1 = train_data['interactions'].apply(eval)
array2 = test_data['interactions'].apply(eval)
long1 = np.concatenate(array1)
long2 = np.concatenate(array2)
print(len(long1),len(long2))
train_data = long1
test_data = long2



899469 100740


Get Batch function randomly picks sequences of length block_size and the number of sequences is batch_size.
Each batch is picked from a random position in sequence.

In [5]:
def get_batch(split):
    data = train_data if split == "train" else test_data
    data = torch.tensor(data)
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

This is how 1 sequence looks.
Output sequence is (input+1) at each index.
This creates a target for model to predict since at each position, It shows the next predicted item at each position.
It can be seen as training examples equal to block_size because you can take any sequence at index (0-n) and its target would be at index (n) in output sequence.

In [6]:
x,y = get_batch('train')
x = x[0]
y = y[0]
print(x)
print(y)

tensor([3076, 2747,  539, 3189, 2738, 2407, 3129, 2245, 2779,  367, 1220, 2539,
        3614, 1958, 3052, 2469, 2870, 2671, 3263, 3501, 2109, 3448, 1083, 1409,
        2918, 3755, 3863, 2404, 1270, 1210,    1, 1267,  541, 1233, 1610,  589,
         110,  457, 2028,  480, 1356, 1480,  377, 2916, 2628,  733, 1573,  555,
        1370,  707, 1617, 3471, 1214, 3703,  260, 1196,  750, 1206, 1240, 2529,
        1199, 1374, 1097,   32, 2009, 1584, 1653, 2571,   21, 1372, 2353,  608,
         318, 2858,  593,  213, 1266,  590, 1208,  912, 1734,  598,  257,  563,
        1280, 1446, 3142, 1288, 1220, 1381, 1380, 2877, 1253, 1252,  444,  671,
        2498, 1860,  214,  913, 3334, 2951, 1304, 2922, 1201, 3681, 2943, 3745,
         904,  924,   50, 2762, 3784, 1175,   29, 2997, 3793, 2324, 3948, 2594,
        1580, 3175, 3863, 3555, 3300, 2605,  908, 1196, 2136, 2353, 1961, 1198,
        2763, 3911, 3897, 3948, 2858, 3481, 3160,  110, 2791, 1259, 3555, 2291,
        3624, 1580,  480,   70, 1544, 35

This function calculates loss and accuracy after specific number of epochs for validation.
It takes mean of loss of multiple test examples to calculate average loss.

In [7]:
@torch.no_grad()
def estimate_loss():
    out = {}
    out1 = {}
    model.eval()
    for split in ['train', 'val']:
        accuracies = torch.zeros(eval_iters)
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            loss,acc = model(X, Y)
            losses[k] = loss.item()
            accuracies[k] = acc.item()
        out[split] = losses.mean()
        out1[split] = accuracies.mean()
    model.train()
    return out,out1

This is the model
Model Diagram is given in Repository

In [8]:

class GPT2(nn.Module):
    def __init__(self, vocab_size, n_embd=768, n_head=12, n_layer=12, block_size=512):
        super(GPT2, self).__init__()
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_layer = n_layer
        self.block_size = block_size

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.positional_embedding_table = nn.Embedding(block_size, n_embd)
        self.layers = nn.ModuleList([TransformerBlock(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.ln_head = nn.Linear(n_embd, vocab_size)

    def forward(self, input_ids, targets=None):
        token_embeddings = self.token_embedding_table(input_ids)
        position_ids = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device)
        position_embeddings = self.positional_embedding_table(position_ids)

        hidden_states = token_embeddings + position_embeddings

        for layer in self.layers:
            hidden_states = layer(hidden_states)

        logits = self.ln_head(hidden_states)

        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, self.vocab_size), targets.view(-1))
            acc = self.top_k_accuracy(logits, targets, k=10)
            return loss, acc
        else:
            return logits

    def generate(self, input_ids, max_len=20):
        for _ in range(max_len):
            logits = self.forward(input_ids)
            next_token = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(-1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
        return input_ids

    def top_k_accuracy(self, logits, targets, k=10):
        _, indices = logits.topk(k, dim=-1)
        correct = torch.eq(indices, targets.unsqueeze(-1))
        correct_k = correct.any(dim=-1).float()
        return correct_k.mean()

class TransformerBlock(nn.Module):
    def __init__(self, n_embd, n_head):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadAttention(n_embd, n_head)
        self.feed_forward = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.attention(self.ln1(x))
        x = x + self.feed_forward(self.ln2(x))
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, n_embd, n_head):
        super(MultiHeadAttention, self).__init__()
        self.heads = nn.ModuleList([SingleHeadAttention(n_embd) for _ in range(n_head)])
        self.proj = nn.Linear(n_embd * n_head, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class SingleHeadAttention(nn.Module):
    def __init__(self, n_embd):
        super(SingleHeadAttention, self).__init__()
        self.key = nn.Linear(n_embd, n_embd)
        self.query = nn.Linear(n_embd, n_embd)
        self.value = nn.Linear(n_embd, n_embd)
        self.register_buffer('tril',torch.tril(torch.ones(block_size,block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        key = self.key(x)
        query = self.query(x)
        value = self.value(x)
        attention_weights = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(torch.tensor(x.size(-1)).float())
        attention_weights = attention_weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        attention_weights = F.softmax(attention_weights, dim=-1)
        attention_weights = self.dropout(attention_weights)
        out = torch.matmul(attention_weights, value)
        return out


class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(n_embd, 4 * n_embd)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = self.dropout(x)
        return x




Here we create the model object and train it.
Acc metric is Hit@10 metric which checks whether the targer item is available in the top 10 probabilities predicted by model.

In [None]:


model = GPT2(vocab_size=vocab_size,n_embd=n_embd, n_head=n_head, n_layer=n_layer, block_size=block_size)
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter%100 == 0:
      print(iter)

    if iter % eval_interval == 0:
        losses,acc = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        print(f"step {iter}: train acc {acc['train']:.4f}, val acc {acc['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    loss,acc = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

0
step 0: train loss 8.6790, val loss 8.6808
step 0: train acc 0.0025, val acc 0.0026
100
step 100: train loss 7.3587, val loss 7.4054
step 100: train acc 0.0573, val acc 0.0516
200
step 200: train loss 6.4473, val loss 6.5857
step 200: train acc 0.1391, val acc 0.1189
300
step 300: train loss 6.1018, val loss 6.2869
step 300: train acc 0.1751, val acc 0.1460
400
step 400: train loss 5.8990, val loss 6.1356
step 400: train acc 0.1992, val acc 0.1622
500
step 500: train loss 5.7594, val loss 6.0575
step 500: train acc 0.2166, val acc 0.1699
600
step 600: train loss 5.6561, val loss 5.9679
step 600: train acc 0.2298, val acc 0.1799
700
step 700: train loss 5.5684, val loss 5.9250
step 700: train acc 0.2438, val acc 0.1850
800
step 800: train loss 5.4794, val loss 5.8876
step 800: train acc 0.2581, val acc 0.1914
900
step 900: train loss 5.4180, val loss 5.8636
step 900: train acc 0.2680, val acc 0.1952
1000
step 1000: train loss 5.3675, val loss 5.8295
step 1000: train acc 0.2766, val ac

This is how this model can be used to predict next items after training.

In [None]:
for i in range(5):
    print(i)
    xb, yb = get_batch('train')
    x = xb[0]
    y = yb[0]
    x = x.unsqueeze(0)
    y = y.unsqueeze(0)
    logits = model(x)
    predicted_sequence = torch.argmax(logits, dim=-1)  # Shape: (1, T)
    print(y)
    print(predicted_sequence)