In [4]:
import torch
import torch.nn as nn
import math
import torch.optim as optim
import pandas as pd
from collections import defaultdict
from torch.utils.data import DataLoader, Dataset


device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(device)

mps


In [2]:
# import sys
# print(sys.executable)

# !{sys.executable} -m pip install pandas

In [6]:
# NUM_LAYERS = 4
# EMBED_DIM = 128
# NUM_HEADS = 8
# FF_DIM = 512
# SRC_VOCAB_SIZE = 10000
# TGT_VOCAB_SIZE = 10000
# MAX_LEN = 100 

# DROPOUT = 0.1

NUM_LAYERS = 2
EMBED_DIM = 64
NUM_HEADS = 4
FF_DIM = 126
SRC_VOCAB_SIZE = 100
TGT_VOCAB_SIZE = 100
MAX_LEN = 50
DROPOUT = 0.1


In [7]:
# qkv: Projects input to queries (Q), keys (K), and values (V).
# attention_scores: Compute attention weights.
# fc_out: Merges multi-head outputs.

class MultiHeadSelfAttention(nn.Module): #We are doing multi-head self attention implementation, inheriting from nn.Module
    #It calculates relationships between tokens in the sequence.
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == embed_dim
        
        self.qkv = nn.Linear(embed_dim, embed_dim * 3) #Fully connected linear layer that maps the input embeddings to three outputs: queries (Q), keys (K), and values (V)
        self.fc_out = nn.Linear(embed_dim, embed_dim) # a linear layer that projects the final multi-head attention output back to the original embedding dimension
        # After attention computation, we need to merge the contributions of all heads into a single, meaningful representation
    
    def forward(self, x):
        batch_size, seq_length, embed_dim = x.shape #x is the input tensor of shape [batch_size, seq_length, embed_dim]
        qkv = self.qkv(x)  # Shape: [batch, seq_len, embed_dim * 3]
        #the embedding values are typically transformed using a linear layer to create separate components for Query (Q), Key (K), and Value (V)
        qkv = qkv.reshape(batch_size, seq_length, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(dim = 2)

        attention_scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim) # dot product of queries and keys, followed by scaling, scales down the values to avoid excessively large scores due to the dimensionality.
        #Transposing helps align the dimensions to correctly combine attention probabilities with the Value vectors
        attention_probs = attention_scores.softmax(dim = 1) # transforming the scores into probabilities

        out = (attention_probs @ v).transpose(1, 2).reshape(batch_size, seq_length, embed_dim)
        return self.fc_out(out)
        



In [8]:
# embed_dim: Dimension of the input features.
# ff_dim: Dimension of the hidden layer in the feedforward network.
# dropout: Dropout rate, used to prevent overfitting.

class FeedForward(nn.Module): #Applies non-linear transformations to enhance token representations, more accurate and contextually rich

    def __init__(self, embed_dim, ff_dim, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, ff_dim) #temporarily expands the dimensionality of the token embeddings, 
        self.fc2 = nn.Linear(ff_dim, embed_dim) #allows the model to learn more complex patterns for each token before projecting them back to the original dimension
        self.dropout = nn.Dropout(dropout) #randomly zeroing out some activations during training, so the model does not rely on specific features
        self.relu = nn.ReLU() #It introduces non-linearity and helps the model learn complex patterns.
# Processes the attention output.
# Adds non-linearity and depth to the model.

    def forward(self, x): #defines how the input data flows through the layers of the network.
        return self.fc2(self.dropout(self.relu(self.fc1(x))))

In [9]:
class TransformerEncoderLayer(nn.Module): #Combines attention and feed-forward layers with normalization and residual connections.
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadSelfAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.ff = FeedForward(embed_dim, ff_dim, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout) #randomly zero out some activations during training

    def forward(self, x):
        attention_out = self.attention(x)
        x = self.norm1(x + attention_out) #normalizes the output after self-attention
        ff_out = self.ff(x)
        return self.norm2(x + ff_out) #normalizes the output after the feed-forward layer.


In [10]:
# num_layers: The number of layers in the encoder.
# embed_dim: Dimension of the embeddings.
# num_heads: Number of attention heads in multi-head self-attention.
# ff_dim: Dimension of the feed-forward layer.
# vocab_size: Size of the vocabulary (number of unique tokens in the dataset).
# max_len: Maximum sequence length for positional encoding.
# dropout: Dropout rate for regularization, default is 0.1.


class TransformerEncoder(nn.Module): #Encodes the input sequence into rich contextualized representations
    def __init__(self, num_layers, embed_dim, num_heads, ff_dim, vocab_size, max_len, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim) #Turn input token indices to embeddings vectors of embed_dim dimension
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, embed_dim)) #Positional encoding allows the model to learn the position of tokens in the sequence
        self.layers = nn.ModuleList([     #creates multiple instances of the TransformerEncoderLayer with the specified dimensions for attention heads, feed-forward dimensions, and dropout.
            TransformerEncoderLayer(embed_dim, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ])

    def forward(self, x):
        seq_length = x.size(1)
        x = self.embedding(x) + self.positional_encoding[:, :seq_length, :]
        for layer in self.layers: #More layers lead to richer contextual representations, allowing the model to capture complex relationships between tokens.
            x = layer(x)
        return x


In [11]:
class Transformer(nn.Module): #complete transformer that combines the encoder and output projection.
    def __init__(self, num_layers, embed_dim, num_heads, ff_dim, src_vocab_size, tgt_vocab_size, max_len, dropout=0.1):
        super().__init__()
        self.encoder = TransformerEncoder(num_layers, embed_dim, num_heads, ff_dim, src_vocab_size, max_len, dropout)
        self.fc_out = nn.Linear(embed_dim, tgt_vocab_size)

    def forward(self, src):
        encoded = self.encoder(src)
        logits = self.fc_out(encoded)
        return logits


In [12]:
# Load the dataset from the CSV file
csv_file_name = "en-fr 2.csv"  # Replace with your CSV file path
data = pd.read_csv(csv_file_name)

# Extract source (English) and target (French) sentences
src_sentences = data.iloc[:, 0].fillna("").astype(str).tolist()  # English sentences
tgt_sentences = data.iloc[:, 1].fillna("").astype(str).tolist()  # French translations

# Create dynamic vocabularies
src_vocab = defaultdict(lambda: len(src_vocab))  # Source vocabulary
tgt_vocab = defaultdict(lambda: len(tgt_vocab))  # Target vocabulary

src_vocab["<UNK>"] = 1  # Default index for unknown tokens in source vocabulary
tgt_vocab["<UNK>"] = 1 

# Build vocabularies from the dataset
for sentence in src_sentences:
    for token in sentence.split():
        _ = src_vocab[token]  # Add token to source vocab

for sentence in tgt_sentences:
    for token in sentence.split():
        _ = tgt_vocab[token]  # Add token to target vocab

# Convert vocabularies to regular dictionaries
src_vocab = dict(src_vocab)
tgt_vocab = dict(tgt_vocab)

# Define the dataset class
class SimpleDataset(Dataset):
    def __init__(self, src_sentences, tgt_sentences, src_vocab, tgt_vocab, max_len=50):
        self.src_sentences = src_sentences
        self.tgt_sentences = tgt_sentences
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self, index):
        src_sentence = self.src_sentences[index]
        tgt_sentence = self.tgt_sentences[index]

        # Tokenize sentences and pad with zeros to max_len
        src_tokens = [self.src_vocab.get(token, 1) for token in src_sentence.split()]  # Default to 1 for unknown tokens
        tgt_tokens = [self.tgt_vocab.get(token, 1) for token in tgt_sentence.split()]  # Default to 1 for unknown tokens

        # Pad sequences to max_len
        src_tokens = src_tokens + [0] * (self.max_len - len(src_tokens))
        tgt_tokens = tgt_tokens + [0] * (self.max_len - len(tgt_tokens))

        return torch.tensor(src_tokens[:self.max_len]), torch.tensor(tgt_tokens[:self.max_len])

# Create the dataset
dataset = SimpleDataset(src_sentences, tgt_sentences, src_vocab, tgt_vocab)

# Check dataset and vocab sizes
print(f"Source Vocab Size: {len(src_vocab)}")
print(f"Target Vocab Size: {len(tgt_vocab)}")
print(f"Sample from dataset: {dataset[0]}")

Source Vocab Size: 6228625
Target Vocab Size: 6079061
Sample from dataset: (tensor([ 1,  2,  3,  1,  4,  3,  5,  6,  7,  3,  8,  9, 10, 11,  3, 12,  3, 13,
         3, 14,  3, 15,  3, 16,  3, 17, 18,  3, 19,  3, 20, 21, 22, 23, 24, 11,
        25,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), tensor([ 1,  2,  3,  4,  5,  6,  1,  2,  3,  7,  8,  6,  9, 10,  6, 11, 12, 13,
        14, 15, 16,  6, 17,  6, 18,  6, 19,  6, 20,  6, 21,  6, 22, 23,  6, 24,
         6, 25, 26, 27, 14, 28,  0,  0,  0,  0,  0,  0,  0,  0]))


In [13]:
from torch.utils.data import DataLoader

#Training on batches allows the model to process data more efficiently.This helps in parallelizing computations, reducing training time, and managing memory usage.

# Hyperparameters
BATCH_SIZE = 16 #DataLoader will return batches of 2 samples at a time

train_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)




In [14]:
# Subset the dataset to use only the first 100,000 samples per epoch
subset_size = 1000  # Use only the first 100,000 samples per epoch
subset = torch.utils.data.Subset(dataset, list(range(subset_size)))
train_dataloader = DataLoader(subset, batch_size=16, shuffle=True)

# Dynamically adjust vocab sizes
SRC_VOCAB_SIZE = max(SRC_VOCAB_SIZE, max([src.max().item() for src, _ in dataset]) + 1)
TGT_VOCAB_SIZE = max(TGT_VOCAB_SIZE, max([tgt.max().item() for _, tgt in dataset]) + 1)

# Instantiate model
model = Transformer(NUM_LAYERS, EMBED_DIM, NUM_HEADS, FF_DIM, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, MAX_LEN, DROPOUT)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignoring padding tokens (0), computes the cross-entropy between predicted logits and actual target indices
optimizer = optim.Adam(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 1

for epoch in range(num_epochs):
    total_loss = 0
    print("In epoch")
    for src, tgt in train_dataloader:
        optimizer.zero_grad()
        print("Now in forward pass")
        # Forward pass
        output = model(src)  # generates predictions
        loss = criterion(output.view(-1, TGT_VOCAB_SIZE), tgt.view(-1))  # flattened output is then compared with the true tokens to calculate the loss
        print("backward pass")
        # Backward pass
        loss.backward()  # computes the gradients of the model parameters with respect to the loss
        optimizer.step()  # updates the model parameters based on the gradients calculated.

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")



In epoch
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now in forward pass
backward pass
Now i

In [15]:
torch.save(model.state_dict(), "transformer_model.pth")
print("Model saved successfully!")

Model saved successfully!


In [16]:
# Validation loop (separate chunk)
validation_size = 500  # Use the next 500 samples for validation
validation_subset = torch.utils.data.Subset(dataset, list(range(subset_size, subset_size + validation_size)))
validation_dataloader = DataLoader(validation_subset, batch_size=16, shuffle=False)

model.eval()  # Set model to evaluation mode
total_val_loss = 0
with torch.no_grad():  # Disable gradient calculation for validation
    for src, tgt in validation_dataloader:
        output = model(src)  # generates predictions
        val_loss = criterion(output.view(-1, TGT_VOCAB_SIZE), tgt.view(-1))  # calculate validation loss
        total_val_loss += val_loss.item()

avg_val_loss = total_val_loss / len(validation_dataloader)
print(f"Validation Loss: {avg_val_loss:.4f}")


Validation Loss: 15.6962
