In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/language-translation-englishfrench/eng_-french.csv


In [14]:
seed=42

In [15]:
data_path = '/kaggle/input/language-translation-englishfrench/eng_-french.csv'


In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader, Subset
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Install nltk if not already installed (e.g., in Colab)
# Uncomment the following lines if running in Colab
# !pip install nltk
nltk.download('punkt')

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Vocabulary class
class Vocabulary:
    def __init__(self):
        self.word2idx = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
        self.idx2word = {0: "<pad>", 1: "<sos>", 2: "<eos>", 3: "<unk>"}
        self.n_words = 4

    def add_sentence(self, sentence):
        for word in sentence.lower().split():
            if word not in self.word2idx:
                self.word2idx[word] = self.n_words
                self.idx2word[self.n_words] = word
                self.n_words += 1

# Dataset class with truncation
class TranslationDataset(Dataset):
    def __init__(self, data, src_vocab, tgt_vocab, max_seq_len=100):
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_seq_len = max_seq_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        src_sentence = self.data.iloc[idx]['en'].lower().split()
        tgt_sentence = self.data.iloc[idx]['fr'].lower().split()
        
        src_tokens = [self.src_vocab.word2idx['<sos>']] + \
                     [self.src_vocab.word2idx.get(word, self.src_vocab.word2idx['<unk>']) for word in src_sentence[:self.max_seq_len-2]] + \
                     [self.src_vocab.word2idx['<eos>']]
        tgt_tokens = [self.tgt_vocab.word2idx['<sos>']] + \
                     [self.tgt_vocab.word2idx.get(word, self.tgt_vocab.word2idx['<unk>']) for word in tgt_sentence[:self.max_seq_len-2]] + \
                     [self.tgt_vocab.word2idx['<eos>']]
        
        return torch.tensor(src_tokens), torch.tensor(tgt_tokens)

# Collate function with padding
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = nn.utils.rnn.pad_sequence(src_batch, padding_value=0)
    tgt_batch = nn.utils.rnn.pad_sequence(tgt_batch, padding_value=0)
    return src_batch, tgt_batch

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)
        self.W_O = nn.Linear(d_model, d_model)
    
    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        query = self.W_Q(query).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        key = self.W_K(key).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        value = self.W_V(value).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        p_attn = F.softmax(scores, dim=-1)
        attended = torch.matmul(p_attn, value)
        attended = attended.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.W_O(attended)
        return output

# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=100):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return x

# Transformer Model
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_heads=8, num_layers=6, d_ff=2048, max_seq_len=100, dropout=0.1):
        super(Transformer, self).__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
    
    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool().to(device)
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask
    
    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.src_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.tgt_embedding(tgt)))
        
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
        
        output = self.fc(dec_output)
        return output

# Greedy decoding function for inference
def greedy_decode(model, src, tgt_vocab, max_len=100):
    model.eval()
    src = src.to(device)
    batch_size = src.size(0)
    with torch.no_grad():
        src_mask, _ = model.generate_mask(src, torch.zeros(batch_size, 1).long().to(device))
        src_embedded = model.positional_encoding(model.src_embedding(src))
        enc_output = src_embedded
        for enc_layer in model.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        
        tgt = torch.tensor([[tgt_vocab.word2idx['<sos>']]] * batch_size, dtype=torch.long).to(device)
        for _ in range(max_len):
            tgt_mask = model.generate_mask(src, tgt)[1]
            tgt_embedded = model.positional_encoding(model.tgt_embedding(tgt))
            dec_output = tgt_embedded
            for dec_layer in model.decoder_layers:
                dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
            output = model.fc(dec_output)
            next_word = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
            tgt = torch.cat([tgt, next_word], dim=1)
            if next_word.item() == tgt_vocab.word2idx['<eos>']:
                break
    translated = [tgt_vocab.idx2word[idx.item()] for idx in tgt[0]]
    return ' '.join(translated[1:])

# BLEU evaluation function with smoothing
def evaluate_bleu(model, test_loader, src_vocab, tgt_vocab):
    model.eval()
    bleu_scores = []
    smoothing = SmoothingFunction().method1  # Use smoothing to handle zero n-gram counts
    with torch.no_grad():
        for src, tgt in test_loader:
            src = src.transpose(0, 1).to(device)
            tgt = tgt.transpose(0, 1).to(device)
            for i in range(src.size(0)):
                src_input = src[i:i+1]
                ref_tokens = tgt[i].tolist()
                ref_text = ' '.join([tgt_vocab.idx2word[t] for t in ref_tokens if t not in [0, tgt_vocab.word2idx['<sos>'], tgt_vocab.word2idx['<eos>']]])
                pred_text = greedy_decode(model, src_input, tgt_vocab)
                ref_words = ref_text.split()
                pred_words = pred_text.split()
                bleu = sentence_bleu([ref_words], pred_words, smoothing_function=smoothing)
                bleu_scores.append(bleu)
                
                # Debug: Print first 5 translations per epoch to check quality
                if i < 5:
                    print(f"Reference: {ref_text}")
                    print(f"Predicted: {pred_text}")
                    print(f"BLEU for this pair: {bleu:.4f}\n")
    return sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0

# Load and prepare data
data = pd.read_csv(data_path)
data = data.sample(n=25000, random_state=42)
data.columns = ['en', 'fr']

src_vocab = Vocabulary()
tgt_vocab = Vocabulary()
for row in data.itertuples():
    src_vocab.add_sentence(row.en)
    tgt_vocab.add_sentence(row.fr)

# Split dataset into train and test
dataset = TranslationDataset(data, src_vocab, tgt_vocab, max_seq_len=100)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = Subset(dataset, range(train_size)), Subset(dataset, range(train_size, len(dataset)))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Model parameters
src_vocab_size = src_vocab.n_words
tgt_vocab_size = tgt_vocab.n_words
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_len = 100
dropout = 0.1

# Initialize model
model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len, dropout).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Training loop
num_epochs = 25
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    num_batches = len(train_loader)
    print(f"\nStarting Epoch {epoch+1}/{num_epochs} with {num_batches} batches")
    
    for batch_idx, (src, tgt) in enumerate(train_loader):
        src = src.transpose(0, 1).to(device)
        tgt = tgt.transpose(0, 1).to(device)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]
        
        optimizer.zero_grad()
        output = model(src, tgt_input)
        loss = criterion(output.reshape(-1, tgt_vocab_size), tgt_output.reshape(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx+1}/{num_batches}, Loss: {loss.item():.4f}")
    
    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch+1}/{num_epochs} Completed, Average Loss: {avg_loss:.4f}")
    



# Evaluate BLEU on test set after each epoch
bleu_score = evaluate_bleu(model, test_loader, src_vocab, tgt_vocab)
print(f"Epoch {epoch+1}/{num_epochs}, BLEU Score: {bleu_score:.4f}")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

Starting Epoch 1/25 with 313 batches
Epoch 1/25, Batch 1/313, Loss: 9.9753
Epoch 1/25, Batch 2/313, Loss: 8.8086
Epoch 1/25, Batch 3/313, Loss: 8.5929
Epoch 1/25, Batch 4/313, Loss: 8.3495
Epoch 1/25, Batch 5/313, Loss: 8.2424
Epoch 1/25, Batch 6/313, Loss: 8.2066
Epoch 1/25, Batch 7/313, Loss: 8.1185
Epoch 1/25, Batch 8/313, Loss: 7.9921
Epoch 1/25, Batch 9/313, Loss: 7.8752
Epoch 1/25, Batch 10/313, Loss: 7.8844
Epoch 1/25, Batch 11/313, Loss: 7.7426
Epoch 1/25, Batch 12/313, Loss: 7.8141
Epoch 1/25, Batch 13/313, Loss: 7.7761
Epoch 1/25, Batch 14/313, Loss: 7.7332
Epoch 1/25, Batch 15/313, Loss: 7.7053
Epoch 1/25, Batch 16/313, Loss: 7.6884
Epoch 1/25, Batch 17/313, Loss: 7.6164
Epoch 1/25, Batch 18/313, Loss: 7.5484
Epoch 1/25, Batch 19/313, Loss: 7.4438
Epoch 1/25, Batch 20/313, Loss: 7.3205
Epoch 1/25, Batch 21/313, Loss: 7.4212
Epoch 1/25, Batch 22/313, Loss: 7.24

In [23]:
def translate_sentence(model, sentence, src_vocab, tgt_vocab, max_len=100):
    model.eval()
    tokens = [src_vocab.word2idx['<sos>']] + \
             [src_vocab.word2idx.get(word, src_vocab.word2idx['<unk>']) for word in sentence.lower().split()] + \
             [src_vocab.word2idx['<eos>']]
    src = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)  # (1, seq_len)
    
    with torch.no_grad():
        tgt = torch.tensor([tgt_vocab.word2idx['<sos>']], dtype=torch.long).unsqueeze(0).to(device)  # (1, 1)
        for _ in range(max_len):
            output = model(src, tgt)  # (1, tgt_seq_len, tgt_vocab_size)
            next_word_idx = output[:, -1, :].argmax(dim=-1).item()
            if next_word_idx == tgt_vocab.word2idx['<eos>']:
                break
            tgt = torch.cat([tgt, torch.tensor([[next_word_idx]], dtype=torch.long).to(device)], dim=1)
    
    translated = [tgt_vocab.idx2word[idx.item()] for idx in tgt[0]]
    return ' '.join(translated[1:])  # Skip <sos>

# Test after training
test_sentence = "how are you doing"
translation = translate_sentence(model, test_sentence, src_vocab, tgt_vocab)
print(f"Input: {test_sentence}")
print(f"Translation: {translation}")

Input: how are you doing
Translation: comment faites-vous ?
