In [1]:
# !python -m spacy download fr_core_news_sm

In [2]:
# !pip install transformers

In [3]:
import pandas as pd
import torch
from torch import nn
import spacy
from collections import Counter
from transformers import AutoTokenizer

from transformers import BertTokenizer
from collections import Counter


# Load dataset from Kaggle
data = pd.read_csv('/kaggle/input/en-fr-translation-dataset/en-fr.csv')
data = data[0:150]

# Tokenization
# nlp_en = spacy.load("en_core_web_sm")
# nlp_fr = spacy.load("fr_core_news_sm")


# Load tokenizers
en_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
fr_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize texts
def tokenize(text, lang):
    tokenizer = en_tokenizer if lang == 'en' else fr_tokenizer
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text.lower()))

# Apply tokenization
en_texts = data['en'].apply(lambda x: tokenize(str(x), 'en')).tolist()
fr_texts = data['fr'].apply(lambda x: tokenize(str(x), 'fr')).tolist()

# Build vocabulary
def build_vocab(texts):
    vocab = Counter([token for text in texts for token in text])
    vocab = {word: i+2 for i, (word, _) in enumerate(vocab.most_common())}
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab

en_vocab = build_vocab(en_texts)
fr_vocab = build_vocab(fr_texts)

INPUT_DIM = len(en_vocab)
OUTPUT_DIM = len(fr_vocab)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [4]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attention = nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attention(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = torch.sum(energy, dim=2)
        return nn.functional.softmax(attention, dim=1)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention = Attention(hidden_dim)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        attention = self.attention(hidden, encoder_outputs)
        encoder_outputs = encoder_outputs.permute(0, 2, 1)
        context = torch.bmm(attention.unsqueeze(1), encoder_outputs).permute(0, 2, 1)
        output = self.fc_out(output.squeeze(1) + context.squeeze(1))
        return output, hidden, cell


In [5]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        output_dim = self.decoder.output_dim
        
        outputs = torch.zeros(batch_size, trg_len, output_dim).to(src.device)
        
        hidden, cell = self.encoder(src)
        input = trg[:, 0]  # Start with the <sos> token

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t] = output
            
            teacher_force = torch.rand(1) < teacher_forcing_ratio
            
            input = trg[:, t] if teacher_force else output.argmax(1)

        return outputs


In [12]:
from torch.utils.data import Dataset, DataLoader


class TranslationDataset(Dataset):
    def __init__(self, src_texts, trg_texts):
        self.src_texts = src_texts
        self.trg_texts = trg_texts

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        return {
            'src': torch.tensor(self.src_texts[idx], dtype=torch.long),
            'trg': torch.tensor(self.trg_texts[idx], dtype=torch.long)
        }

def collate_fn(batch):
    src_batch = [item['src'] for item in batch]
    trg_batch = [item['trg'] for item in batch]

    src_padded = pad_sequence(src_batch, padding_value=0, batch_first=True)
    trg_padded = pad_sequence(trg_batch, padding_value=0, batch_first=True)

    return {'src': src_padded, 'trg': trg_padded}


# Create dataset
dataset = TranslationDataset(en_texts, fr_texts)

# Create data loader
train_iterator = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)


In [13]:
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    
    for batch in iterator:
        src = batch['src'].to(device)
        trg = batch['trg'].to(device)

        optimizer.zero_grad()
        output = model(src, trg)
        
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)


In [15]:
import random
from torch.nn.utils.rnn import pad_sequence

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

INPUT_DIM = len(en_vocab)
OUTPUT_DIM = len(fr_vocab)

# Define the rest of the model parameters
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# Initialize encoder and decoder
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT).to(device)
model = Seq2Seq(encoder, decoder).to(device)

# Initialize optimizer and loss function
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=fr_vocab['<pad>'])

# Train the model for a few epochs
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion)
    print(f'Epoch: {epoch+1}, Train Loss: {train_loss:.3f}')


/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [154,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [154,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [154,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [154,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [154,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [154,0,0], thread: [5,0,0] Assertion `srcIn

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
def translate_sentence(sentence, model, src_field, trg_field, device):
    model.eval()
    
    tokens = [token.lower() for token in tokenize_en(sentence)]
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    
    src_indexes = [src_field.vocab.stoi[t] for t in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for _ in range(100):  # Limit translation length
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        output, hidden, cell = model.decoder(trg_tensor, hidden, cell, encoder_outputs)
        
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break

    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    return ' '.join(trg_tokens[1:-1])  # Exclude <sos> and <eos>


In [None]:
example_sentence = "This is a test sentence."
translated_sentence = translate_sentence(example_sentence, model, EN_TEXT, FR_TEXT, device)
print(translated_sentence)
