In [69]:
import torch
import torch.nn as nn
import random
import numpy as np
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

In [70]:
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

In [71]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [72]:
def read_language_pairs(filepath, max_samples=10000):
    """Read language pairs from file with tab separation"""
    pairs = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if '\t' in line:
                parts = line.strip().split('\t')
                if len(parts) >= 2:
                    source, target = parts[0], parts[1]
                    pairs.append((source.lower(), target.lower()))

    # Shuffle and limit sample size
    random.shuffle(pairs)
    return pairs[:max_samples]

In [73]:
def create_dictionary(sentences):
    """Create word-to-index dictionary from sentences"""
    word_dict = {'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3}

    # Count all words
    word_counts = Counter()
    for sentence in sentences:
        words = sentence.split()
        word_counts.update(words)

    # Add words to dictionary
    for word in word_counts:
        if word not in word_dict:
            word_dict[word] = len(word_dict)

    return word_dict

In [74]:
def words_to_indices(sentence, word_dict):
    """Convert words in a sentence to indices using dictionary"""
    return [word_dict.get(word, word_dict['<unk>']) for word in sentence.split()]

In [75]:
def prepare_training_data(pairs, source_dict, target_dict):
    """Convert all sentence pairs to tensor format"""
    source_tensors = []
    target_tensors = []

    for source_sent, target_sent in pairs:
        # Convert source sentence
        source_indices = words_to_indices(source_sent, source_dict)
        source_tensor = torch.tensor(source_indices, dtype=torch.long)

        # Convert target sentence with start/end tokens
        target_indices = [target_dict['<start>']]
        target_indices.extend(words_to_indices(target_sent, target_dict))
        target_indices.append(target_dict['<end>'])
        target_tensor = torch.tensor(target_indices, dtype=torch.long)

        source_tensors.append(source_tensor)
        target_tensors.append(target_tensor)

    return source_tensors, target_tensors

In [76]:
def create_batch(source_list, target_list, source_pad_idx, target_pad_idx, batch_size=64):
    """Create a batch of padded sequences"""
    # Get batch indices
    indices = list(range(len(source_list)))
    random.shuffle(indices)
    batch_indices = indices[:batch_size]

    # Get sequences for this batch
    source_batch = [source_list[i] for i in batch_indices]
    target_batch = [target_list[i] for i in batch_indices]

    # Pad sequences
    padded_sources = pad_sequence(source_batch, batch_first=True, padding_value=source_pad_idx)
    padded_targets = pad_sequence(target_batch, batch_first=True, padding_value=target_pad_idx)

    return padded_sources.to(device), padded_targets.to(device)


In [77]:
class LSTMEncoder(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_dim):
        super(LSTMEncoder, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

    def forward(self, source):
        # source shape: [batch_size, seq_len]
        embedded = self.embedding(source)  # [batch_size, seq_len, embedding_dim]

        # Run through LSTM
        _, (hidden, cell) = self.lstm(embedded)

        # Return final states
        return hidden, cell


In [78]:
class LSTMDecoder(nn.Module):
    def __init__(self, output_size, embedding_dim, hidden_dim):
        super(LSTMDecoder, self).__init__()
        self.embedding = nn.Embedding(output_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, output_size)

    def forward(self, target, hidden, cell):
        # target shape: [batch_size]
        embedded = self.embedding(target.unsqueeze(1))  # [batch_size, 1, embedding_dim]

        # Run through LSTM
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))

        # Project to vocabulary size
        prediction = self.output_layer(output.squeeze(1))  # [batch_size, output_size]

        return prediction, hidden, cell

In [79]:
class SimpleTranslator(nn.Module):
    def __init__(self, encoder, decoder):
        super(SimpleTranslator, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        # source: [batch_size, source_len]
        # target: [batch_size, target_len]

        batch_size = target.shape[0]
        target_len = target.shape[1]
        target_vocab_size = self.decoder.output_layer.out_features

        # Tensor to store decoder outputs
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(device)

        # Encode the source sequence
        hidden, cell = self.encoder(source)

        # First input to the decoder is the <start> token
        input = target[:, 0]

        # Decode one step at a time
        for t in range(1, target_len):
            # Pass through decoder
            output, hidden, cell = self.decoder(input, hidden, cell)

            # Store prediction
            outputs[:, t] = output

            # Teacher forcing decision
            teacher_force = random.random() < teacher_forcing_ratio

            # Get next input (either from ground truth or prediction)
            input = target[:, t] if teacher_force else output.argmax(1)

        return outputs


In [80]:
def train_translator(model, train_source, train_target, val_source=None, val_target=None,
                    epochs=5, batch_size=64, teacher_forcing=0.5):
    """Train the translator model"""
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(ignore_index=target_dict['<pad>'])

    print("Starting training...")
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        batch_count = 0

        # Train on batches
        for i in range(0, len(train_source), batch_size):
            # Create batch
            if i + batch_size <= len(train_source):
                source_batch = train_source[i:i+batch_size]
                target_batch = train_target[i:i+batch_size]

                src, tgt = create_batch(source_batch, target_batch,
                                       source_dict['<pad>'], target_dict['<pad>'],
                                       batch_size=len(source_batch))

                # Forward pass
                output = model(src, tgt, teacher_forcing_ratio=teacher_forcing)

                # Calculate loss (ignoring padding and first token which is <start>)
                output_flat = output[:, 1:].reshape(-1, output.shape[-1])
                target_flat = tgt[:, 1:].reshape(-1)
                loss = criterion(output_flat, target_flat)

                # Backward pass
                optimizer.zero_grad()
                loss.backward()

                # Prevent exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                # Update weights
                optimizer.step()

                epoch_loss += loss.item()
                batch_count += 1

        # Validation
        if val_source and val_target:
            model.eval()
            with torch.no_grad():
                # Sample validation data
                val_indices = random.sample(range(len(val_source)), min(batch_size, len(val_source)))
                val_src = [val_source[i] for i in val_indices]
                val_tgt = [val_target[i] for i in val_indices]

                src, tgt = create_batch(val_src, val_tgt,
                                       source_dict['<pad>'], target_dict['<pad>'],
                                       batch_size=len(val_src))

                output = model(src, tgt, teacher_forcing_ratio=0)
                val_loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]),
                                    tgt[:, 1:].reshape(-1))

            print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/batch_count:.4f}, Val Loss: {val_loss:.4f}")
        else:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/batch_count:.4f}")


In [81]:
def translate_text(model, sentence, max_length=50):
    """Translate a sentence using the trained model"""
    model.eval()

    # Tokenize the sentence
    tokens = words_to_indices(sentence.lower(), source_dict)
    token_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)

    # Encode
    hidden, cell = model.encoder(token_tensor)

    # Start with <start> token
    input_token = torch.tensor([target_dict['<start>']]).to(device)

    translated_tokens = []

    # Generate translation
    for _ in range(max_length):
        with torch.no_grad():
            output, hidden, cell = model.decoder(input_token, hidden, cell)

        # Get predicted token
        predicted_token = output.argmax(1).item()

        # Stop if end token
        if predicted_token == target_dict['<end>']:
            break

        translated_tokens.append(predicted_token)

        # Next input is predicted token
        input_token = torch.tensor([predicted_token], device=device)

    # Convert indices back to words
    translated_words = [index_to_target.get(idx, '<unk>') for idx in translated_tokens]

    return ' '.join(translated_words)

In [82]:
def evaluate_bleu(model, test_source, test_target, num_examples=100):
    """Calculate BLEU score on test data"""
    model.eval()
    references = []
    hypotheses = []

    # Limit number of examples
    samples = min(num_examples, len(test_source))

    for i in range(samples):
        # Get source sentence
        source_words = [index_to_source[idx.item()] for idx in test_source[i]]
        source_sentence = ' '.join(source_words)

        # Get reference translation (remove <start> and <end>)
        reference = [index_to_target[idx.item()] for idx in test_target[i]]
        if reference[0] == '<start>':
            reference = reference[1:]
        if '<end>' in reference:
            reference = reference[:reference.index('<end>')]

        # Get model translation
        translation = translate_text(model, source_sentence).split()

        references.append([reference])
        hypotheses.append(translation)

    # Calculate BLEU score with smoothing
    smooth = SmoothingFunction().method1
    bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smooth)

    return bleu_score


In [83]:
data_path = "/content/dataset.txt"
language_pairs = read_language_pairs(data_path, max_samples=10000)

# Split source and target languages
source_sentences = [pair[0] for pair in language_pairs]
target_sentences = [pair[1] for pair in language_pairs]

# Create dictionaries
source_dict = create_dictionary(source_sentences)
target_dict = create_dictionary(target_sentences)

# Create reverse mappings
index_to_source = {idx: word for word, idx in source_dict.items()}
index_to_target = {idx: word for word, idx in target_dict.items()}

# Prepare data
source_data, target_data = prepare_training_data(language_pairs, source_dict, target_dict)

# Split data
source_train, source_test, target_train, target_test = train_test_split(
    source_data, target_data, test_size=0.1, random_state=SEED)

source_train, source_val, target_train, target_val = train_test_split(
    source_train, target_train, test_size=0.1, random_state=SEED)

# Create model
EMBEDDING_DIM = 256
HIDDEN_DIM = 512

# Initialize model components
encoder = LSTMEncoder(len(source_dict), EMBEDDING_DIM, HIDDEN_DIM).to(device)
decoder = LSTMDecoder(len(target_dict), EMBEDDING_DIM, HIDDEN_DIM).to(device)
model = SimpleTranslator(encoder, decoder).to(device)

# Train the model
train_translator(
    model,
    source_train, target_train,
    val_source=source_val, val_target=target_val,
    epochs=200,
    batch_size=64,
    teacher_forcing=0.5
)

# Test translation examples
test_sentences = [
    "I am happy.",
    "How are you?",
    "Where is the restaurant?",
    "Thank you very much."
]

print("\nTranslation examples:")
for sentence in test_sentences:
    translation = translate_text(model, sentence)
    print(f"English: {sentence}")
    print(f"Spanish: {translation}")
    print()

# Calculate BLEU score
bleu = evaluate_bleu(model, source_test, target_test, num_examples=100)
print(f"BLEU Score: {bleu:.4f}")

Starting training...
Epoch 1/200, Loss: 6.5574, Val Loss: 6.2627
Epoch 2/200, Loss: 5.8222, Val Loss: 6.1216
Epoch 3/200, Loss: 5.4977, Val Loss: 6.4002
Epoch 4/200, Loss: 5.1625, Val Loss: 5.9553
Epoch 5/200, Loss: 4.8117, Val Loss: 6.0286
Epoch 6/200, Loss: 4.5061, Val Loss: 6.1359
Epoch 7/200, Loss: 4.1452, Val Loss: 6.0226
Epoch 8/200, Loss: 3.8482, Val Loss: 5.7406
Epoch 9/200, Loss: 3.5530, Val Loss: 5.7089
Epoch 10/200, Loss: 3.2282, Val Loss: 6.0160
Epoch 11/200, Loss: 2.9906, Val Loss: 6.4006
Epoch 12/200, Loss: 2.6721, Val Loss: 6.4148
Epoch 13/200, Loss: 2.4595, Val Loss: 6.1318
Epoch 14/200, Loss: 2.1660, Val Loss: 6.7837
Epoch 15/200, Loss: 1.8865, Val Loss: 6.3859
Epoch 16/200, Loss: 1.6532, Val Loss: 6.1936
Epoch 17/200, Loss: 1.4011, Val Loss: 6.8556
Epoch 18/200, Loss: 1.1607, Val Loss: 6.7325
Epoch 19/200, Loss: 1.0084, Val Loss: 6.7755
Epoch 20/200, Loss: 0.8142, Val Loss: 6.6876
Epoch 21/200, Loss: 0.6848, Val Loss: 6.8457
Epoch 22/200, Loss: 0.5632, Val Loss: 6.678

In [84]:
data_path = "/content/dataset.txt"
language_pairs = read_language_pairs(data_path, max_samples=10000)

# Split source and target languages
source_sentences = [pair[0] for pair in language_pairs]
target_sentences = [pair[1] for pair in language_pairs]

# Create dictionaries
source_dict = create_dictionary(source_sentences)
target_dict = create_dictionary(target_sentences)

# Create reverse mappings
index_to_source = {idx: word for word, idx in source_dict.items()}
index_to_target = {idx: word for word, idx in target_dict.items()}

# Prepare data
source_data, target_data = prepare_training_data(language_pairs, source_dict, target_dict)

# Split data
source_train, source_test, target_train, target_test = train_test_split(
    source_data, target_data, test_size=0.1, random_state=SEED)

source_train, source_val, target_train, target_val = train_test_split(
    source_train, target_train, test_size=0.1, random_state=SEED)

# Create model
EMBEDDING_DIM = 256
HIDDEN_DIM = 512

# Initialize model components
encoder = LSTMEncoder(len(source_dict), EMBEDDING_DIM, HIDDEN_DIM).to(device)
decoder = LSTMDecoder(len(target_dict), EMBEDDING_DIM, HIDDEN_DIM).to(device)
model = SimpleTranslator(encoder, decoder).to(device)

# Train the model
train_translator(
    model,
    source_train, target_train,
    val_source=source_val, val_target=target_val,
    epochs=10,
    batch_size=64,
    teacher_forcing=0.5
)

# Test translation examples
test_sentences = [
    "I am happy.",
    "How are you?",
    "Where is the restaurant?",
    "Thank you very much."
]

print("\nTranslation examples:")
for sentence in test_sentences:
    translation = translate_text(model, sentence)
    print(f"English: {sentence}")
    print(f"Spanish: {translation}")
    print()

# Calculate BLEU score
bleu = evaluate_bleu(model, source_test, target_test, num_examples=100)
print(f"BLEU Score: {bleu:.4f}")

Starting training...
Epoch 1/10, Loss: 6.5188, Val Loss: 6.2810
Epoch 2/10, Loss: 5.7962, Val Loss: 6.2890
Epoch 3/10, Loss: 5.4698, Val Loss: 6.3293
Epoch 4/10, Loss: 5.1227, Val Loss: 6.2469
Epoch 5/10, Loss: 4.7747, Val Loss: 5.8073
Epoch 6/10, Loss: 4.4100, Val Loss: 5.8662
Epoch 7/10, Loss: 4.0957, Val Loss: 6.2728
Epoch 8/10, Loss: 3.7999, Val Loss: 6.1352
Epoch 9/10, Loss: 3.4439, Val Loss: 6.2343
Epoch 10/10, Loss: 3.1067, Val Loss: 5.8566

Translation examples:
English: I am happy.
Spanish: estoy muy

English: How are you?
Spanish: ¿qué son

English: Where is the restaurant?
Spanish: ¿dónde está el padre?

English: Thank you very much.
Spanish: te

BLEU Score: 0.0458
