In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import nltk
import random
import tqdm
import pickle

nltk.download('punkt')

# Encoder model
class Encoder_Model(nn.Module):
    def __init__(self, d_model, num_heads, devices, batch_size):
        super(Encoder_Model, self).__init__()

        self.batch_size = batch_size
        self.devices = devices
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = int(self.d_model / num_heads)

        # Multihead Attention
        self.query = nn.Linear(self.d_model, self.d_model)
        self.key = nn.Linear(self.d_model, self.d_model)
        self.value = nn.Linear(self.d_model, self.d_model)
        self.concat_scaled_dot_product = nn.Linear(self.d_model, self.d_model)

        # Feed Forward
        self.feed_forward1 = nn.Linear(self.d_model, 2048)
        self.feed_forward2 = nn.Linear(2048, self.d_model)

    # Multihead Attention
    def Multihead_Attention(self, data):
        batch_size, seq_len, _ = data.shape
        query = self.query(data).reshape(batch_size, seq_len, self.num_heads, self.d_k).permute(0, 2, 1, 3)
        key = self.key(data).reshape(batch_size, seq_len, self.num_heads, self.d_k).permute(0, 2, 1, 3)
        value = self.value(data).reshape(batch_size, seq_len, self.num_heads, self.d_k).permute(0, 2, 1, 3)

        dot_product = torch.matmul(query, torch.transpose(key, -2, -1)) / ((self.d_k) ** 0.5)
        scaled_dot = torch.nn.functional.softmax(dot_product, dim=-1)

        scaled_dot = torch.matmul(scaled_dot, value)
        scaled_dot = scaled_dot.permute(0, 2, 1, 3)

        concat_scaled_dot_product = scaled_dot.reshape(batch_size, seq_len, -1)
        concat_scaled_dot_product = self.concat_scaled_dot_product(concat_scaled_dot_product)

        return concat_scaled_dot_product

    # Feed Forward
    def Feed_Forward(self, data):
        data = self.feed_forward1(data)
        data = torch.nn.functional.relu(data)
        data = self.feed_forward2(data)

        return data

    def forward(self, data):
        mhe_data = self.Multihead_Attention(data=data)
        norm_data = nn.functional.layer_norm((mhe_data + data), normalized_shape=mhe_data.shape)

        feed_forward = self.Feed_Forward(data=norm_data)
        data = nn.functional.layer_norm((norm_data + feed_forward), normalized_shape=feed_forward.shape)

        return data

# Decoder model
class Decoder_Model(nn.Module):
    def __init__(self, devices, d_model, num_heads, batch_size, masking_value=-1e8):
        super(Decoder_Model, self).__init__()

        self.batch_size = batch_size
        self.masking_value = masking_value
        self.devices = devices
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = int(self.d_model / num_heads)

        # Masked Multi Head Attention
        self.query_m = nn.Linear(self.d_model, d_model)
        self.key_m = nn.Linear(self.d_model, self.d_model)
        self.value_m = nn.Linear(self.d_model, self.d_model)
        self.concat_scaled_dot_product_m = nn.Linear(self.d_model, self.d_model)

        # Multihead Attention
        self.query = nn.Linear(self.d_model, self.d_model)
        self.key = nn.Linear(self.d_model, self.d_model)
        self.value = nn.Linear(self.d_model, self.d_model)
        self.concat_scaled_dot_product = nn.Linear(self.d_model, self.d_model)

        # Feed Forward
        self.feed_forward1 = nn.Linear(self.d_model, 2048)
        self.feed_forward2 = nn.Linear(2048, self.d_model)

    # Masked Multi Head Attention
    def Masked_Multihead_Attention(self, data):
        batch_size, seq_len, _ = data.shape
        query = self.query_m(data).reshape(batch_size, seq_len, self.num_heads, self.d_k).permute(0, 2, 1, 3)
        key = self.key_m(data).reshape(batch_size, seq_len, self.num_heads, self.d_k).permute(0, 2, 1, 3)
        value = self.value_m(data).reshape(batch_size, seq_len, self.num_heads, self.d_k).permute(0, 2, 1, 3)

        dot_product = torch.matmul(query, torch.transpose(key, -1, -2)) / (self.d_k ** 0.5)

        mask = torch.triu(torch.ones_like(dot_product), diagonal=1).to(self.devices)
        mask_data = self.masking_value * mask

        masked_product = mask_data + dot_product
        scaled_dot = torch.nn.functional.softmax(masked_product, dim=-1)
        scaled_dot = torch.matmul(scaled_dot, value)

        scaled_dot = scaled_dot.permute(0, 2, 1, 3)
        concat_scaled_dot_product = scaled_dot.reshape(batch_size, seq_len, -1)
        concat_scaled_dot_product = self.concat_scaled_dot_product_m(concat_scaled_dot_product)

        return concat_scaled_dot_product

    # Multi Head Attention
    def Multihead_Attention(self, data, encoder_out):
        batch_size, seq_len, _ = data.shape
        _, encoder_seq_len, _ = encoder_out.shape
        query = self.query(data).reshape(batch_size, seq_len, self.num_heads, self.d_k).permute(0, 2, 1, 3)
        key = self.key(encoder_out).reshape(batch_size, encoder_seq_len, self.num_heads, self.d_k).permute(0, 2, 1, 3)
        value = self.value(encoder_out).reshape(batch_size, encoder_seq_len, self.num_heads, self.d_k).permute(0, 2, 1, 3)

        dot_product = torch.matmul(query, torch.transpose(key, -1, -2)) / ((self.d_k) ** 0.5)
        scaled_dot = torch.nn.functional.softmax(dot_product, dim=-1)
        scaled_dot = torch.matmul(scaled_dot, value)

        scaled_dot = scaled_dot.permute(0, 2, 1, 3)
        concat_scaled_dot_product = scaled_dot.reshape(batch_size, seq_len, -1)
        concat_scaled_dot_product = self.concat_scaled_dot_product(concat_scaled_dot_product)

        return concat_scaled_dot_product

    # Feed Forward
    def Feed_Forward(self, data):
        data = self.feed_forward1(data)
        data = torch.nn.functional.relu(data)
        data = self.feed_forward2(data)

        return data

    def forward(self, data):
        encoder_out, data_dec = data

        mmhe_data = self.Masked_Multihead_Attention(data=data_dec)
        norm_mmhe = nn.functional.layer_norm((mmhe_data + data_dec), normalized_shape=mmhe_data.shape)

        mhe_data = self.Multihead_Attention(data=norm_mmhe, encoder_out=encoder_out)
        norm_mhe = nn.functional.layer_norm((mhe_data + norm_mmhe), normalized_shape=mhe_data.shape)

        feed_forward = self.Feed_Forward(data=norm_mhe)
        data_dec = nn.functional.layer_norm((norm_mhe + feed_forward), normalized_shape=feed_forward.shape)

        return data_dec

# Embedding model
class Embedding_Model(nn.Module):
    def __init__(self, vocab_size, d_model, pad_idx, devices, max_seq_len):
        super(Embedding_Model, self).__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len
        self.devices = devices

        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model, padding_idx=pad_idx)

    def Positional_Encoding(self, seq_len):
        position = torch.arange(0, seq_len).reshape((seq_len, 1)).to(self.devices)
        even_i = torch.arange(0, self.d_model, 2).to(self.devices)
        odd_i = torch.arange(1, self.d_model, 2).to(self.devices)

        pow_even = torch.pow(10000, -even_i / self.d_model)
        pow_odd = torch.pow(10000, -odd_i / self.d_model)

        PE_even = torch.sin(position * pow_even)
        PE_odd = torch.cos(position * pow_odd)

        PE = torch.zeros((seq_len, self.d_model), device=self.devices)
        PE[:, even_i] = PE_even
        PE[:, odd_i] = PE_odd

        return PE.unsqueeze(0)  # Adding batch dimension

    def forward(self, data):
        embedded_data = self.embedding(data)
        seq_len = embedded_data.size(1)
        PE = self.Positional_Encoding(seq_len)

        return embedded_data + PE

# Dataset
class Translate_Dataset(Dataset):
    def __init__(self, dataset, word2index_source, word2index_target, max_seq_len, devices):
        self.dataset = dataset
        self.word2index_source = word2index_source
        self.word2index_target = word2index_target
        self.max_seq_len = max_seq_len
        self.devices = devices

    def __len__(self):
        return len(self.dataset)

    def tokenize_and_pad(self, sentence, word2index):
        tokens = nltk.word_tokenize(sentence.lower())
        indices = [word2index.get(word, word2index['<unk>']) for word in tokens]
        if len(indices) < self.max_seq_len:
            indices += [word2index['<pad>']] * (self.max_seq_len - len(indices))
        else:
            indices = indices[:self.max_seq_len]
        return torch.tensor(indices, device=self.devices)

    def __getitem__(self, idx):
        source_sentence, target_sentence = self.dataset[idx]
        source_indices = self.tokenize_and_pad(source_sentence, self.word2index_source)
        target_indices = self.tokenize_and_pad(target_sentence, self.word2index_target)

        return source_indices, target_indices

# Transformer Model
class Transformer_Model(nn.Module):
    def __init__(self, devices, vocab_size_source, vocab_size_target, max_seq_len, pad_idx, d_model, num_heads, num_encoder_layers, num_decoder_layers, batch_size):
        super(Transformer_Model, self).__init__()

        self.encoder_layers = nn.ModuleList([Encoder_Model(d_model=d_model, num_heads=num_heads, devices=devices, batch_size=batch_size) for _ in range(num_encoder_layers)])
        self.decoder_layers = nn.ModuleList([Decoder_Model(devices=devices, d_model=d_model, num_heads=num_heads, batch_size=batch_size) for _ in range(num_decoder_layers)])

        self.source_embedding = Embedding_Model(vocab_size=vocab_size_source, d_model=d_model, pad_idx=pad_idx, devices=devices, max_seq_len=max_seq_len)
        self.target_embedding = Embedding_Model(vocab_size=vocab_size_target, d_model=d_model, pad_idx=pad_idx, devices=devices, max_seq_len=max_seq_len)

        self.linear = nn.Linear(d_model, vocab_size_target)

    def forward(self, data):
        source_data, target_data = data

        encoder_data = self.source_embedding(source_data)
        decoder_data = self.target_embedding(target_data)

        for encoder_layer in self.encoder_layers:
            encoder_data = encoder_layer(data=encoder_data)

        for decoder_layer in self.decoder_layers:
            decoder_data = decoder_layer(data=(encoder_data, decoder_data))

        output = self.linear(decoder_data)

        return output

# Data Loading Functions
def load_data(turkish_file, english_file):
    with open(turkish_file, 'r', encoding='utf-8') as f:
        turkish_sentences = f.readlines()

    with open(english_file, 'r', encoding='utf-8') as f:
        english_sentences = f.readlines()

    data = list(zip(turkish_sentences, english_sentences))
    return data

def build_vocab(sentences, min_freq=1):
    word_freq = {}
    for sentence in sentences:
        for word in nltk.word_tokenize(sentence.lower()):
            if word not in word_freq:
                word_freq[word] = 1
            else:
                word_freq[word] += 1

    word2index = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    for word, freq in word_freq.items():
        if freq >= min_freq:
            word2index[word] = len(word2index)

    return word2index

# Load Data
data = load_data('train.tr.txt', 'train.en.txt')

# Build Vocabulary
turkish_sentences = [pair[0] for pair in data]
english_sentences = [pair[1] for pair in data]

word2index_source = build_vocab(turkish_sentences)
word2index_target = build_vocab(english_sentences)

# Save vocabulary for future use
with open('word2index_source.pkl', 'wb') as f:
    pickle.dump(word2index_source, f)

with open('word2index_target.pkl', 'wb') as f:
    pickle.dump(word2index_target, f)

# Split Data into Train and Test Sets
random.shuffle(data)
split_idx = int(len(data) * 0.3)
train_data = data[:split_idx]
test_data = data[split_idx:]

# Set Parameters
max_seq_len = 16
devices = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 64

# Create Datasets and DataLoaders
train_dataset = Translate_Dataset(train_data, word2index_source, word2index_target, max_seq_len, devices)
test_dataset = Translate_Dataset(test_data, word2index_source, word2index_target, max_seq_len, devices)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize Model
d_model = 512
num_heads = 8
num_encoder_layers = 6
num_decoder_layers = 6
pad_idx = 0
vocab_size_source = len(word2index_source)
vocab_size_target = len(word2index_target)

model = Transformer_Model(devices=devices, vocab_size_source=vocab_size_source, vocab_size_target=vocab_size_target, max_seq_len=max_seq_len, pad_idx=pad_idx, d_model=d_model, num_heads=num_heads, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, batch_size=batch_size)
model = model.to(devices)

# Training Setup
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training Loop
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    for i, (source, target) in enumerate(train_loader):
        optimizer.zero_grad()

        source = source.to(devices).long()
        target = target.to(devices).long()

        output = model((source, target[:, :-1]))
        output = output.view(-1, output.shape[-1])
        target = target[:, 1:].reshape(-1)

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item()}")

# Save the trained model
torch.save(model.state_dict(), 'transformer_model.pth')

# Evaluation on the Test Set
model.eval()
with torch.no_grad():
    total_loss = 0
    for i, (source, target) in enumerate(test_loader):
        source = source.to(devices).long()
        target = target.to(devices).long()

        output = model((source, target[:, :-1]))
        output = output.view(-1, output.shape[-1])
        target = target[:, 1:].reshape(-1)

        loss = criterion(output, target)
        total_loss += loss.item()

    print(f"Test Loss: {total_loss / len(test_loader)}")

# Translation Function
def translate_sentence(model, sentence, word2index_source, index2word_target, max_seq_len, devices):
    model.eval()
    with torch.no_grad():
        # Tokenize and convert to indices
        tokens = nltk.word_tokenize(sentence.lower())
        indices = [word2index_source.get(word, word2index_source['<unk>']) for word in tokens]
        if len(indices) < max_seq_len:
            indices += [word2index_source['<pad>']] * (max_seq_len - len(indices))
        else:
            indices = indices[:max_seq_len]
        source_indices = torch.tensor(indices, device=devices).unsqueeze(0).long()

        # Prepare target tensor with <sos> token
        target_indices = torch.tensor([word2index_target['<pad>']] * max_seq_len, device=devices).unsqueeze(0).long()
        target_indices[0, 0] = word2index_target['<sos>']

        # Translate word by word
        for i in range(1, max_seq_len):
            output = model((source_indices, target_indices[:, :i]))
            next_word_idx = torch.argmax(output[:, -1, :], dim=-1)
            target_indices[0, i] = next_word_idx.item()
            if next_word_idx.item() == word2index_target['<eos>']:
                break

        # Convert indices back to words
        translated_sentence = [index2word_target[idx.item()] for idx in target_indices[0] if idx.item() not in {word2index_target['<pad>'], word2index_target['<sos>'], word2index_target['<eos>']}]
        return ' '.join(translated_sentence)

# Translate 10 random sentences
random_sentences = random.sample(turkish_sentences, 10)

# Create index2word mapping for the target vocabulary
index2word_target = {index: word for word, index in word2index_target.items()}

# Translate and print each sentence
translations = []
for sentence in random_sentences:
    translation = translate_sentence(model, sentence, word2index_source, index2word_target, max_seq_len, devices)
    translations.append((sentence.strip(), translation))
    print(f"Original: {sentence.strip()}")
    print(f"Translation: {translation}\n")

# Save translations
with open('translations.txt', 'w', encoding='utf-8') as f:
    for original, translation in translations:
        f.write(f"Original: {original}\n")
        f.write(f"Translation: {translation}\n\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Epoch [1/5], Step [1/2603], Loss: 10.672295570373535
Epoch [1/5], Step [11/2603], Loss: 8.48673152923584
Epoch [1/5], Step [21/2603], Loss: 7.570736408233643
Epoch [1/5], Step [31/2603], Loss: 6.881667137145996
Epoch [1/5], Step [41/2603], Loss: 6.284490585327148
Epoch [1/5], Step [51/2603], Loss: 5.789129734039307
Epoch [1/5], Step [61/2603], Loss: 5.5774407386779785
Epoch [1/5], Step [71/2603], Loss: 5.571192264556885
Epoch [1/5], Step [81/2603], Loss: 5.478513717651367
Epoch [1/5], Step [91/2603], Loss: 5.3568854331970215
Epoch [1/5], Step [101/2603], Loss: 5.2502055168151855
Epoch [1/5], Step [111/2603], Loss: 5.2531890869140625
Epoch [1/5], Step [121/2603], Loss: 5.186863899230957
Epoch [1/5], Step [131/2603], Loss: 4.927368640899658
Epoch [1/5], Step [141/2603], Loss: 5.029376983642578
Epoch [1/5], Step [151/2603], Loss: 5.169938087463379
Epoch [1/5], Step [161/2603], Loss: 5.059008598327637
Epoch [1/5], Step [171/2603], Loss: 4.820137023925781
Epoch [1/5], Step [181/2603], Loss: