<a href="https://colab.research.google.com/github/burakemretetik/dl_with_py/blob/main/encoder_decoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import random
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# --- 1. AYARLAR VE SABİTLER ---
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 64
MAX_LEN = 20  # Çok uzun cümleleri almayalım (eğitim hızı için)
NUM_SAMPLES = 30000  # Veri setinden alınacak örnek sayısı
N_EPOCHS = 10
HIDDEN_DIM = 256
EMBED_DIM = 128
LR = 0.001

In [2]:
# --- 2. VERİYİ İNDİRME VE HAZIRLAMA ---
if not os.path.exists("deu.txt"):
    print("Veri indiriliyor...")
    os.system("wget -q http://www.manythings.org/anki/deu-eng.zip")
    os.system("unzip -q -o deu-eng.zip")
    print("Veri indirildi.")

def clean_text(text):
    text = text.lower().strip()
    text = re.sub(r"([.!?])", r" \1", text)
    text = re.sub(r"[^a-zA-Z.!?üöäß]+", r" ", text)
    return text.strip()

# Kelime Dağarcığı Sınıfı
class Vocabulary:
    def __init__(self):
        self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.count = 4

    def add_sentence(self, sentence):
        for word in sentence.split():
            if word not in self.word2idx:
                self.word2idx[word] = self.count
                self.idx2word[self.count] = word
                self.count += 1

    def encode(self, sentence):
        return [self.word2idx.get(w, 3) for w in sentence.split()]

# Veriyi Oku ve İşle
eng_vocab = Vocabulary()
deu_vocab = Vocabulary()
pairs = []

print("Veri işleniyor...")
with open("deu.txt", "r", encoding="utf-8") as f:
    lines = f.read().strip().split("\n")

    # Hızlı eğitim için ilk N cümleyi alıyoruz
    for line in lines[:NUM_SAMPLES]:
        parts = line.split("\t")
        if len(parts) >= 2:
            eng = clean_text(parts[0])
            deu = clean_text(parts[1])

            # Çok uzun cümleleri ele
            if len(eng.split()) < MAX_LEN and len(deu.split()) < MAX_LEN:
                eng_vocab.add_sentence(eng)
                deu_vocab.add_sentence(deu)
                pairs.append((eng, deu))

print(f"Toplam Cümle: {len(pairs)}")
print(f"İngilizce Kelime Sayısı: {eng_vocab.count}")
print(f"Almanca Kelime Sayısı: {deu_vocab.count}")

# Dataset ve DataLoader
class TransDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        eng, deu = self.pairs[idx]
        eng_idx = [eng_vocab.word2idx["<SOS>"]] + eng_vocab.encode(eng) + [eng_vocab.word2idx["<EOS>"]]
        deu_idx = [deu_vocab.word2idx["<SOS>"]] + deu_vocab.encode(deu) + [deu_vocab.word2idx["<EOS>"]]
        return torch.tensor(eng_idx), torch.tensor(deu_idx)

def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_pad = pad_sequence(src_batch, batch_first=True, padding_value=0)
    trg_pad = pad_sequence(trg_batch, batch_first=True, padding_value=0)
    return src_pad, trg_pad

train_loader = DataLoader(TransDataset(pairs), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

Veri işleniyor...
Toplam Cümle: 30000
İngilizce Kelime Sayısı: 4493
Almanca Kelime Sayısı: 7300


In [3]:
# --- 3. MODEL MİMARİSİ (ENCODER-DECODER) ---
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.dropout = nn.Dropout(0.5)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, input, hidden):
        input = input.unsqueeze(1) # [batch, 1]
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(device)
        hidden = self.encoder(src)

        input = trg[:, 0] # İlk input <SOS>

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input = trg[:, t] if random.random() < teacher_forcing_ratio else top1

        return outputs

# Modeli Başlat
enc = Encoder(eng_vocab.count, EMBED_DIM, HIDDEN_DIM)
dec = Decoder(deu_vocab.count, EMBED_DIM, HIDDEN_DIM)
model = Seq2Seq(enc, dec).to(device)

optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=0) # PAD'i yoksay

In [4]:
# --- 4. EĞİTİM DÖNGÜSÜ ---
print(f"\nEğitim Başlıyor ({N_EPOCHS} Epoch)...")
model.train()

for epoch in range(N_EPOCHS):
    epoch_loss = 0
    for src, trg in train_loader:
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg)

        # Loss hesabı (SOS token hariç)
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1) # Clipping
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1} | Loss: {epoch_loss / len(train_loader):.4f}")


Eğitim Başlıyor (10 Epoch)...
Epoch 1 | Loss: 3.9216
Epoch 2 | Loss: 2.9634
Epoch 3 | Loss: 2.5968
Epoch 4 | Loss: 2.3506
Epoch 5 | Loss: 2.1637
Epoch 6 | Loss: 2.0014
Epoch 7 | Loss: 1.8728
Epoch 8 | Loss: 1.7573
Epoch 9 | Loss: 1.6627
Epoch 10 | Loss: 1.5681


In [6]:
def translate_sentence(sentence):
    model.eval()
    sentence = clean_text(sentence)
    tokens = [eng_vocab.word2idx.get(w, 3) for w in sentence.split()]
    tokens = [1] + tokens + [2]
    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)

    with torch.no_grad():
        hidden = model.encoder(src_tensor)

    trg_indexes = [1]
    for _ in range(MAX_LEN):
        # BURADAKİ HATA DÜZELTİLDİ: .unsqueeze(0) KALDIRILDI
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden)

        pred_token = output.argmax(1).item()
        if pred_token == 2:
            break
        trg_indexes.append(pred_token)

    return " ".join([deu_vocab.idx2word[i] for i in trg_indexes[1:]])

print("\n--- Çeviri Testleri ---")
test_sentences = ["go .", "come here .", "i am happy .", "he plays football .", "it is cold ."]
for s in test_sentences:
    print(f"Ing: {s:20} -> Alm: {translate_sentence(s)}")


--- Çeviri Testleri ---
Ing: go .                 -> Alm: geh weg !
Ing: come here .          -> Alm: komm hier herunter .
Ing: i am happy .         -> Alm: ich bin glücklich glücklich .
Ing: he plays football .  -> Alm: er will auf dem bett .
Ing: it is cold .         -> Alm: es ist kalt kalt .
