In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import random

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected with three linear layers, split into
    ``num_heads`` heads of width ``d_k = d_model // num_heads``, attended
    per head, concatenated again and passed through an output projection.

    Args:
        d_model: Width of the input and output features. Must be divisible
            by ``num_heads`` (checked with an ``assert``).
        num_heads: Number of attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Query / key / value projections followed by the output projection.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        per_head = projected.view(batch_size, -1, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend ``Q`` over ``K``/``V``.

        Scores are ``Q @ K^T / sqrt(d_k)``. When ``mask`` is given, every
        position where ``mask == 0`` is overwritten with ``-1e9`` before
        the softmax over the last dimension.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        scale = math.sqrt(self.d_k)
        scores = (Q @ K.transpose(-2, -1)) / scale

        if mask is not None:
            blocked = mask == 0
            scores = scores.masked_fill(blocked, -1e9)

        attention_weights = scores.softmax(dim=-1)
        return attention_weights @ V, attention_weights

    def forward(self, query, key, value, mask=None):
        """Run multi-head attention and return the projected output.

        The attention weights are computed but not returned.
        """
        batch_size = query.size(0)

        Q = self._split_heads(self.W_q(query), batch_size)
        K = self._split_heads(self.W_k(key), batch_size)
        V = self._split_heads(self.W_v(value), batch_size)

        per_head_output, _ = self.scaled_dot_product_attention(Q, K, V, mask)

        # (batch, heads, seq, d_k) -> (batch, seq, d_model)
        merged = per_head_output.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.d_model)

        return self.W_o(merged)


class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension.

    Computes ``fc2(dropout(relu(fc1(x))))``.

    Args:
        d_model: Input and output feature width.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = self.fc1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.fc2(hidden)


class DecoderLayer(nn.Module):
    """Decoder block: self-attention then feed-forward, each post-normalised.

    Both sub-layers follow the pattern ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        d_model: Feature width of the layer input and output.
        num_heads: Number of heads used by the self-attention sub-layer.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used on both residual branches and
            inside the feed-forward sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        # Sub-layer 1: self-attention + its normalisation.
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)

        # Sub-layer 2: position-wise feed-forward + its normalisation.
        self.ffn = PositionWiseFeedForward(d_model, d_ff, dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # Shared dropout module for both residual branches.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply both sub-layers to ``x``; ``mask`` is forwarded to the attention."""
        attended = self.self_attn(x, x, x, mask)
        after_attention = self.norm1(x + self.dropout(attended))

        transformed = self.ffn(after_attention)
        return self.norm2(after_attention + self.dropout(transformed))


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed and stored as
    the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(pos * w_i)`` and odd columns hold ``cos(pos * w_i)`` with
    ``w_i = exp(-2i * ln(10000) / d_model)``.

    Args:
        d_model: Feature width of the embeddings. Odd values are supported;
            the last (even-indexed) column then has a sine but no cosine
            partner.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # There are only floor(d_model / 2) odd columns; slicing keeps the
        # assignment valid when d_model is odd (it is a no-op when even).
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Leading batch dimension so the table broadcasts over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]


class WordGenerator(nn.Module):
    """Decoder-only model that generates words one token at a time.

    Pipeline: token embedding scaled by ``sqrt(d_model)``, positional
    encoding, dropout, a stack of ``DecoderLayer`` blocks and a final linear
    projection onto the vocabulary.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per decoder layer.
        num_layers: Number of stacked decoder layers.
        d_ff: Hidden width of each feed-forward sub-layer.
        dropout: Dropout probability.
        max_len: Number of positions prepared by the positional encoding.
    """

    def __init__(self, vocab_size, d_model=128, num_heads=4,
                 num_layers=3, d_ff=512, dropout=0.1, max_len=100):
        super().__init__()

        self.d_model = d_model
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)

        decoder_stack = []
        for _ in range(num_layers):
            decoder_stack.append(DecoderLayer(d_model, num_heads, d_ff, dropout))
        self.layers = nn.ModuleList(decoder_stack)

        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_square_subsequent_mask(self, sz, device):
        """Return an ``(sz, sz)`` boolean causal mask.

        Entry ``[i, j]`` is ``True`` when ``j <= i`` (position ``i`` may
        attend to ``j``) and ``False`` for future positions.
        """
        future = torch.ones(sz, sz, device=device).triu(diagonal=1).bool()
        return torch.logical_not(future)

    def forward(self, x, mask=None):
        """Map token ids ``x`` to vocabulary logits.

        When ``mask`` is ``None`` a causal mask matching the sequence length
        is built and given two leading singleton dimensions.
        """
        hidden = self.embedding(x) * math.sqrt(self.d_model)
        hidden = self.dropout(self.pos_encoding(hidden))

        if mask is None:
            causal = self.generate_square_subsequent_mask(hidden.size(1), hidden.device)
            mask = causal[None, None]

        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, mask)

        return self.fc_out(hidden)


class WordDataset:
    """Character-level dataset: builds a vocabulary from words and batches them.

    Token ids 0, 1 and 2 are reserved for ``<PAD>``, ``<START>`` and
    ``<END>``; the distinct characters found in ``words`` receive ids from
    3 upwards in sorted order.

    Attributes:
        words: Private copy of the input words (shuffled by ``create_batches``).
        chars: Sorted list of the distinct characters.
        char_to_idx: Mapping from token (special token or character) to id.
        idx_to_char: Inverse of ``char_to_idx``.
        vocab_size: Number of entries in ``char_to_idx``.
    """

    def __init__(self, words):
        """Build the vocabulary from ``words`` and print a short summary.

        Args:
            words: Iterable of strings.
        """
        # Keep a private copy: create_batches() shuffles in place and must
        # not reorder the caller's list (this also accepts tuples/iterators).
        self.words = list(words)

        # Collect every character that occurs and give it a stable order.
        self.chars = sorted(set(''.join(self.words)))

        # Special tokens first, then one id per character (a -> 3, b -> 4, ...).
        self.char_to_idx = {'<PAD>': 0, '<START>': 1, '<END>': 2}
        for i, c in enumerate(self.chars):
            self.char_to_idx[c] = i + 3

        self.idx_to_char = {v: k for k, v in self.char_to_idx.items()}
        self.vocab_size = len(self.char_to_idx)

        print(f"Toplam kelime: {len(self.words)}")
        print(f"Benzersiz karakter: {len(self.chars)}")
        print(f"Vocabulary boyutu: {self.vocab_size}")
        print(f"Karakterler: {self.chars}")

    def encode_word(self, word):
        """Return ``[<START>, ids of the characters of word..., <END>]``.

        Raises:
            KeyError: If ``word`` contains a character not in the vocabulary.
        """
        return [self.char_to_idx['<START>']] + \
               [self.char_to_idx[c] for c in word] + \
               [self.char_to_idx['<END>']]

    def decode_indices(self, indices):
        """Turn a sequence of token ids back into a string.

        Decoding stops at the first ``<END>``; ``<START>`` and ``<PAD>`` ids
        are skipped.
        """
        end_idx = self.char_to_idx['<END>']
        skipped = (self.char_to_idx['<START>'], self.char_to_idx['<PAD>'])

        chars = []
        for idx in indices:
            if idx == end_idx:
                break
            if idx not in skipped:
                chars.append(self.idx_to_char[idx])
        return ''.join(chars)

    def create_batches(self, batch_size=32):
        """Shuffle the words and build padded ``(input, target)`` batches.

        Every encoded word in a batch is right-padded with ``<PAD>`` to the
        longest sequence of that batch. The input drops the last token and
        the target drops the first, so ``target[t]`` is the token that
        follows ``input[t]``.

        Returns:
            List of ``(input_tensor, target_tensor)`` tuples of dtype
            ``torch.long``.
        """
        pad_idx = self.char_to_idx['<PAD>']
        random.shuffle(self.words)
        batches = []

        for i in range(0, len(self.words), batch_size):
            encoded = [self.encode_word(w) for w in self.words[i:i + batch_size]]

            # Right-pad to the longest sequence in this batch.
            max_len = max(len(seq) for seq in encoded)
            padded = [seq + [pad_idx] * (max_len - len(seq)) for seq in encoded]

            input_seqs = [seq[:-1] for seq in padded]   # everything but the last token
            target_seqs = [seq[1:] for seq in padded]   # everything but the first token

            batches.append((
                torch.tensor(input_seqs, dtype=torch.long),
                torch.tensor(target_seqs, dtype=torch.long),
            ))

        return batches


def train_model(model, dataset, device, epochs=50, batch_size=32, lr=0.001,
                log_every=10):
    """Train ``model`` for next-token prediction on ``dataset``.

    Uses Adam and a cross-entropy loss that ignores ``<PAD>`` targets. Fresh
    batches are requested from ``dataset.create_batches`` every epoch.

    Args:
        model: Module mapping a ``(batch, seq)`` tensor of token ids to
            ``(batch, seq, vocab)`` logits.
        dataset: Object exposing ``char_to_idx`` (with a ``'<PAD>'`` key) and
            ``create_batches(batch_size)`` returning ``(input, target)``
            tensor pairs.
        device: Device the model and batches are moved to.
        epochs: Number of passes over the dataset.
        batch_size: Batch size passed to ``dataset.create_batches``.
        lr: Adam learning rate.
        log_every: Print the average loss and three sampled words every this
            many epochs; ``0`` or ``None`` disables the periodic report.

    Raises:
        ValueError: If ``dataset.create_batches`` returns no batches.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(ignore_index=dataset.char_to_idx['<PAD>'])

    print(f"\n{'='*50}")
    print(f"Egitim Basldai... Device: {device}")
    print(f"{'='*50}\n")

    for epoch in range(epochs):
        # Sampling during the periodic report switches the model to eval
        # mode, so training mode is re-enabled at the start of every epoch.
        model.train()
        total_loss = 0
        batches = dataset.create_batches(batch_size)
        if not batches:
            # Fail with a clear message instead of a ZeroDivisionError below.
            raise ValueError("dataset produced no batches; cannot train on an empty dataset")

        for input_seq, target_seq in batches:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            optimizer.zero_grad()

            output = model(input_seq)

            # Flatten (batch, seq, vocab) -> (batch * seq, vocab). The class
            # count is read from the logits themselves so the model does not
            # need to expose a ``vocab_size`` attribute.
            loss = criterion(output.reshape(-1, output.size(-1)),
                             target_seq.reshape(-1))

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(batches)

        if log_every and (epoch + 1) % log_every == 0:
            print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")
            # Sample a few words to show progress.
            sample_words = generate_words(model, dataset, device, num_words=3, max_len=15)
            print(f"ornek Kelimeler: {sample_words}\n")

    print("Egitim Tamamlandi\n")


def generate_words(model, dataset, device, num_words=5, max_len=20, temperature=1.0):
    """Sample words from ``model`` one token at a time.

    The model is switched to eval mode (and left in it) and gradients are
    disabled. Each word starts from ``<START>`` and grows until ``<END>`` or
    ``<PAD>`` is drawn or ``max_len`` tokens have been produced.

    Args:
        model: Module mapping a ``(1, seq)`` tensor of token ids to
            ``(1, seq, vocab)`` logits.
        dataset: Object exposing ``char_to_idx`` (with ``'<START>'``,
            ``'<END>'`` and ``'<PAD>'`` keys) and ``decode_indices``.
        device: Device on which the input tensors are created.
        num_words: Number of generation attempts.
        max_len: Maximum number of tokens generated per word.
        temperature: Divisor applied to the logits before the softmax.
            ``0`` selects greedy decoding (argmax) instead of dividing by
            zero.

    Returns:
        List of generated strings. Attempts that decode to an empty string
        are dropped, so the list may be shorter than ``num_words``.
    """
    start_idx = dataset.char_to_idx['<START>']
    stop_indices = {dataset.char_to_idx['<END>'], dataset.char_to_idx['<PAD>']}
    # Dividing by a zero temperature would yield NaN probabilities and make
    # torch.multinomial fail; treat it as deterministic argmax decoding.
    greedy = temperature == 0

    model.eval()
    generated_words = []

    with torch.no_grad():
        for _word in range(num_words):
            current_seq = [start_idx]

            for _step in range(max_len):
                input_tensor = torch.tensor([current_seq], dtype=torch.long).to(device)
                output = model(input_tensor)

                # Only the logits of the last position predict the next token.
                logits = output[0, -1, :]
                if greedy:
                    next_idx = torch.argmax(logits).item()
                else:
                    probs = F.softmax(logits / temperature, dim=0)
                    next_idx = torch.multinomial(probs, 1).item()

                if next_idx in stop_indices:
                    break

                current_seq.append(next_idx)

            word = dataset.decode_indices(current_seq)
            if word:  # skip empty results
                generated_words.append(word)

    return generated_words


if __name__ == "__main__":
    # Demo script: train the character-level generator on a small list of
    # names, then print freshly sampled words.
    #
    # Sample words. Names were chosen (per the original author's note)
    # because they come from different language families and have varied
    # character frequencies.
    words = [
        "ahmet", "mehmet", "ali", "veli", "ayşe", "fatma", "zeynep", "elif",
        "mustafa", "ibrahim", "hasan", "hüseyin", "emre", "cem", "can",
        "deniz", "ege", "mert", "yusuf", "ömer", "selim", "kerem", "berk",
        "ada", "ece", "selin", "defne", "buse", "esra", "seda", "pelin",
        "burak", "onur", "kaan", "baran", "eren", "arda", "alp", "doruk"
    ]

    # Pick the device: CUDA when available, otherwise CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Kullanilan Cihaz: {device}\n")

    # Build the dataset (vocabulary + batching).
    dataset = WordDataset(words)

    # Build the model.
    model = WordGenerator(
        vocab_size=dataset.vocab_size,
        d_model=128,
        num_heads=4,
        num_layers=3,
        d_ff=512,
        dropout=0.1
    )

    print(f"\nModel parametreleri: {sum(p.numel() for p in model.parameters()):,}")

    # Train the model.
    train_model(model, dataset, device, epochs=100, batch_size=16, lr=0.001)

    # Generate new words: 10 sets of up to 100 samples each.
    print(f"{'='*50}")
    print("YENI KELIMELER URETILIYOR...")
    print(f"{'='*50}\n")

    for i in range(10):
        new_words = generate_words(model, dataset, device, num_words=100, max_len=15)
        print(f"Set {i+1}: {', '.join(new_words)}")

    # "\B" was an invalid escape sequence (it printed a literal backslash and
    # triggers a SyntaxWarning on recent Python); a newline was intended.
    print("\nBitti !")

Kullanilan Cihaz: cuda

Toplam kelime: 39
Benzersiz karakter: 25
Vocabulary boyutu: 28
Karakterler: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'y', 'z', 'ö', 'ü', 'ş']

Model parametreleri: 602,012

Egitim Basldai... Device: cuda

Epoch 10/100 - Loss: 1.4935
ornek Kelimeler: ['eynin', 'eyi', 'şesra']

Epoch 20/100 - Loss: 0.9527
ornek Kelimeler: ['met', 'barak', 'alp']

Epoch 30/100 - Loss: 0.8752
ornek Kelimeler: ['kerem', 'ege', 'burahüse']

Epoch 40/100 - Loss: 0.8236
ornek Kelimeler: ['burak', 'ada', 'arda']

Epoch 50/100 - Loss: 0.7463
ornek Kelimeler: ['ömer', 'mehmet', 'burak']

Epoch 60/100 - Loss: 0.7641
ornek Kelimeler: ['ömer', 'selin', 'zeynep']

Epoch 70/100 - Loss: 0.7356
ornek Kelimeler: ['mert', 'ece', 'cem']

Epoch 80/100 - Loss: 0.7337
ornek Kelimeler: ['arda', 'deniz', 'selin']

Epoch 90/100 - Loss: 0.7163
ornek Kelimeler: ['veli', 'emre', 'mert']

Epoch 100/100 - Loss: 0.7275
ornek Kelimeler: ['ömer', 'ahmet'