In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
device  = torch.device('cuda' if torch.cuda.is_available() else 'cpu' ) 

class WordDataset:
    """kelimleri alir ve vocabulary oluturur"""
    def __init__(self, words):
        self.words = words
        
        chars = set(''.join(words)) #kullanilan tum karakterleri bul ve sozluge ekle
        self.chars = sorted(list(chars))#karakterleri sirala ve properties olarak kaydet
        
        # Ozel tokenlari ekle padding start ve end
        self.char_to_idx = {'<PAD>': 0, '<START>': 1, '<END>': 2}
        for i, c in enumerate(self.chars):
            self.char_to_idx[c] = i + 3 #her bir karakteri charto idxe ekle ve her karaktere sayisa bir deger ekle a:3 b:4 gibi
        
        self.idx_to_char = {v: k for k, v in self.char_to_idx.items()} #key value lari tesrcevir
        self.vocab_size = len(self.char_to_idx) #vocabulerity size hesap eder
        
        print(f"Toplam kelime: {len(words)}")
        print(f"Benzersiz karakter: {len(self.chars)}")
        print(f"Vocabulary boyutu: {self.vocab_size}")
        print(f"Karakterler: {self.chars}")
    
    def encode_word(self, word):
        """Kelimeyi index numaralrina gore sayisallastariri"""
        return [self.char_to_idx['<START>']] + \
               [self.char_to_idx[c] for c in word] + \
               [self.char_to_idx['<END>']]
    
    def decode_indices(self, indices):
        """index listesini kelimeye cevirir"""
        chars = []
        for idx in indices:
            if idx == self.char_to_idx['<END>']:
                break
            if idx != self.char_to_idx['<START>'] and idx != self.char_to_idx['<PAD>']:
                chars.append(self.idx_to_char[idx])
        return ''.join(chars)
    
    def create_batches(self, batch_size=32):
        """egitim batchleri olusturur her bit tensoru ayni boyuta getirir"""
        #self.rng.shuffle(self.words)
        random.shuffle(self.words)
        batches = []
        
        for i in range(0, len(self.words), batch_size):
            batch_words = self.words[i:i+batch_size]
            
            # encode
            encoded = [self.encode_word(w) for w in batch_words]
            
            # padding
            max_len = max(len(seq) for seq in encoded)
            padded = []
            for seq in encoded:
                padded.append(seq + [self.char_to_idx['<PAD>']] * (max_len - len(seq)))
            
            # input target
            input_seqs = [seq[:-1] for seq in padded]  # Son harf haric
            target_seqs = [seq[1:] for seq in padded]  # İlk harf haric
            
            batches.append((
                torch.tensor(input_seqs, dtype=torch.long),
                torch.tensor(target_seqs, dtype=torch.long)
            ))
        
        return batches




In [2]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        attention_weights = F.softmax(scores, dim=-1)
        output = torch.matmul(attention_weights, V)
        return output, attention_weights
    
    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        
        Q = self.W_q(query).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(key).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(value).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        
        attn_output, _ = self.scaled_dot_product_attention(Q, K, V, mask)
        
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            batch_size, -1, self.d_model
        )
        
        output = self.W_o(attn_output)
        return output


class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        return self.fc2(self.dropout(F.relu(self.fc1(x))))


class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        
        self.ffn = PositionWiseFeedForward(d_model, d_ff, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_output))
        
        return x


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                            (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


class WordGenerator(nn.Module):
    """Kelime üreten decoder modeli"""
    def __init__(self, vocab_size, d_model=128, num_heads=4, 
                 num_layers=3, d_ff=512, dropout=0.1, max_len=200):
        super().__init__()
        
        self.d_model = d_model
        self.vocab_size = vocab_size
        
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    def generate_square_subsequent_mask(self, sz, device):
        mask = torch.triu(torch.ones(sz, sz, device=device), diagonal=1).bool()
        return ~mask
    
    def forward(self, x, mask=None):
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        x = self.dropout(x)
        
        if mask is None:
            seq_len = x.size(1)
            mask = self.generate_square_subsequent_mask(seq_len, x.device)
            mask = mask.unsqueeze(0).unsqueeze(0)
        
        for layer in self.layers:
            x = layer(x, mask)
        
        output = self.fc_out(x)
        return output



In [3]:
with open("newdata.txt", "r", errors="ignore") as f:
        words = f.read().splitlines()
words = [w for w in words if 1 < len(w) < 60]
train  = words[0:round(0.8*len(words))]
test = words[round(0.2*len(words)):]
dataset = WordDataset(train)



Toplam kelime: 3999714
Benzersiz karakter: 196
Vocabulary boyutu: 199
Karakterler: ['\x08', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\xa0', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '\xad', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'É', 'Ë', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Õ', '×', 'Ø', 'Ù', 'Û', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'í', 'î', 'ï', 'ð', 'ñ', 'ó', 'ö', '÷', 'ø', 'ù', 'ú', 'ý', 'Œ', 'œ', 'Š', 'š', 'Ÿ', 'Ž', 'ž', 'ƒ', 'ˆ', '˜', '–', '—', '

In [4]:
each_batch = 500
maximum = int(5e+8)
step = maximum//each_batch
max_len = 15
temperature=1.0
results = {}

In [None]:
model = torch.load("generator_3.pt")
model = model["model"]

model.eval()
with torch.no_grad():
    for i in range(step):
        generated_words = []
        for _ in range(each_batch):
            current_seq = [dataset.char_to_idx['<START>']]
            for _ in range(max_len):
                input_tensor = torch.tensor([current_seq] , dtype=torch.long).to(device)
                output = model(input_tensor)
                logits = output[0,-1,:] / temperature
                probs = F.softmax(logits,dim=0)
                next_idx = torch.multinomial(probs, 1).item()
                if next_idx == dataset.char_to_idx['<END>'] or next_idx == dataset.char_to_idx['<PAD>']:
                    break
                current_seq.append(next_idx)
            word = dataset.decode_indices(current_seq)
            if word and word not in generated_words: 
                generated_words.append(word)
        count = sum (1 for x in generated_words if x in test)
        test = [x for x in test if x not in generated_words] 
        results[i+1] = count
        print(f"step : {(i+1)} - count {count} ")
        if len(test) == 0 :
            break


  model = torch.load("generator_3.pt")


step : 1 - count 12 
step : 2 - count 0 


KeyboardInterrupt: 

500000000


1000000.0
