In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset

import re
import numpy as np

import matplotlib.pyplot as plt

In [2]:
with open('haberler.txt', encoding='utf8') as f:
    
    txt_sentences = f.readlines()
    
new_list = ''.join(txt_sentences)

In [3]:
with open('turkce-stop-words', encoding='utf-8') as f:
    
    txt_stop_words = f.readlines()
    
stop_word_list = [i[:-1] for i in txt_stop_words]  # sondaki '\n' atmak için

In [4]:
def create_dataset(corpus, windows_size, stop_word_list):
    """
        corpus: metin
        windows_size: kaç komşu kelime
        embedding_size: kaç boyutlu vektör ile temsil edilecek
    """
    
    
    # remove punctuations and make lower corpus
    corpus = re.sub(r'\\n', ' ', corpus)  # \\n temizle
    corpus = re.sub(r'\n', ' ', corpus)  # \n temizle
    corpus = re.sub(r'[^\w\s]', ' ', corpus)  # word ve space hariç her şeyi at
    corpus = corpus.lower()  # küçük hale getir
    corpus = ' '.join([i for i in corpus.split() if i not in stop_word_list])  # remove stop words
    corpus = re.sub(r" \d+", " ", corpus)  # sayıları temizle
    corpus =  ' '.join([i for i in corpus.split() if len(i)>1])  # tek harfleri kaldır. (H.C(isim soyisim)-> h c -> anlamsız)
    corpus = re.sub(r' +', ' ', corpus)  # büyük space alanlarını tek space haline getir
    
    
    
    # word list
    corpus_word_list = corpus.split()
    corpus_word_list_len = len(corpus_word_list)
    
    
    # find number of unique value and assign a number to them
    unique_words = set(corpus_word_list)
    number_of_unique_words = len(unique_words)
    
    word_to_token_dict = {}
    token_to_word_dict = {}
    for token,word in enumerate(unique_words):
        word_to_token_dict[word] = token+1  # +1 olmasının sebebi 1 den başlasın, 0 padding için kullanılsın.
        token_to_word_dict[token+1] = word
    
    
    dataframe = []
    
    for i, word in enumerate(corpus_word_list):
        
        context_words = corpus_word_list[i-windows_size:i]+corpus_word_list[i+1:i+1+windows_size]
        context_words_token = [word_to_token_dict[word] for word in context_words]
        
        while len(context_words_token) < 4:  # padding
            context_words_token.append(0)
            
        dataframe.append([context_words_token, word_to_token_dict[word]])
        
        
    # örnek gösterim için
    print(' EXAMPLE : \n\n')
    print(' '.join(corpus_word_list[2:7]))
    print('\n --------------- \n')
    
    for i in range(4,5):
        print(str([token_to_word_dict[a] for a in dataframe[i][0]])+ ' --> ' +str(token_to_word_dict[dataframe[i][1]]) +'\n')
        print(str([dataframe[i][0]])+ ' --> ' +str(dataframe[i][1]) +'\n\n')
        
    print(f"\nnumber_of_unique_words : {number_of_unique_words}\n")
        
    dataframe = [[torch.tensor(i), torch.tensor(y)] for i,y in dataframe]  # convert dataframe elemnt to tensor (this is list of tensors)
    
    context_list = [i for i, y in dataframe]  # split context and targets
    target_list = [y for i, y in dataframe]
    
    context_tensor = torch.stack(context_list)
    target_tensor = torch.stack(target_list)
    
    dataset = TensorDataset(context_tensor,target_tensor)  # create dataset and dataloader
    dataloader = DataLoader(dataset) 
    
    return dataset, number_of_unique_words, word_to_token_dict, token_to_word_dict


In [5]:
class CBOW_Model(nn.Module):
    
    def __init__(self, num_embeddings, embedding_dim, windows_size):
        super(CBOW_Model, self).__init__()
        
        self.embedding_layer = nn.Embedding(num_embeddings+1, embedding_dim)  # num_embeddings: kaç farklı kelime var, embedding_dim: vector kaç boyutlu
        self.fc1 = nn.Linear(embedding_dim*windows_size*2,128)  # 5*2*2 = 20 boyulu giriş (bu örnek için 20) # hidden layer
        self.fc2 = nn.Linear(128, num_embeddings+1)  # output layer  # çıktı output unique sayısı kadar olmalı
        
        
    def forward(self, x_in):
        out = self.embedding_layer(x_in).view(1,-1).squeeze(0)  # 5 boyutlu 4 vektörden 20 boyutlu tek vektöre
        out = self.fc1(out)  # 5 boyutlu çıkış
        out = F.relu(out)
        out = self.fc2(out)
        out = F.log_softmax(out, dim=-1)

        return out
    
    def get_word_vector(self, word_tokenize):
        word = (torch.LongTensor([word_tokenize]))
        return self.embedding_layer(word).view(1, -1)


In [7]:
""" TRAINING CELL  """

with open('uzun_haberler.txt', encoding='utf8') as f:
    
    txt_sentences = f.readlines()
    
corpus = ''.join(txt_sentences)

corpus = corpus[:50000] # çok uzun sürüyor 

windows_size = 2

dataframe, number_of_unique_words, word_to_token_dict, token_to_word_dict = create_dataset(corpus, windows_size, stop_word_list)

train_set, val_set = torch.utils.data.random_split(dataframe, [int(len(dataframe) * 0.7), len(dataframe) - int(len(dataframe) * 0.7)])  # %70 train
val_set, test_set = torch.utils.data.random_split(val_set, [int(len(val_set) * 0.5), len(val_set) - int(len(val_set) * 0.5)])  # %15 val, %15 test

model = CBOW_Model(num_embeddings=number_of_unique_words, embedding_dim=5, windows_size=windows_size)


n_epochs = 6
loss_f = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 
    

counter = 0
for epoch in range(1, n_epochs+1):

    all_losses = []
    all_losses_val = []

    for context, target in train_set:

        pred = model(context)
        loss = loss_f(pred.unsqueeze(0), target.unsqueeze(0))

        all_losses.append(loss)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    for context_val, target_val in val_set: 
        
        with torch.no_grad():
        
            pred_val = model(context_val)

            loss_val = loss_f(pred_val.unsqueeze(0), target_val.unsqueeze(0))

            all_losses_val.append(loss_val)
        
    
    print(epoch)
    print(f"Epoch: {epoch}, Loss: {loss}, Validation Loss: {loss_val}")
    
    
    if epoch > 5: 
        if counter > 5:
            print('--- Early Stop ---')
            break
            
        if loss_val >= loss_val_backup:
            counter +=1
        else:
            counter = 0
            
        loss_val_backup = loss_val
        
    else:
        loss_val_backup = loss_val
    

""" ------  """

 EXAMPLE : 


dolar tl üçüncü çeyrekte görecek

 --------------- 

['dolar', 'tl', 'çeyrekte', 'görecek'] --> üçüncü

[[1958, 528, 2226, 540]] --> 1470



number_of_unique_words : 2889

1
Epoch: 1, Loss: 6.8381524085998535, Validation Loss: 8.414573669433594
2
Epoch: 2, Loss: 7.466350555419922, Validation Loss: 7.775332450866699
3
Epoch: 3, Loss: 7.577645301818848, Validation Loss: 6.829939842224121
4
Epoch: 4, Loss: 7.8071208000183105, Validation Loss: 5.842516899108887
5
Epoch: 5, Loss: 7.9753618240356445, Validation Loss: 5.209146499633789
6
Epoch: 6, Loss: 7.877024173736572, Validation Loss: 6.623602390289307


' ------  '

In [18]:
print(f"model kelimeleri tamamen rasgele bir şekilde seçse alacağı sonuç : % {(1/number_of_unique_words)*100}")

model kelimeleri tamamen rasgele bir şekilde seçse alacağı sonuç : % 0.034614053305642094


In [8]:
""" TESTING MODEL ACCURACY """

""" Doğru kelimeyi kaç kere bildi."""

total_true = 0

for context_test, target_test in test_set:
    
    with torch.no_grad():
    
        pred = model(context_test)

        if bool(torch.argmax(pred) == target_test):

            total_true +=1

        acc = (total_true / len(test_set)) * 100
        
    
print(f"Test Accuracy : ½ {acc}")

Test Accuracy : ½ 2.5974025974025974


In [9]:
deneme_cumle = torch.tensor([word_to_token_dict['belediye'], word_to_token_dict['başkan'], word_to_token_dict['üye'], word_to_token_dict['parti']])
pred_percentage = model(deneme_cumle)
token_to_word_dict[int(torch.argmax(pred_percentage))]

'takım'

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

word1 = model.get_word_vector(word_to_token_dict['parti']).detach().numpy()
word2 = model.get_word_vector(word_to_token_dict['üye']).detach().numpy()

print(word1)
print(word2)

print(f"\nbenzerlik : {cosine_similarity(word1,word2)[0][0]}")

[[-0.71029663  0.02827632  0.48388532  0.7006433   1.5753489 ]]
[[-1.0905299  -0.3770474  -1.4081682   2.6060262   0.40407977]]

benzerlik : 0.41220712661743164
