In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import re
from collections import Counter
from torchinfo import summary

In [2]:
class TextDataset(Dataset):
    """Sliding-window next-word dataset built from a UTF-8 text file.

    The file is lower-cased and split into ``\\w+`` tokens. Every token whose
    count is at least ``min_word_freq`` receives a vocabulary id; all other
    tokens are mapped to the ``'<UNK>'`` id. Item ``i`` is the pair
    ``(data[i:i+seq_length], data[i+1:i+seq_length+1])``, i.e. the target is
    the input window shifted one token to the right.

    Args:
        file_path: Path of the text file to read.
        seq_length: Number of tokens in each input/target window.
        min_word_freq: Minimum count a token needs to get its own id.

    Attributes:
        words: Token strings in file order.
        word_to_ix: Token -> id mapping; ids are contiguous ``0..V-1`` and
            ``'<UNK>'`` is always the last id.
        ix_to_word: Inverse of ``word_to_ix``.
        data: ``words`` encoded as ids.
        seq_length: Window length given at construction.
    """

    def __init__(self, file_path, seq_length, min_word_freq=2):
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        self.words = self.tokenize(text)
        word_counts = Counter(self.words)

        # Filter first, THEN number. Numbering while filtering leaves gaps in
        # the ids, so the '<UNK>' id (= number of kept words) could collide
        # with a real word and ids could exceed len(word_to_ix), which callers
        # use as the embedding size.
        kept_words = [word for word, count in word_counts.items() if count >= min_word_freq]
        self.word_to_ix = {word: i for i, word in enumerate(kept_words)}
        self.word_to_ix['<UNK>'] = len(self.word_to_ix)  # Add unknown token
        self.ix_to_word = {i: word for word, i in self.word_to_ix.items()}

        unk_ix = self.word_to_ix['<UNK>']
        self.data = [self.word_to_ix.get(w, unk_ix) for w in self.words]
        self.seq_length = seq_length

        # Debug: Print vocabulary size and a few samples
        print(f"Vocabulary size: {len(self.word_to_ix)}")
        print(f"Sample data indices: {self.data[:10]}")

    def tokenize(self, text):
        """Return the lower-cased ``\\w+`` tokens of ``text`` in order."""
        return re.findall(r'\w+', text.lower())

    def __len__(self):
        """Number of full windows; 0 when the corpus is shorter than a window."""
        # len() must not return a negative number.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, index):
        """Return ``(input_ids, target_ids)`` tensors for the window at ``index``."""
        return (
            torch.tensor(self.data[index:index+self.seq_length], dtype=torch.long),
            torch.tensor(self.data[index+1:index+self.seq_length+1], dtype=torch.long)
        )


In [3]:
class LanguageModel(nn.Module):
    """Next-token scorer: embedding lookup, one batch-first LSTM, linear head.

    Args:
        vocab_size: Number of token ids; also the size of the output layer.
        embedding_dim: Width of each token embedding.
        hidden_dim: LSTM hidden-state width.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Map token ids ``x`` to per-position scores over the vocabulary.

        The LSTM's final (hidden, cell) state is discarded; only the
        per-step outputs are fed to the linear head.
        """
        per_step_hidden, _final_state = self.lstm(self.embedding(x))
        return self.fc(per_step_hidden)

In [4]:
def train_model(model, dataloader, num_epochs, learning_rate, device):
    """Train ``model`` with Adam and cross-entropy, printing the mean loss per epoch.

    Args:
        model: Module mapping a batch of inputs to scores whose last dimension
            indexes the classes.
        dataloader: Iterable yielding ``(batch_inputs, batch_targets)`` tensor pairs.
        num_epochs: Number of passes over ``dataloader``.
        learning_rate: Adam learning rate.
        device: Device the model and every batch are moved to.

    Returns:
        The same ``model`` object, trained in place and left on ``device``.

    Raises:
        ValueError: If ``dataloader`` yields no batches in an epoch.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for batch_inputs, batch_targets in dataloader:
            batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

            optimizer.zero_grad()
            outputs = model(batch_inputs)

            # Flatten every position into one row for CrossEntropyLoss. The
            # class count comes from the model output itself, so the function
            # does not depend on a notebook-level `dataset` variable.
            # reshape (not view) also accepts non-contiguous tensors.
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), batch_targets.reshape(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Otherwise the average below would divide by zero.
            raise ValueError("dataloader yielded no batches; nothing to train on")

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/num_batches:.4f}")

    return model

In [5]:
def generate_autocomplete(model, input_text, word_to_ix, ix_to_word, device, num_suggestions=3):
    """Suggest the most probable next words after ``input_text``.

    Args:
        model: Module mapping a ``(1, T)`` tensor of token ids to scores of
            shape ``(1, T, vocab)``. It is switched to eval mode.
        input_text: Prompt text; lower-cased and split into ``\\w+`` tokens.
        word_to_ix: Token -> id mapping; must contain ``'<UNK>'``.
        ix_to_word: Id -> token mapping used to decode the suggestions.
        device: Device the input tensor is moved to.
        num_suggestions: Maximum number of words to return.

    Returns:
        Up to ``num_suggestions`` words, most probable first. An empty list
        when the prompt contains no tokens or ``num_suggestions`` is not positive.
    """
    model.eval()

    # Same tokenisation as TextDataset.tokenize, so punctuation attached to a
    # word ("nasıl?") does not turn a known word into <UNK>.
    words = re.findall(r'\w+', input_text.lower())
    if not words or num_suggestions <= 0:
        # An empty id list would become a float tensor and fail in the embedding.
        return []

    unk_ix = word_to_ix['<UNK>']
    input_seq = [word_to_ix.get(word, unk_ix) for word in words]
    input_tensor = torch.tensor(input_seq, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(input_tensor)
        # Distribution over the vocabulary at the last prompt position.
        probabilities = torch.softmax(output[0, -1], dim=0)
        # topk raises if k exceeds the number of entries.
        k = min(num_suggestions, probabilities.numel())
        top_indices = torch.topk(probabilities, k).indices.tolist()

    suggestions = [ix_to_word[idx] for idx in top_indices]
    return suggestions

In [6]:
# --- Data ---------------------------------------------------------------
FILE_PATH = "data_extended.txt"  # training corpus, read by TextDataset
SEQ_LENGTH = 5                   # tokens per input/target window
MIN_WORD_FREQ = 0                # minimum count for a word to get its own id

# --- Model --------------------------------------------------------------
EMBEDDING_DIM = 100              # token embedding width
HIDDEN_DIM = 256                 # LSTM hidden-state width

# --- Optimisation -------------------------------------------------------
BATCH_SIZE = 32
NUM_EPOCHS = 100
LEARNING_RATE = 1e-3

In [7]:
# Seed PyTorch's global RNG before anything random happens (weight
# initialisation below, DataLoader shuffling during training) so that a
# Restart & Run All starts from the same state each time.
# NOTE(review): bit-exact reproducibility on CUDA may need additional
# determinism settings — confirm if that matters here.
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Build the vocabulary and the sliding-window samples from the corpus file.
dataset = TextDataset(FILE_PATH, SEQ_LENGTH, MIN_WORD_FREQ)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# The embedding table and the output layer are both sized by the vocabulary.
vocab_size = len(dataset.word_to_ix)
print(f"Vocabulary size: {vocab_size}")

model = LanguageModel(vocab_size, EMBEDDING_DIM, HIDDEN_DIM)
# Inputs are integer token ids, hence the explicit long dtype for the dummy batch.
summary(model, input_size=(BATCH_SIZE, SEQ_LENGTH), dtypes=[torch.long])

Using device: cpu
Vocabulary size: 1065
Sample data indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Vocabulary size: 1065


Layer (type:depth-idx)                   Output Shape              Param #
LanguageModel                            [32, 5, 1065]             --
├─Embedding: 1-1                         [32, 5, 100]              106,500
├─LSTM: 1-2                              [32, 5, 256]              366,592
├─Linear: 1-3                            [32, 5, 1065]             273,705
Total params: 746,797
Trainable params: 746,797
Non-trainable params: 0
Total mult-adds (M): 70.82
Input size (MB): 0.00
Forward/backward pass size (MB): 1.82
Params size (MB): 2.99
Estimated Total Size (MB): 4.81

In [8]:
# Run the full training loop, then persist only the learned weights
# (the state_dict) to disk.
trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    device=device,
)
torch.save(obj=trained_model.state_dict(), f="autocomplete_model.pth")

Epoch 1/100, Loss: 6.4788
Epoch 2/100, Loss: 4.8638
Epoch 3/100, Loss: 3.1513
Epoch 4/100, Loss: 1.9733
Epoch 5/100, Loss: 1.3504
Epoch 6/100, Loss: 1.0215
Epoch 7/100, Loss: 0.8272
Epoch 8/100, Loss: 0.6981
Epoch 9/100, Loss: 0.6071
Epoch 10/100, Loss: 0.5402
Epoch 11/100, Loss: 0.4902
Epoch 12/100, Loss: 0.4566
Epoch 13/100, Loss: 0.4311
Epoch 14/100, Loss: 0.4118
Epoch 15/100, Loss: 0.3984
Epoch 16/100, Loss: 0.3880
Epoch 17/100, Loss: 0.3814
Epoch 18/100, Loss: 0.3754
Epoch 19/100, Loss: 0.3719
Epoch 20/100, Loss: 0.3653
Epoch 21/100, Loss: 0.3619
Epoch 22/100, Loss: 0.3603
Epoch 23/100, Loss: 0.3578
Epoch 24/100, Loss: 0.3584
Epoch 25/100, Loss: 0.3557
Epoch 26/100, Loss: 0.3525
Epoch 27/100, Loss: 0.3510
Epoch 28/100, Loss: 0.3503
Epoch 29/100, Loss: 0.3502
Epoch 30/100, Loss: 0.3512
Epoch 31/100, Loss: 0.3493
Epoch 32/100, Loss: 0.3482
Epoch 33/100, Loss: 0.3468
Epoch 34/100, Loss: 0.3463
Epoch 35/100, Loss: 0.3470
Epoch 36/100, Loss: 0.3455
Epoch 37/100, Loss: 0.3454
Epoch 38/1

In [20]:
# Prompts to try the trained model on.
test_inputs = ["nasıl"]

# Make sure the model lives on the same device the prompts are sent to.
trained_model.to(device)
for input_text in test_inputs:
    suggestions = generate_autocomplete(
        model=trained_model,
        input_text=input_text,
        word_to_ix=dataset.word_to_ix,
        ix_to_word=dataset.ix_to_word,
        device=device,
    )
    print(f"Input: {input_text}")
    print(f"Suggestions: {', '.join(suggestions)}\n")

Input: nasıl
Suggestions: gidebilirim, bir, yardımcı

