In [11]:
import string

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize

In [12]:
# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
# Load dataset
def load_data(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read().lower()
    return text

In [14]:
# Function to map POS tags to WordNet format
def get_wordnet_pos(nltk_pos):
    if nltk_pos.startswith('J'):
        return wordnet.ADJ
    elif nltk_pos.startswith('V'):
        return wordnet.VERB
    elif nltk_pos.startswith('N'):
        return wordnet.NOUN
    elif nltk_pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default noun


In [15]:
# Tokenization and Sequence Generation
def preprocess_text(text, sequence_length=5):


    # Tokenization
    tokenized_words = word_tokenize(text)

    # Initialize Lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Get Stopwords and Punctuation
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # Apply stopwords, punctuation removal and lemmatization
    words = [
        lemmatizer.lemmatize(word.lower(), get_wordnet_pos(pos))
        for word, pos in pos_tag(tokenized_words)
        if word not in stop_words and word not in punctuation
    ]
    
    word_counts = Counter(words)
    vocab = {word: i+1 for i, (word, _) in enumerate(word_counts.most_common())}
    vocab['<UNK>'] = len(vocab) + 1
    vocab_size = len(vocab)

    sequences = []
    for i in range(len(words) - sequence_length):
        seq = words[i: i + sequence_length]
        sequences.append([ vocab.get(word, vocab['<UNK>']) for word in seq])

    return sequences, vocab, vocab_size



In [16]:
# custom Dataset class

class TextDataset(Dataset):
    def __init__(self, sequences):
        self.x = torch.tensor([seq[:-1] for seq in sequences], dtype=torch.long)
        self.y = torch.tensor([seq[-1] for seq in sequences], dtype=torch.long)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]


In [17]:
# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1, :])
        return x

In [None]:
# Training Function
def train_model(model, dataloader, vocab_size, epochs=10, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        for x_batch, y_batch in dataloader:
            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")


In [19]:
# Prediction Function
def predict(model, text, vocab, sequence_length=5):
    model.eval() #
    words = word_tokenize(text.lower())
    sequence = [vocab.get(word, vocab['<UNK>']) for word in words][-sequence_length:]
    input_tensor = torch.tensor(sequence, dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        output = model(input_tensor)
        predicted_idx = torch.argmax(output, dim=1).item()
    
    inv_vocab = {idx: word for word, idx in vocab.items()}
    return inv_vocab.get(predicted_idx, '<UNK>')

In [20]:
# Main Execution
filepath = r"C:\Users\abdul.muhmin\Downloads\Sherlock_Holmes.txt"
text = load_data(filepath)
sequences, vocab, vocab_size = preprocess_text(text)
dataset = TextDataset(sequences)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


In [21]:
# Initialize and train model
model = LSTMModel(vocab_size)
train_model(model, dataloader, vocab_size)

Epoch 1, Loss: 7.2798
Epoch 2, Loss: 6.5072
Epoch 3, Loss: 5.5631
Epoch 4, Loss: 4.5442
Epoch 5, Loss: 3.6782
Epoch 6, Loss: 2.9872
Epoch 7, Loss: 2.4161
Epoch 8, Loss: 1.9297
Epoch 9, Loss: 1.5149
Epoch 10, Loss: 1.1642


In [25]:

# Test Prediction
input_text = "The Man with"
predicted_word = predict(model, input_text, vocab)
print(f"Predicted completion: {predicted_word}")


IndexError: index out of range in self

In [26]:
import string
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Load dataset
def load_data(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read().lower()
    return text

# Function to map POS tags to WordNet format
def get_wordnet_pos(nltk_pos):
    if nltk_pos.startswith('J'):
        return wordnet.ADJ
    elif nltk_pos.startswith('V'):
        return wordnet.VERB
    elif nltk_pos.startswith('N'):
        return wordnet.NOUN
    elif nltk_pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default noun

# Tokenization and Sequence Generation
def preprocess_text(text, sequence_length=5):
    # Tokenization
    tokenized_words = word_tokenize(text)

    # Initialize Lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Get Stopwords and Punctuation
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # Apply stopwords, punctuation removal, and lemmatization
    words = [
        lemmatizer.lemmatize(word.lower(), get_wordnet_pos(pos))
        for word, pos in pos_tag(tokenized_words)
        if word not in stop_words and word not in punctuation
    ]
    
    word_counts = Counter(words)
    
    # Fix: Ensure vocabulary indexing starts from 0
    vocab = {word: i for i, (word, _) in enumerate(word_counts.most_common())}
    vocab['<UNK>'] = len(vocab)  # Last index
    vocab_size = len(vocab)

    sequences = []
    for i in range(len(words) - sequence_length):
        seq = words[i: i + sequence_length]
        sequences.append([vocab.get(word, vocab['<UNK>']) for word in seq])

    return sequences, vocab, vocab_size

# Custom Dataset class
class TextDataset(Dataset):
    def __init__(self, sequences):
        self.x = torch.tensor([seq[:-1] for seq in sequences], dtype=torch.long)
        self.y = torch.tensor([seq[-1] for seq in sequences], dtype=torch.long)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1, :])  # Take last output of LSTM
        return x

# Training Function
def train_model(model, dataloader, vocab_size, epochs=10, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        for x_batch, y_batch in dataloader:
            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")

# Prediction Function
def predict(model, text, vocab, sequence_length=5):
    model.eval()

    # Tokenize input text
    words = word_tokenize(text.lower())

    # Remove words not in vocab
    filtered_words = [word for word in words if word in vocab]
    if not filtered_words:
        print("Error: None of the words in input_text exist in the vocabulary!")
        return "<UNK>"

    # Convert words to indices
    sequence = [vocab.get(word, vocab['<UNK>']) for word in filtered_words][-sequence_length:]
    
    # Debugging outputs
    print("Tokenized Input:", words)
    print("Filtered Words:", filtered_words)
    print("Mapped Sequence:", sequence)
    
    input_tensor = torch.tensor(sequence, dtype=torch.long).unsqueeze(0)
    print("Input Tensor Shape:", input_tensor.shape)  # Ensure correct shape

    with torch.no_grad():
        output = model(input_tensor)
        predicted_idx = torch.argmax(output, dim=1).item()
    
    inv_vocab = {idx: word for word, idx in vocab.items()}
    return inv_vocab.get(predicted_idx, '<UNK>')

# Main Execution
filepath = r"C:\Users\abdul.muhmin\Downloads\Sherlock_Holmes.txt"
text = load_data(filepath)
sequences, vocab, vocab_size = preprocess_text(text)
dataset = TextDataset(sequences)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Initialize and train model
model = LSTMModel(vocab_size)
train_model(model, dataloader, vocab_size)

# Test Prediction
input_text = "The Man with"
predicted_word = predict(model, input_text, vocab)
print(f"Predicted completion: {predicted_word}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Epoch 1, Loss: 7.2657
Epoch 2, Loss: 6.5086
Epoch 3, Loss: 5.5908
Epoch 4, Loss: 4.5808
Epoch 5, Loss: 3.7023
Epoch 6, Loss: 3.0016
Epoch 7, Loss: 2.4280
Epoch 8, Loss: 1.9403
Epoch 9, Loss: 1.5229
Epoch 10, Loss: 1.1711
Tokenized Input: ['the', 'man', 'with']
Filtered Words: ['man']
Mapped Sequence: [10]
Input Tensor Shape: torch.Size([1, 1])
Predicted completion: 's


In [38]:
# Test Prediction
input_text = "THE ADVENTURES OF SHERLOCK "
predicted_word = predict(model, input_text, vocab)
print(f"Predicted completion: {predicted_word}")


Tokenized Input: ['the', 'adventures', 'of', 'sherlock']
Filtered Words: ['sherlock']
Mapped Sequence: [62]
Input Tensor Shape: torch.Size([1, 1])
Predicted completion: holmes
