In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from collections import Counter
import string


# Fetch the NLTK resources the code below relies on: tokenizer models, the
# English stop-word list, WordNet (for lemmatization) and the POS tagger.
# NLTK releases that ship the "punkt_tab" tokenizer look the English tagger up
# under the "averaged_perceptron_tagger_eng" id, so both tagger ids are
# requested here.
# NOTE(review): nltk.download() is expected to report (not raise on) an id the
# installed NLTK index does not know - confirm against the installed version.
for resource in (
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the one-letter POS code used for lemmatization.

    Only the first character of the tag is inspected:
    'J' -> 'a', 'V' -> 'v', 'N' -> 'n', 'R' -> 'r'.
    Every other tag (including an empty string) falls back to 'n'.

    Args:
        treebank_tag: Tag string as produced by ``pos_tag``.

    Returns:
        One of 'a', 'v', 'n' or 'r'.
    """
    first_letter_to_pos = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    # Slicing (rather than indexing) keeps the empty-tag case on the fallback.
    return first_letter_to_pos.get(treebank_tag[:1], 'n')

def load_data(filepath):
    """Read a UTF-8 text file and return its entire content in lowercase.

    Args:
        filepath: Path of the file to read.

    Returns:
        The file content as a single lowercased string.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.lower()

def preprocess_text(text, sequence_length=5):
    """Clean ``text``, build a vocabulary and cut the text into id windows.

    The text is tokenized with ``word_tokenize`` and POS-tagged; English stop
    words and single-character punctuation tokens are dropped and the
    remaining tokens are lowercased and lemmatized using their POS tag. A
    vocabulary is built from the cleaned words and a window of
    ``sequence_length`` words is slid over them one word at a time.

    Args:
        text: Corpus text.
        sequence_length: Number of word ids in each returned window.

    Returns:
        A tuple ``(sequences, vocab, vocab_size)`` where
        * ``sequences`` is a list of id lists, each ``sequence_length`` long
          (empty when there are fewer cleaned words than ``sequence_length``),
        * ``vocab`` maps word -> id; ids start at 1 in descending frequency
          order and ``'<UNK>'`` receives the highest id,
        * ``vocab_size`` is the highest id plus one, i.e. the number of rows
          an embedding table needs so that every id (including ``'<UNK>'``)
          is a valid index.
    """
    tokenized_words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    words = []
    for word, pos in pos_tag(tokenized_words):
        lowered = word.lower()
        # Test the lowercased form so capitalised stop words ("The") are also
        # removed when the caller passes text that was not lowercased.
        if lowered in stop_words or word in punctuation:
            continue
        words.append(lemmatizer.lemmatize(lowered, get_wordnet_pos(pos)))

    word_counts = Counter(words)
    vocab = {word: i + 1 for i, (word, _) in enumerate(word_counts.most_common())}
    vocab['<UNK>'] = len(vocab) + 1
    # Ids run from 1 to len(vocab) and id 0 is never assigned, so a table with
    # only len(vocab) rows has no row for the '<UNK>' id. Report one more.
    vocab_size = len(vocab) + 1

    # "+ 1" so the window that ends on the very last word is included.
    sequences = [
        [vocab[word] for word in words[start:start + sequence_length]]
        for start in range(len(words) - sequence_length + 1)
    ]

    return sequences, vocab, vocab_size

class TextDataset(Dataset):
    """Dataset of (context ids, target id) pairs built from id windows.

    For every window the last id becomes the target and all preceding ids
    become the context. Both are stored as ``torch.long`` tensors in the
    attributes ``x`` (contexts) and ``y`` (targets).
    """

    def __init__(self, sequences):
        """Split each window in ``sequences`` into context and target."""
        contexts = []
        targets = []
        for window in sequences:
            contexts.append(window[:-1])
            targets.append(window[-1])
        self.x = torch.tensor(contexts, dtype=torch.long)
        self.y = torch.tensor(targets, dtype=torch.long)

    def __len__(self):
        """Return the number of stored (context, target) pairs."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` pair stored at ``idx``."""
        context = self.x[idx]
        target = self.y[idx]
        return context, target

class LSTMModel(nn.Module):
    """Next-word model: embedding -> LSTM -> linear layer over the last step.

    Args:
        vocab_size: Number of embedding rows and number of output scores.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: inputs/outputs are laid out (batch, step, feature).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return one score per vocabulary id for each row of ids in ``x``."""
        embedded = self.embedding(x)
        step_outputs, _final_state = self.lstm(embedded)
        # Only the LSTM output at the last step feeds the output layer.
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

def train_model(model, dataloader, vocab_size, epochs=10, lr=0.001):
    """Train ``model`` with Adam and cross-entropy loss, printing the mean loss per epoch.

    Args:
        model: Module called as ``model(x_batch)``; its parameters are optimised.
        dataloader: Iterable of ``(x_batch, y_batch)`` pairs that supports ``len()``.
        vocab_size: Accepted for call compatibility; not used by the body.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate handed to Adam.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=lr)
    model.train()

    for epoch_number in range(1, epochs + 1):
        batch_losses = []
        for inputs, targets in dataloader:
            adam.zero_grad()
            scores = model(inputs)
            batch_loss = loss_fn(scores, targets)
            batch_loss.backward()
            adam.step()
            batch_losses.append(batch_loss.item())

        # Mean over batches (not over samples).
        mean_loss = sum(batch_losses) / len(dataloader)
        print(f"Epoch {epoch_number}, Loss: {mean_loss:.4f}")

def predict(model, text, vocab, sequence_length=5):
    """Predict the word that follows ``text``.

    The prompt is normalised the same way ``preprocess_text`` normalises the
    corpus (lowercase, tokenize, POS-tag, drop stop words and
    single-character punctuation, lemmatize), because ``vocab`` only
    contains words in that form. Words missing from ``vocab`` are mapped to
    the ``'<UNK>'`` id.

    Args:
        model: Trained model called with a ``(1, context_length)`` long tensor.
        text: Prompt text.
        vocab: Mapping word -> id that contains an ``'<UNK>'`` entry.
        sequence_length: Window length used when the training windows were
            built; at most ``sequence_length - 1`` trailing words are used
            as context.

    Returns:
        The predicted word, or ``'<UNK>'`` when the prompt leaves no usable
        context or the predicted id is not in ``vocab``.
    """
    model.eval()
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    words = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tag(word_tokenize(text.lower()))
        if word not in stop_words and word not in punctuation
    ]

    # Training inputs hold sequence_length - 1 ids (the last id of each window
    # is the target), so the context fed here is capped at the same length.
    context_length = max(sequence_length - 1, 1)
    unk_id = vocab['<UNK>']
    sequence = [vocab.get(word, unk_id) for word in words][-context_length:]
    if not sequence:
        # Empty prompt, or only stop words / punctuation: there is nothing to
        # condition on and a zero-length sequence is not a usable model input.
        return '<UNK>'
    input_tensor = torch.tensor(sequence, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        output = model(input_tensor)
        predicted_idx = torch.argmax(output, dim=1).item()

    inv_vocab = {idx: word for word, idx in vocab.items()}
    return inv_vocab.get(predicted_idx, '<UNK>')

# Load and process data
# NOTE(review): machine-specific absolute path - point it at the local copy of
# the corpus before running on another machine.
filepath = r"C:\Users\abdul.muhmin\Downloads\holmes.txt"
text = load_data(filepath)
sequences, vocab, vocab_size = preprocess_text(text)

# Seed PyTorch's global RNG so batch shuffling and weight initialisation are
# repeatable from one run to the next.
torch.manual_seed(42)

dataset = TextDataset(sequences)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Train the model
model = LSTMModel(vocab_size)
train_model(model, dataloader, vocab_size)

# Predict a word
input_text = "the detective found"
predicted_word = predict(model, input_text, vocab)
print(f"Predicted completion: {predicted_word}")


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from collections import Counter
import string

In [None]:
# Fetch the NLTK resources the code below relies on: tokenizer models, the
# English stop-word list, WordNet (for lemmatization) and the POS tagger.
# NLTK releases that ship the "punkt_tab" tokenizer look the English tagger up
# under the "averaged_perceptron_tagger_eng" id, so both tagger ids are
# requested here.
# NOTE(review): nltk.download() is expected to report (not raise on) an id the
# installed NLTK index does not know - confirm against the installed version.
for resource in (
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:

def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the one-letter POS code used for lemmatization.

    Only the first character of the tag is inspected:
    'J' -> 'a', 'V' -> 'v', 'N' -> 'n', 'R' -> 'r'.
    Every other tag (including an empty string) falls back to 'n'.

    Args:
        treebank_tag: Tag string as produced by ``pos_tag``.

    Returns:
        One of 'a', 'v', 'n' or 'r'.
    """
    first_letter_to_pos = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    # Slicing (rather than indexing) keeps the empty-tag case on the fallback.
    return first_letter_to_pos.get(treebank_tag[:1], 'n')

def load_data(filepath):
    """Read a UTF-8 text file and return its entire content in lowercase.

    Args:
        filepath: Path of the file to read.

    Returns:
        The file content as a single lowercased string.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.lower()


In [None]:

def preprocess_text(text, sequence_length=5):
    """Clean ``text``, build a vocabulary and cut the text into id windows.

    The text is tokenized with ``word_tokenize`` and POS-tagged; English stop
    words and single-character punctuation tokens are dropped and the
    remaining tokens are lowercased and lemmatized using their POS tag. A
    vocabulary is built from the cleaned words and a window of
    ``sequence_length`` words is slid over them one word at a time.

    Args:
        text: Corpus text.
        sequence_length: Number of word ids in each returned window.

    Returns:
        A tuple ``(sequences, vocab, vocab_size)`` where
        * ``sequences`` is a list of id lists, each ``sequence_length`` long
          (empty when there are fewer cleaned words than ``sequence_length``),
        * ``vocab`` maps word -> id; ids start at 1 in descending frequency
          order and ``'<UNK>'`` receives the highest id,
        * ``vocab_size`` is the highest id plus one, i.e. the number of rows
          an embedding table needs so that every id (including ``'<UNK>'``)
          is a valid index.
    """
    tokenized_words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    words = []
    for word, pos in pos_tag(tokenized_words):
        lowered = word.lower()
        # Test the lowercased form so capitalised stop words ("The") are also
        # removed when the caller passes text that was not lowercased.
        if lowered in stop_words or word in punctuation:
            continue
        words.append(lemmatizer.lemmatize(lowered, get_wordnet_pos(pos)))

    word_counts = Counter(words)
    vocab = {word: i + 1 for i, (word, _) in enumerate(word_counts.most_common())}
    vocab['<UNK>'] = len(vocab) + 1
    # Ids run from 1 to len(vocab) and id 0 is never assigned, so a table with
    # only len(vocab) rows has no row for the '<UNK>' id. Report one more.
    vocab_size = len(vocab) + 1

    # "+ 1" so the window that ends on the very last word is included.
    sequences = [
        [vocab[word] for word in words[start:start + sequence_length]]
        for start in range(len(words) - sequence_length + 1)
    ]

    return sequences, vocab, vocab_size

class TextDataset(Dataset):
    """Dataset of (context ids, target id) pairs built from id windows.

    For every window the last id becomes the target and all preceding ids
    become the context. Both are stored as ``torch.long`` tensors in the
    attributes ``x`` (contexts) and ``y`` (targets).
    """

    def __init__(self, sequences):
        """Split each window in ``sequences`` into context and target."""
        contexts = []
        targets = []
        for window in sequences:
            contexts.append(window[:-1])
            targets.append(window[-1])
        self.x = torch.tensor(contexts, dtype=torch.long)
        self.y = torch.tensor(targets, dtype=torch.long)

    def __len__(self):
        """Return the number of stored (context, target) pairs."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` pair stored at ``idx``."""
        context = self.x[idx]
        target = self.y[idx]
        return context, target

class LSTMModel(nn.Module):
    """Next-word model: embedding -> LSTM -> linear layer over the last step.

    Args:
        vocab_size: Number of embedding rows and number of output scores.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: inputs/outputs are laid out (batch, step, feature).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return one score per vocabulary id for each row of ids in ``x``."""
        embedded = self.embedding(x)
        step_outputs, _final_state = self.lstm(embedded)
        # Only the LSTM output at the last step feeds the output layer.
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

def train_model(model, dataloader, vocab_size, epochs=10, lr=0.001):
    """Train ``model`` with Adam and cross-entropy loss, printing the mean loss per epoch.

    Args:
        model: Module called as ``model(x_batch)``; its parameters are optimised.
        dataloader: Iterable of ``(x_batch, y_batch)`` pairs that supports ``len()``.
        vocab_size: Accepted for call compatibility; not used by the body.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate handed to Adam.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=lr)
    model.train()

    for epoch_number in range(1, epochs + 1):
        batch_losses = []
        for inputs, targets in dataloader:
            adam.zero_grad()
            scores = model(inputs)
            batch_loss = loss_fn(scores, targets)
            batch_loss.backward()
            adam.step()
            batch_losses.append(batch_loss.item())

        # Mean over batches (not over samples).
        mean_loss = sum(batch_losses) / len(dataloader)
        print(f"Epoch {epoch_number}, Loss: {mean_loss:.4f}")

def predict(model, text, vocab, sequence_length=5):
    """Predict the word that follows ``text``.

    The prompt is normalised the same way ``preprocess_text`` normalises the
    corpus (lowercase, tokenize, POS-tag, drop stop words and
    single-character punctuation, lemmatize), because ``vocab`` only
    contains words in that form. Words missing from ``vocab`` are mapped to
    the ``'<UNK>'`` id.

    Args:
        model: Trained model called with a ``(1, context_length)`` long tensor.
        text: Prompt text.
        vocab: Mapping word -> id that contains an ``'<UNK>'`` entry.
        sequence_length: Window length used when the training windows were
            built; at most ``sequence_length - 1`` trailing words are used
            as context.

    Returns:
        The predicted word, or ``'<UNK>'`` when the prompt leaves no usable
        context or the predicted id is not in ``vocab``.
    """
    model.eval()
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    words = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tag(word_tokenize(text.lower()))
        if word not in stop_words and word not in punctuation
    ]

    # Training inputs hold sequence_length - 1 ids (the last id of each window
    # is the target), so the context fed here is capped at the same length.
    context_length = max(sequence_length - 1, 1)
    unk_id = vocab['<UNK>']
    sequence = [vocab.get(word, unk_id) for word in words][-context_length:]
    if not sequence:
        # Empty prompt, or only stop words / punctuation: there is nothing to
        # condition on and a zero-length sequence is not a usable model input.
        return '<UNK>'
    input_tensor = torch.tensor(sequence, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        output = model(input_tensor)
        predicted_idx = torch.argmax(output, dim=1).item()

    inv_vocab = {idx: word for word, idx in vocab.items()}
    return inv_vocab.get(predicted_idx, '<UNK>')

In [None]:
# Read the corpus and turn it into id windows plus a vocabulary.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
filepath = r"C:\Users\abdul.muhmin\Downloads\holmes.txt"
text = load_data(filepath=filepath)
sequences, vocab, vocab_size = preprocess_text(text=text)

# Wrap the windows in a Dataset and serve them as shuffled batches of 64.
dataset = TextDataset(sequences=sequences)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

In [None]:
# Toy illustration of the sliding window used to build training sequences.
words = [
    "apple", "banana", "cherry", "date",
    "apple", "banana", "cherry", "date",
]
sequence_length = 4

# range() stops at len(words) - sequence_length, so windows start at indices
# 0..3 here; the window starting at index 4 (the last four words) is not printed.
window_starts = range(len(words) - sequence_length)
for start in window_starts:
    print(words[start: start + sequence_length])


['apple', 'banana', 'cherry', 'date']
['banana', 'cherry', 'date', 'apple']
['cherry', 'date', 'apple', 'banana']
['date', 'apple', 'banana', 'cherry']


In [None]:
import torch

# A list of four word indices turned into a 1-D integer tensor.
sequence = [0, 0, 0, 5]
tensor_sequence = torch.tensor(sequence, dtype=torch.int64)  # torch.long is torch.int64
print(tensor_sequence.shape)


torch.Size([4])


In [None]:

# Insert a leading dimension of size 1 in front of the existing dimension(s).
input_tensor = torch.unsqueeze(tensor_sequence, dim=0)
print(input_tensor.shape)


torch.Size([1, 4])


In [None]:
# Three toy scores; argmax yields the position of the largest one (2575 -> 1).
output = torch.tensor([79, 2575, 1314])
predicted_idx = output.argmax().item()
print(predicted_idx)

1
