In [42]:
import string

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize

In [43]:
# Download necessary NLTK datasets.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# recent NLTK releases look up for word_tokenize / pos_tag; the legacy names
# are kept so the notebook still runs on older NLTK installations.
NLTK_RESOURCES = [
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
]

# A list (not a generator) is built on purpose so every download is attempted
# even if an earlier one fails; the cell displays True only if all succeeded.
all([nltk.download(resource) for resource in NLTK_RESOURCES])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abdul.muhmin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [44]:
# Load dataset
def load_data(filepath):
    """Read a UTF-8 text file and return its full contents in lower case.

    Args:
        filepath: Path of the text file to read.

    Returns:
        str: The entire file content, lower-cased.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.lower()

In [45]:
# Function to map POS tags to WordNet format
def get_wordnet_pos(nltk_pos):
    """Return the WordNet POS constant matching the start of ``nltk_pos``.

    The tag is matched on its leading letter: 'J' -> ``wordnet.ADJ``,
    'V' -> ``wordnet.VERB``, 'N' -> ``wordnet.NOUN``, 'R' -> ``wordnet.ADV``.
    Any other tag (including an empty string) falls back to ``wordnet.NOUN``.

    Args:
        nltk_pos: POS tag string, e.g. the tag half of a ``pos_tag`` pair.

    Returns:
        The corresponding attribute of ``wordnet``.
    """
    # Ordered (prefix, attribute-name) pairs. The attribute is looked up on
    # ``wordnet`` only after the prefix test, mirroring an if/elif chain.
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    attr_name = next(
        (attr for prefix, attr in prefix_to_attr if nltk_pos.startswith(prefix)),
        'NOUN',  # Default noun
    )
    return getattr(wordnet, attr_name)


In [46]:
# Tokenization and Sequence Generation
def preprocess_text(text, sequence_length=5):
    """Tokenize, clean and lemmatize ``text``, then build index sequences.

    Steps:
      1. Tokenize with ``word_tokenize`` and POS-tag the full token list
         (tagging happens before filtering so the tagger sees whole sentences).
      2. Drop English stop words (compared case-insensitively) and tokens made
         up entirely of punctuation characters.
      3. Lemmatize the lower-cased survivors using the POS mapped through
         ``get_wordnet_pos``.
      4. Build a frequency-ordered vocabulary with ids starting at 1 and a
         trailing ``'<UNK>'`` entry.
      5. Slide a window of ``sequence_length`` words over the cleaned text and
         encode each window as a list of vocabulary ids.

    Args:
        text: Raw input text.
        sequence_length: Number of words per sequence; must be >= 1.

    Returns:
        tuple: ``(sequences, vocab, vocab_size)`` where ``sequences`` is a list
        of id lists (each of length ``sequence_length``; empty if the cleaned
        text is shorter than that), ``vocab`` maps word -> id, and
        ``vocab_size`` is ``len(vocab)`` including ``'<UNK>'``.

        NOTE(review): ids run from 1 to ``vocab_size`` inclusive, so an
        embedding table indexed directly by these ids needs
        ``vocab_size + 1`` rows - confirm against the model definition.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    # Tokenization
    tokenized_words = word_tokenize(text)

    # Initialize Lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Get Stopwords and Punctuation
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # Apply stopwords, punctuation removal and lemmatization
    words = []
    for token, pos in pos_tag(tokenized_words):
        lowered = token.lower()
        # The stop-word list is lower-case, so compare the lower-cased token;
        # otherwise "The" would slip through on text that was not lower-cased.
        if lowered in stop_words:
            continue
        # The tokenizer can emit multi-character punctuation tokens
        # (e.g. "``", "''", "...", "--"), so test every character rather than
        # looking the whole token up in the single-character set.
        if all(char in punctuation for char in token):
            continue
        words.append(lemmatizer.lemmatize(lowered, get_wordnet_pos(pos)))

    # Most frequent word gets id 1; id 0 is left unused.
    word_counts = Counter(words)
    vocab = {
        word: index
        for index, (word, _) in enumerate(word_counts.most_common(), start=1)
    }
    vocab['<UNK>'] = len(vocab) + 1
    vocab_size = len(vocab)

    unk_index = vocab['<UNK>']
    # "+ 1" so the final full window (ending on the last word) is included.
    sequences = [
        [vocab.get(word, unk_index) for word in words[start:start + sequence_length]]
        for start in range(len(words) - sequence_length + 1)
    ]

    return sequences, vocab, vocab_size

