In [6]:
import requests
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
import nltk
import warnings
url = "https://www.gutenberg.org/files/11/11-0.txt"
alice_text = requests.get(url).text

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return tokens

alice_tokens = preprocess_text(alice_text)
alice_pos_tags = pos_tag(alice_tokens)
print(alice_pos_tags[:20])

[('ï', 'JJ'), ('START', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('PROJECT', 'NNP'), ('GUTENBERG', 'NNP'), ('EBOOK', 'NNP'), ('ALICE', 'NNP'), ('ADVENTURES', 'NNP'), ('IN', 'NNP'), ('WONDERLAND', 'NNP'), ('Illustration', 'NNP'), ('Adventures', 'NNP'), ('Wonderland', 'NNP'), ('Lewis', 'NNP'), ('Carroll', 'NNP'), ('THE', 'NNP'), ('MILLENNIUM', 'NNP'), ('FULCRUM', 'NNP'), ('EDITION', 'NNP')]


In [7]:
import random

def create_dataset(pos_tags, n_context=3):
    X = []
    Y = []

    for i in range(n_context, len(pos_tags) - n_context):
        context = [pos_tags[j][0] for j in range(i-n_context, i+n_context+1) if j != i]
        target = pos_tags[i][1]

        X.append(context)
        Y.append(target)

    return X, Y

X, Y = create_dataset(alice_pos_tags)

train_size = int(len(X) * 0.8)
train_X, train_Y = X[:train_size], Y[:train_size]
val_X, val_Y = X[train_size:], Y[train_size:]

train_data = list(zip(train_X, train_Y))
random.shuffle(train_data)
train_X, train_Y = zip(*train_data)

In [3]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

sequence_length = 7
sequences = []
for i in range(sequence_length, len(alice_pos_tags) - sequence_length + 1):
    seq = alice_pos_tags[i-sequence_length:i+sequence_length]
    sequences.append(seq)

words = [[w[0] for w in seq] for seq in sequences]
tags = [seq[3][1] for seq in sequences]

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(words)
word_sequences = word_tokenizer.texts_to_sequences(words)

unique_tags = list(set(tags))
tag_index = {tag: i for i, tag in enumerate(unique_tags)}
tags_encoded = [tag_index[tag] for tag in tags]
tags_encoded = to_categorical(tags_encoded, num_classes=len(unique_tags))

max_seq_length = max(len(seq) for seq in word_sequences)
word_sequences_padded = pad_sequences(word_sequences, maxlen=max_seq_length, padding='post')

X_train, X_val, y_train, y_val = train_test_split(word_sequences_padded, tags_encoded, test_size=0.2, random_state=42)




In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, TimeDistributed
vocab_size = len(word_tokenizer.word_index) + 1
embed_size = 128
lstm_units = 96
num_classes = y_train.shape[1]
model = Sequential()
model.add(Embedding(vocab_size, embed_size, input_length=max_seq_length))
model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=15, batch_size=32)

loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Accuracy: {accuracy}')


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Validation Accuracy: 0.6342880725860596
