In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): none of the cells visible in this notebook read from or write
# to the mount (the corpus comes from nltk.download) — confirm it is needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install nltk tensorflow




In [None]:
import nltk
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Bidirectional
from tensorflow.keras.utils import to_categorical


In [None]:
# Download the NLTK resources this notebook relies on:
#   - 'treebank': the POS-tagged corpus loaded below via nltk.corpus.treebank
#   - 'universal_tagset': needed for tagged_sents(tagset='universal')
nltk.download('treebank')
nltk.download('universal_tagset')  # simplified POS tags


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [None]:
from nltk.corpus import treebank

# Keep only the first 2000 treebank sentences, tagged with the universal
# (simplified) tagset, so training stays small.
tagged_sentences = treebank.tagged_sents(tagset='universal')[:2000]

# Each sentence is a list of (word, tag) pairs; pull the two columns apart
# into parallel lists of token lists.
sentences = [[word for word, _ in sent] for sent in tagged_sentences]
tags = [[tag for _, tag in sent] for sent in tagged_sentences]

# Sanity check: first sentence and its tags should line up one-to-one.
print(sentences[0])
print(tags[0])


['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
['NOUN', 'NOUN', '.', 'NUM', 'NOUN', 'ADJ', '.', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NUM', '.']


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Word tokenizer.
# filters='' keeps punctuation tokens (',', '.') that are tagged in the corpus.
# oov_token reserves an index for words never seen during fitting. Without it,
# texts_to_sequences() silently DROPS unknown words, which shortens the
# sequence and shifts every following predicted tag onto the wrong word.
# NOTE: every training word is in the vocabulary, so the OOV embedding itself
# is never trained; unknown words are tagged from their context only.
word_tokenizer = Tokenizer(lower=True, filters='', oov_token='<OOV>')
word_tokenizer.fit_on_texts(sentences)
word_index = word_tokenizer.word_index
vocab_size = len(word_index) + 1  # +1 for the padding index 0

X_seq = word_tokenizer.texts_to_sequences(sentences)

# Tag tokenizer (case-sensitive; tags are already normalized labels).
tag_tokenizer = Tokenizer(lower=False, filters='')
tag_tokenizer.fit_on_texts(tags)
tag_index = tag_tokenizer.word_index
num_tags = len(tag_index) + 1  # +1 for the padding class 0

y_seq = tag_tokenizer.texts_to_sequences(tags)


In [None]:
# Every sequence is padded at the end up to the length of the longest sentence.
max_len = max(map(len, X_seq))
X = pad_sequences(X_seq, maxlen=max_len, padding='post')

# Tag ids get the same padding and are then one-hot encoded per timestep,
# giving y the shape (num_sentences, max_len, num_tags).
y = to_categorical(
    pad_sequences(y_seq, maxlen=max_len, padding='post'),
    num_classes=num_tags,
)


In [None]:
embedding_dim = 64
lstm_units = 64

# BiLSTM tagger: token ids -> embeddings -> bidirectional LSTM -> softmax over
# the tag set at every timestep.
# The input tensor is named `inputs` (not `input`) so the Python builtin
# input() is not shadowed for the rest of the notebook session.
inputs = Input(shape=(max_len,))
# `input_length` is intentionally not passed: it is deprecated in Keras 3 and
# the sequence length is already fixed by the Input shape above.
x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)
x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
output = TimeDistributed(Dense(num_tags, activation='softmax'))(x)

# NOTE(review): padded timesteps (id 0) are not masked, so the loss and the
# reported accuracy also count padding positions, which inflates accuracy.
# Consider Embedding(..., mask_zero=True) after verifying mask propagation
# through TimeDistributed on the installed Keras version.
model = Model(inputs, output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [None]:
# Train for 5 epochs with mini-batches of 32 sentences; 10% of the samples are
# held out for validation (per the Keras docs, validation_split takes the LAST
# fraction of X/y, before shuffling).
model.fit(X, y, batch_size=32, epochs=5, validation_split=0.1)


Epoch 1/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 1s/step - accuracy: 0.8391 - loss: 0.7067 - val_accuracy: 0.9236 - val_loss: 0.2470
Epoch 2/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 1s/step - accuracy: 0.9342 - loss: 0.2042 - val_accuracy: 0.9282 - val_loss: 0.2154
Epoch 3/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 1s/step - accuracy: 0.9437 - loss: 0.1719 - val_accuracy: 0.9552 - val_loss: 0.1546
Epoch 4/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 1s/step - accuracy: 0.9646 - loss: 0.1227 - val_accuracy: 0.9743 - val_loss: 0.0988
Epoch 5/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 1s/step - accuracy: 0.9789 - loss: 0.0768 - val_accuracy: 0.9831 - val_loss: 0.0663


<keras.src.callbacks.history.History at 0x7f9c5f135820>

In [None]:
from nltk.corpus import treebank

# Plain-word view of the first 500 treebank sentences (all of them are part of
# the 2000 sentences used for training).
# The slice is consumed directly inside the comprehension instead of being
# assigned to `tagged_sentences`, so the 2000-sentence training variable of
# that name is not silently replaced by a shorter list.
train_sentences = [
    [word for word, _ in sent]
    for sent in treebank.tagged_sents(tagset='universal')[:500]
]

# Pick one sentence
test_sentence = train_sentences[10]  # 11th sentence in training
print("Test sentence:", test_sentence)

Test sentence: ['Neither', 'Lorillard', 'nor', 'the', 'researchers', 'who', '*T*-3', 'studied', 'the', 'workers', 'were', 'aware', 'of', 'any', 'research', 'on', 'smokers', 'of', 'the', 'Kent', 'cigarettes', '.']


In [None]:
# Encode the sentence exactly like the training data: word ids, then padding
# at the end up to max_len.
test_seq = word_tokenizer.texts_to_sequences([test_sentence])
test_seq = pad_sequences(test_seq, maxlen=max_len, padding='post')

pred = model.predict(test_seq)

# Decode in place instead of calling decode_tags(): that helper is only
# defined in a later cell, so calling it here raises NameError on a fresh
# "Restart & Run All".
# Class id 0 is the padding class and has no entry in tag_index -> 'O'.
id_to_tag = {idx: tag for tag, idx in tag_index.items()}
pred_ids = np.argmax(pred[0], axis=-1).tolist()
pred_tags = [id_to_tag.get(idx, 'O') for idx in pred_ids]

print("Sentence:", test_sentence)
print("Predicted POS tags:", pred_tags[:len(test_sentence)])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Sentence: ['Neither', 'Lorillard', 'nor', 'the', 'researchers', 'who', '*T*-3', 'studied', 'the', 'workers', 'were', 'aware', 'of', 'any', 'research', 'on', 'smokers', 'of', 'the', 'Kent', 'cigarettes', '.']
Predicted POS tags: ['NOUN', 'NOUN', 'VERB', 'DET', 'NOUN', 'VERB', 'VERB', 'VERB', 'DET', 'NOUN', 'VERB', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', '.']


In [None]:
def decode_tags(seq, tag_to_id=None):
    """Convert per-timestep class scores into tag strings.

    Args:
        seq: iterable of 1-D score vectors (one per timestep), e.g. one
            sentence from ``model.predict(...)``; position ``i`` of a vector
            is the score of class id ``i``.
        tag_to_id: optional mapping ``tag -> class id`` (ids start at 1).
            Defaults to the notebook-level ``tag_index``.

    Returns:
        A list with one tag per timestep. ``'O'`` is returned when the vector
        does not sum to a positive value, or when the winning class id has no
        tag (class 0 is the padding class).
    """
    if tag_to_id is None:
        tag_to_id = tag_index

    # Invert the mapping once instead of rebuilding a key list per timestep.
    id_to_tag = {idx: tag for tag, idx in tag_to_id.items()}

    decoded = []
    for vec in seq:
        if not np.sum(vec) > 0:
            decoded.append('O')
            continue
        # A direct id lookup avoids the old `[argmax - 1]` indexing, which
        # wrapped to the LAST tag whenever the padding class (id 0) won.
        decoded.append(id_to_tag.get(int(np.argmax(vec)), 'O'))
    return decoded

# Tag a short hand-written sentence with the trained model.
test_sentence = ["She", "joins", "the","class"]

# Same preprocessing as for training: word ids, then padding at the end.
test_seq = pad_sequences(
    word_tokenizer.texts_to_sequences([test_sentence]),
    maxlen=max_len,
    padding='post',
)

pred = model.predict(test_seq)
pred_tags = decode_tags(pred[0])

# Only the first len(test_sentence) positions are shown; the rest is padding.
print("Sentence:", test_sentence)
print("Predicted POS tags:", pred_tags[:len(test_sentence)])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
Sentence: ['She', 'joins', 'the', 'class']
Predicted POS tags: ['PRON', 'VERB', 'DET', 'NOUN']
