In [None]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder  # For label encoding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSVs are read from there.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the training CSV from the mounted Drive folder and print its (rows, columns) shape.
# The 'Sentence' and 'Tags' columns of this frame are used by the cells that follow.
df_train = pd.read_csv("/content/drive/MyDrive/POS/train.csv")
print(df_train.shape)

(5600, 2)


In [None]:
# Tokenizer for the input sentences.
# oov_token reserves an id for words that were never seen while fitting.
# Without it, texts_to_sequences() silently drops unknown words, so a sentence
# from unseen data becomes shorter than its tag sequence and the word/tag
# alignment shifts.
tokens_ip = df_train['Sentence'].values
tokenizer_ip = Tokenizer(oov_token='<unk>')
tokenizer_ip.fit_on_texts(tokens_ip)
# word -> integer id; the OOV token takes id 1 and id 0 is never assigned,
# which leaves 0 free to act as the padding value.
word_index_ip = tokenizer_ip.word_index

In [None]:
# Tokenizer for the tag strings (one string of tags per training row).
# NOTE(review): Tokenizer() is built with Keras' default settings, which
# lowercase the text and strip punctuation characters. If the tag set contains
# punctuation tags (e.g. '.' or ',') or tags ending in '$', they would be
# dropped or merged here — confirm against the data.
tokens_op = df_train['Tags'].values
tokenizer_op = Tokenizer()
tokenizer_op.fit_on_texts(tokens_op)
# tag -> integer id mapping learned from the training tags
word_index_op = tokenizer_op.word_index
# Sequence length handed to RNNModel when the model is constructed.
max_sequence_length = 100

In [None]:
# function to preprocess input data
def preprocess(df, max_len=None):
  """Turn a DataFrame of sentences and tags into padded model arrays.

  df: DataFrame with a 'Sentence' column and a 'Tags' column.
  max_len: length every sequence is padded/truncated to. When None (the
    default) the longest tokenized sentence in `df` is used, so two
    DataFrames can produce different lengths; pass an explicit value to get
    the same shape for every split.

  Returns (inputs, targets):
    inputs  - padded word ids, shape (rows, max_len)
    targets - one-hot encoded padded tag ids,
              shape (rows, max_len, len(tokenizer_op.word_index) + 1)

  Raises ValueError when `df` has no rows and `max_len` is not given.
  """
  sentences = df['Sentence'].values
  tags = df['Tags'].values

  # Map words and tags to the integer ids learned by the two fitted tokenizers
  seq_sent = tokenizer_ip.texts_to_sequences(sentences)
  seq_tags = tokenizer_op.texts_to_sequences(tags)

  if max_len is None:
    if not seq_sent:
      raise ValueError("preprocess() received no sentences, so the padding length cannot be inferred")
    max_len = max(len(seq) for seq in seq_sent)

  # pad_sequences pads (and truncates) at the front by default, filling with 0,
  # so words and tags stay aligned from the end of each sequence.
  padded_sent = pad_sequences(seq_sent, maxlen=max_len)
  padded_tags = pad_sequences(seq_tags, maxlen=max_len)

  num_classes = len(tokenizer_op.word_index) + 1
  # One-hot encode every tag id in a single call. Flattening first and
  # reshaping afterwards keeps the (rows, max_len, num_classes) layout even
  # when max_len is 1.
  one_hot_tags = to_categorical(padded_tags.ravel(), num_classes=num_classes)
  one_hot_tags = one_hot_tags.reshape(padded_tags.shape + (num_classes,))
  return padded_sent, one_hot_tags

In [None]:
# get GloVe (822 MB archive)
# -nc : skip the download when glove.6B.zip is already present, instead of
#       fetching it again and saving a second copy on every re-run
# -n  : never overwrite files that were already extracted, so unzip does not
#       stop to ask for confirmation on a re-run
!wget -nc https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
!unzip -q -n glove.6B.zip

--2025-04-23 08:43:10--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2025-04-23 08:45:49 (5.17 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [None]:
embedding_dim = 200  # vector width of the 200d GloVe file read below
embeddings_index = {}  # token -> float32 vector

# Every line of the GloVe text file is parsed as a token followed by its vector components
with open('glove.6B.200d.txt', 'r', encoding='utf-8') as glove_file:
    for row in glove_file:
        parts = row.split()
        embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word with id i.
# Row 0 and the rows of words missing from GloVe stay all-zero.
vocab_rows = len(word_index_ip) + 1
embedding_matrix = np.zeros((vocab_rows, embedding_dim))
for token, row_id in word_index_ip.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row_id] = glove_vector

In [None]:
# RNN wrapper for POS tagging

class RNNModel:
    def __init__(self, word_index, embedding_matrix, tag_tokenizer, embedding_dim=200, max_sequence_length=100,
                 rnn_units=25, mask_zero=False):
        """
        Initializes the POS RNN model.

        word_index: The word index from a tokenizer (mapping words to integer indices)
        embedding_matrix: The pre-trained GloVe embeddings matrix, with
            len(word_index) + 1 rows and embedding_dim columns
        tag_tokenizer: The tokenizer for POS tags (used for one-hot encoding the tags)
        embedding_dim: The dimension of the GloVe embeddings (default is 200)
        max_sequence_length: The maximum length of the sentences (default is 100).
            Stored on the instance only; the network itself accepts any sequence length.
        rnn_units: Number of units in the SimpleRNN layer (default is 25)
        mask_zero: Forwarded to the Embedding layer. When True, Keras treats
            index 0 as padding to be masked out (default is False)

        Raises ValueError if embedding_matrix does not have the shape
        (len(word_index) + 1, embedding_dim).
        """
        expected_shape = (len(word_index) + 1, embedding_dim)
        actual_shape = tuple(np.shape(embedding_matrix))
        if actual_shape != expected_shape:
            # Fail early with a readable message instead of deep inside Keras
            raise ValueError(
                f"embedding_matrix has shape {actual_shape}, expected {expected_shape}"
            )

        self.word_index = word_index
        self.embedding_matrix = embedding_matrix
        self.embedding_dim = embedding_dim
        self.max_sequence_length = max_sequence_length
        self.rnn_units = rnn_units
        self.mask_zero = mask_zero
        self.tag_tokenizer = tag_tokenizer
        # +1 because tokenizer ids start at 1 and index 0 is used for padding
        self.num_tags = len(tag_tokenizer.word_index) + 1
        self.model = self.build_model()

    def build_model(self):
        """
        Builds and compiles the network: a frozen pre-trained Embedding layer,
        a SimpleRNN that returns one output per timestep, and a softmax Dense
        layer with one unit per tag id. Returns the compiled Sequential model.
        """
        model = Sequential()

        # Embedding layer initialised with the pre-trained GloVe matrix.
        # `input_length` is intentionally not passed: Keras 3 deprecates the
        # argument and only emits a warning for it.
        model.add(Embedding(input_dim=len(self.word_index) + 1,
                            output_dim=self.embedding_dim,
                            weights=[self.embedding_matrix],
                            mask_zero=self.mask_zero,
                            trainable=False))  # Freezing the embeddings

        # return_sequences=True keeps one output vector per token, which is
        # what per-token tagging needs
        model.add(SimpleRNN(self.rnn_units, return_sequences=True, dropout=0.0, recurrent_dropout=0.0))

        # Output layer: a probability distribution over tag ids for every timestep
        model.add(Dense(self.num_tags, activation='softmax'))

        # Targets are one-hot encoded, hence categorical cross-entropy
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        return model

    def get_model(self):
        """
        Returns the built model.
        """
        return self.model


In [None]:
# Padded word ids and one-hot tag targets for the training split
input_train, desired_op_train = preprocess(df_train)

# Build the tagger on top of the frozen GloVe embedding matrix
rnn_model = RNNModel(
    word_index=tokenizer_ip.word_index,
    embedding_matrix=embedding_matrix,
    tag_tokenizer=tokenizer_op,
    embedding_dim=200,
    max_sequence_length=max_sequence_length,
)
pos_model = rnn_model.get_model()

# Train for 10 epochs with mini-batches of 64 sentences
pos_model.fit(input_train, desired_op_train, epochs=10, batch_size=64)



Epoch 1/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 51ms/step - accuracy: 0.6499 - loss: 1.6918
Epoch 2/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 40ms/step - accuracy: 0.8921 - loss: 0.4471
Epoch 3/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 32ms/step - accuracy: 0.9088 - loss: 0.3615
Epoch 4/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.9160 - loss: 0.3247
Epoch 5/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 48ms/step - accuracy: 0.9204 - loss: 0.2994
Epoch 6/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 32ms/step - accuracy: 0.9244 - loss: 0.2806
Epoch 7/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 33ms/step - accuracy: 0.9252 - loss: 0.2735
Epoch 8/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 41ms/step - accuracy: 0.9271 - loss: 0.2624
Epoch 9/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7bc1d0b60850>

In [None]:
# Score the trained model on the test CSV and print its loss and accuracy
df_test = pd.read_csv("/content/drive/MyDrive/POS/test.csv")
input_test, desired_op_test = preprocess(df_test)
test_metrics = pos_model.evaluate(input_test, desired_op_test)
loss, accuracy = test_metrics
print(loss, accuracy)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.8717 - loss: 0.5199
0.5098825693130493 0.8743162155151367
