In [250]:
# Importing Libraries

from tensorflow.keras.layers import LSTM,Dense,Activation
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
import re
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.optimizers import RMSprop
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
    

In [251]:
# Importing training and testing data
train = pd.read_csv("C:/Users/roboc/Downloads/train 2.csv")
test = pd.read_csv("C:/Users/roboc/Downloads/test 2.csv")
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [252]:
train.shape

(19579, 3)

In [253]:
test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [254]:
# Concatenating both training an testing to increase the data
df= pd.concat([train,test],axis=0)

In [255]:
# Dropping unwanted columns
df.drop(columns=['author'],inplace=True)

In [256]:
max_word_length = max(len(word) for text in df['text'].values for word in text.split())
print("Maximum word length:", max_word_length)

Maximum word length: 56


In [257]:
# Creating an instance for TextVectorization and Creating an instance for TextVectorization
from tensorflow.keras.layers import TextVectorization
vectorizer = TextVectorization(max_tokens=28729, output_mode='int', output_sequence_length=56, pad_to_max_tokens=False)

In [258]:

vectorizer.adapt(df['text'])

In [259]:
vectorized_text = vectorizer(df['text'])
vectorized_text_array = vectorized_text.numpy()
vectorized_text_array.shape

(27971, 56)

In [260]:
vocabulary = vectorizer.get_vocabulary()
len(vocabulary)

28729

In [261]:
vectorized_text

<tf.Tensor: shape=(27971, 56), dtype=int64, numpy=
array([[   27,  3391,   142, ...,     0,     0,     0],
       [   12,    91,   129, ...,     0,     0,     0],
       [    8,    16,   155, ...,     0,     0,     0],
       ...,
       [   12,    26,   672, ...,     0,     0,     0],
       [   29,    27,    17, ...,     0,     0,     0],
       [  109, 14643, 20340, ...,     0,     0,     0]], dtype=int64)>

In [262]:
vectorized_text[0]

<tf.Tensor: shape=(56,), dtype=int64, numpy=
array([  27, 3391,  142, 1323,   23,   38,  285,    3, 6915,    2, 2388,
          3,   11, 4588,   17,    6,   81,  190,   47, 3964,    4,  304,
          5,    2,  262, 2218,    6,  321,   75,  137,  128,  936,    3,
          2,  307,   41, 1502, 4197,   99,    2,  445,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int64)>

In [263]:
# Putting everything into a list within a list

list_of_list = []
for sentence in vectorized_text_array:
    word_list = []
    for word in sentence:
        if word != 0:
            word_list.append(word)
    list_of_list.append(word_list)

list_of_list

[[27,
  3391,
  142,
  1323,
  23,
  38,
  285,
  3,
  6915,
  2,
  2388,
  3,
  11,
  4588,
  17,
  6,
  81,
  190,
  47,
  3964,
  4,
  304,
  5,
  2,
  262,
  2218,
  6,
  321,
  75,
  137,
  128,
  936,
  3,
  2,
  307,
  41,
  1502,
  4197,
  99,
  2,
  445],
 [12, 91, 129, 729, 5, 23, 10, 2, 6260, 81, 29, 7, 452, 2487],
 [8,
  16,
  155,
  174,
  9,
  7,
  692,
  6053,
  563,
  25,
  20,
  17,
  15,
  18165,
  120,
  2,
  517,
  4853,
  33,
  256,
  3,
  1423,
  530,
  15,
  215,
  6053,
  7316,
  14,
  35,
  203,
  3,
  2,
  791,
  354,
  528,
  3136],
 [121,
  570,
  26,
  769,
  17,
  36,
  216,
  25,
  757,
  4208,
  28,
  2,
  3027,
  3311,
  10357,
  852,
  352,
  10859,
  24,
  413,
  5894,
  4,
  14679,
  1785,
  33,
  216,
  17,
  8,
  609,
  148,
  136,
  10426,
  4,
  861],
 [1241,
  162,
  743,
  21,
  76,
  692,
  2,
  5278,
  1491,
  16,
  2268,
  19,
  7,
  5728,
  258,
  1710,
  15270,
  88,
  16,
  401,
  17,
  15,
  6058,
  1357,
  22,
  16,
  5872],
 [7,
  458,

In [264]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
y_list = []
for sentence in list_of_list:
    last_element = sentence[-1]
    y_list.append(last_element)

y_list = np.array(y_list)
y_list

array([  445,  2487,  3136, ...,   541, 17436,  1578], dtype=int64)

In [None]:
# Convert into X and Y for training and testing spllit

from tensorflow.keras.utils import to_categorical
X = pad_sequences(list_of_list, maxlen=56, padding='pre')
y = to_categorical(y_list, num_classes=28729)
X = X[:, :-1]

In [None]:
X[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,   12,   91,
        129,  729,    5,   23,   10,    2, 6260,   81,   29,    7,  452])

In [None]:
y_list

array([  445,  2487,  3136, ...,   541, 17436,  1578], dtype=int64)

In [None]:
positions_of_ones = np.argmax(y, axis=1)

print("Positions of ones in y:", positions_of_ones)

Positions of ones in y: [  445  2487  3136 ...   541 17436  1578]


In [None]:
y_list = []
for sentence in list_of_list:
    last_element = sentence[-1]
    y_list.append(last_element)

y_list = np.array(y_list)
y_list

array([  445,  2487,  3136, ...,   541, 17436,  1578], dtype=int64)

In [None]:
X.shape , y.shape

((27971, 55), (27971, 28729))

In [None]:
max_sequence_length = max(len(sequence) for sequence in X)
max_sequence_length

55

In [None]:
# Neural network creation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(28729, 14, input_length=max_sequence_length), # Assuming max_sequence_length is defined
    tf.keras.layers.LSTM(100, return_sequences=True), # First LSTM layer with return_sequences=True
    tf.keras.layers.LSTM(100), # Second LSTM layer
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(28729, activation='softmax'),
])


In [None]:
len(vocabulary)

28729

In [None]:
# Converting into training an dtesting data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test , y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.3)

In [None]:
# Creating an optimizer(Adam) and defining loss function
model.compile(
    optimizer=Adam(),  # Optimizer (e.g., Adam, SGD)
    loss='categorical_crossentropy',  # Loss function
    metrics=['accuracy']  # List of metrics to evaluate during training
)
model.fit(X_train,y_train,epochs=10)

In [None]:
# Predicting the test data and getting an almost 95% accuracy
y_pred = model.predict(X_test)
accuracy_score= accuracy_score(y_pred,y_test)
print("Test Accuracy :",accuracy_score)

Test Accuracy: 0.9410152526215443
