### Question 2


In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

df = pd.read_csv('Shakespeare_data.csv')

# Combine and preprocess the 'PlayerLine' text data
player_lines = df['PlayerLine'].dropna().astype(str).tolist()
text_corpus = ' '.join(player_lines).lower()

In [9]:
df

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
...,...,...,...,...,...,...
111391,111392,A Winters Tale,38.0,5.3.180,LEONTES,"Lead us from hence, where we may leisurely"
111392,111393,A Winters Tale,38.0,5.3.181,LEONTES,Each one demand an answer to his part
111393,111394,A Winters Tale,38.0,5.3.182,LEONTES,Perform'd in this wide gap of time since first
111394,111395,A Winters Tale,38.0,5.3.183,LEONTES,We were dissever'd: hastily lead away.


In [2]:
# Download stopwords and filter them out from the corpus
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in text_corpus.split() if word not in stop_words]
filtered_text = ' '.join(filtered_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ammar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Initialize the tokenizer and limit vocabulary size
max_vocab_size = 3000
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts([filtered_text])


In [4]:
# Create input sequences
input_sequences = []
for sentence in filtered_text.split('.'):
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(token_list)):
        sequence = token_list[:i + 1]
        input_sequences.append(sequence)


In [5]:
 # Define maximum sequence length and pad sequences to uniform size
max_sequence_len = 5
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
# Ensure labels are stored as integers for sparse categorical loss
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = np.array(y, dtype=np.int32)

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build the LSTM model
model = Sequential([
    Embedding(input_dim=min(len(tokenizer.word_index) + 1, max_vocab_size), 
              output_dim=100, input_length=max_sequence_len - 1),
    LSTM(150),
    Dense(max_vocab_size, activation='softmax')
])

In [7]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#training and testing
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), verbose=1)
#saving the model for API
model.save('word_completion_model_v2.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


In [8]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 4.49%
