In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils 
import tensorflow as tf
import numpy as np 
import pandas as pd
import numpy as np

2023-09-10 01:35:06.403601: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-10 01:35:06.432483: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-10 01:35:06.634283: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-10 01:35:06.635681: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
df = pd.read_csv('Shakespeare_data.csv', nrows = 50000)
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [4]:
import csv

corpus = []

with open('Shakespeare_data.csv') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)        # to pass first row,header
    for row in reader:
        corpus.append(row[5])

corpus = corpus[:50000]       
print(len(corpus))
print(corpus[:3])


50000
['ACT I', 'SCENE I. London. The palace.', 'Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others']


In [5]:
import string

def text_cleaner(text):
    text = "".join(car for car in text if car not in string.punctuation).lower()
    text = text.encode("utf8").decode("ascii",'ignore')
    return text

corpus = [text_cleaner(line) for line in corpus[:50000]]


In [6]:
# Tokenization is the process of splitting up a text into a list of individual words, or tokens.
# corpus is too big if you try with all data, you can see this message
corpus = corpus[:3000]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
total_words = len(word_index) + 1
total_words

3759

In [7]:
# create input sequences using list of tokens
input_sequences =[]

for sentence in corpus:
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
        

In [8]:
# pad sequences 
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, 
                                         maxlen=max_sequence_len, 
                                         padding='pre'))

In [9]:
# create predictors and label
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
# create one-hot encoding of the labels
label = tensorflow.keras.utils.to_categorical(label, num_classes=total_words)

In [10]:
print(label[0])
print(label[0].shape)

[0. 0. 0. ... 0. 0. 0.]
(3759,)


In [11]:
model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.3))
model.add(Dense(total_words, activation='relu'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 29, 10)            37590     
                                                                 
 bidirectional (Bidirection  (None, 200)               88800     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 dense (Dense)               (None, 3759)              755559    
                                                                 
Total params: 881949 (3.36 MB)
Trainable params: 881949 (3.36 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [12]:
history = model.fit(predictors, label, epochs=5,  verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
seed_text = "help me in this"
next_words = 10

  
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    c = np.argmax(predicted, axis = 1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == c:
            output_word = word
            break
    seed_text += " " + output_word
    if len(seed_text) % 10 == 0 :
        seed_text+= '\n'
print(seed_text)

help me in this the the the the the the the the the the


In [15]:
seed_text = "Love all, trust a few"
next_words = 10

  
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    c = np.argmax(predicted, axis = 1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == c:
            output_word = word
            break
    seed_text += " " + output_word
    if len(seed_text) % 10 == 0 :
        seed_text+= '\n'
print(seed_text)

Love all, trust a few the the the the the the the the the the


In [28]:
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters
from tensorflow.keras.callbacks import EarlyStopping

def build_model(hp):
    model = Sequential()
    model.add(Embedding(total_words, hp.Int('embedding_dim', min_value=32, max_value=128, step=32), input_length=max_sequence_len-1))
    model.add(LSTM(units=hp.Int('lstm_units', min_value=32, max_value=128, step=32), dropout=hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dropout(hp.Float('Dropout_rate',min_value=0,max_value=0.5,step=0.1)))
    model.add(Dense(total_words, activation=hp.Choice('dense_activation',values=['relu','sigmoid','softmax']),kernel_initializer='he_normal'))
    model.compile(optimizer='adam', loss=hp.Choice('loss_fn',values=['binary_crossentropy','categorical_crossentropy']), metrics=['accuracy'])
    return model

# Initialize Keras Tuner RandomSearch
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    directory='tuner_dir',
    project_name='lstm_sentiment'
)

# Perform hyperparameter search
tuner.search(predictors, label, validation_split=0.2, epochs=3)

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best Hyperparameters:")
print(best_hps)

# Build the final model with the best hyperparameters
model = tuner.hypermodel.build(best_hps)
model.fit(predictors, label, validation_split=0.2, epochs=5, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
model.summary()

INFO:tensorflow:Reloading Tuner from tuner_dir/lstm_sentiment/tuner0.json
INFO:tensorflow:Oracle triggered exit
Best Hyperparameters:
<keras_tuner.engine.hyperparameters.hyperparameters.HyperParameters object at 0x7f0ae0236710>
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 29, 32)            120288    
                                                                 
 lstm_6 (LSTM)               (None, 128)               82432     
                                                                 
 dropout_6 (Dropout)         (None, 128)               0         
                                                                 
 dense_6 (Dense)             (None, 3759)              484911    
                                                                 
Total params: 687631 (2.62 MB)
Trainable params: 6

In [29]:
# Print the chosen activation function and loss function
best_activation = best_hps.get('dense_activation')
best_loss_function = best_hps.get('loss_fn')
best_em_dim = best_hps.get('embedding_dim')
best_units = best_hps.get('lstm_units')
best_dropout = best_hps.get('Dropout_rate')
print("Chosen embedding dimension:", best_em_dim)
print("Chosen number of LSTM units:", best_units)
print("Chosen dropout rate:", best_dropout)
print("Chosen Activation Function:", best_activation)
print("Chosen Loss Function:", best_loss_function)

Chosen embedding dimension: 32
Chosen number of LSTM units: 128
Chosen dropout rate: 0.0
Chosen Activation Function: softmax
Chosen Loss Function: categorical_crossentropy


In [25]:
model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.3))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 29, 10)            37590     
                                                                 
 bidirectional_3 (Bidirecti  (None, 200)               88800     
 onal)                                                           
                                                                 
 dropout_5 (Dropout)         (None, 200)               0         
                                                                 
 dense_5 (Dense)             (None, 3759)              755559    
                                                                 
Total params: 881949 (3.36 MB)
Trainable params: 881949 (3.36 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [26]:
history = model.fit(predictors, label, epochs=5,  verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
seed_text = "help me in this"
next_words = 10

  
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    c = np.argmax(predicted, axis = 1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == c:
            output_word = word
            break
    seed_text += " " + output_word
    if len(seed_text) % 10 == 0 :
        seed_text+= '\n'
print(seed_text)

In [None]:
seed_text = "Love all, trust a few"
next_words = 10

  
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    c = np.argmax(predicted, axis = 1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == c:
            output_word = word
            break
    seed_text += " " + output_word
    if len(seed_text) % 10 == 0 :
        seed_text+= '\n'
print(seed_text)