**Text Generation Project with SpaCy**
# I have used a dataset the four chapters of moby dick novel for text generation.


In [None]:
import spacy
nlp = spacy.load('en_core_web_sm',disable=['parser','tagger','ner'])

In [2]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    return str_text

In [4]:
read_file('moby_dick_four_chapters.txt')

'Call me Ishmael.  Some years ago--never mind how long\nprecisely--having little or no money in my purse, and nothing\nparticular to interest me on shore, I thought I would sail about a\nlittle and see the watery part of the world.  It is a way I have of\ndriving off the spleen and regulating the circulation.  Whenever I\nfind myself growing grim about the mouth; whenever it is a damp,\ndrizzly November in my soul; whenever I find myself involuntarily\npausing before coffin warehouses, and bringing up the rear of every\nfuneral I meet; and especially whenever my hypos get such an upper\nhand of me, that it requires a strong moral principle to prevent me\nfrom deliberately stepping into the street, and methodically knocking\npeople\'s hats off--then, I account it high time to get to sea as soon\nas I can.  This is my substitute for pistol and ball.  With a\nphilosophical flourish Cato throws himself upon his sword; I quietly\ntake to the ship.  There is nothing surprising in this.  If t

In [10]:
nlp.max_length = 1198623

In [11]:
def sep_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [12]:
d = read_file('moby_dick_four_chapters.txt')

In [13]:
tokens = sep_punc(d)



In [18]:
train_len = 25 +1 
text_sequences = []
for i in range(train_len,len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)

In [117]:
len(tokens)

11338

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [25]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
314 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2713 : interest
14 : me
24 : on


In [26]:
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2718

In [27]:
import numpy as np
sequences = np.array(sequences)

In [28]:
sequences

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

In [29]:
from keras.utils import to_categorical

In [30]:
X = sequences[:,:-1]
y = sequences[:,-1]

In [32]:
y = to_categorical(y,num_classes=vocabulary_size+1)

In [33]:
X.shape

(11312, 25)

In [47]:
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Dense,LSTM,Embedding,Input

def create_model(vocabulary_size, seq_len, embedding_dim=50):
    # Define input layer with the shape (seq_len,)
    input_layer = Input(shape=(seq_len,))
    
    # Embedding layer
    embedding_layer = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=seq_len)(input_layer)
       
    # First LSTM layer with return_sequences=True
    lstm_out_1 = LSTM(50, return_sequences=True)(embedding_layer)
    
    # Second LSTM layer
    lstm_out_2 = LSTM(50)(lstm_out_1)
    
    # Dense layer with ReLU activation
    dense_out = Dense(50, activation='relu')(lstm_out_2)
    
    # Output layer with softmax activation
    output_layer = Dense(vocabulary_size, activation='softmax')(dense_out)
    
    # Define the model
    model = Model(inputs=input_layer, outputs=output_layer)
    
    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # Print model summary
    model.summary()
    
    return model

In [49]:
seq_len = X.shape[1]

In [51]:
model = create_model(vocabulary_size+1,seq_len)



In [53]:
from pickle import dump,load

In [55]:
model.fit(X,y,batch_size=128,epochs=350,verbose=1)

Epoch 1/350
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.0444 - loss: 7.4343
Epoch 2/350
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.0537 - loss: 6.3246
Epoch 3/350
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.0523 - loss: 6.3058
Epoch 4/350
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.0506 - loss: 6.1935
Epoch 5/350
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.0515 - loss: 6.1457
Epoch 6/350
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.0518 - loss: 6.0284
Epoch 7/350
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.0567 - loss: 5.9214
Epoch 8/350
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.0625 - loss: 5.8628
Epoch 9/350
[1m89/89[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x1a76a9850a0>

In [57]:
model.save('my_mobydick_model.h5')



In [59]:
dump(tokenizer,open('my_simpletokenizer','wb'))

In [63]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    output_text = []
    input_text = seed_text
    for _ in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        pred_probs = model.predict(pad_encoded, verbose=0)[0]
        
        # Get the index of the word with the highest probability
        pred_word_ind = np.argmax(pred_probs)
        
        # Convert index to word
        pred_word = tokenizer.index_word.get(pred_word_ind, '')  # Handle out-of-vocabulary cases
        
        if pred_word:
            input_text += ' ' + pred_word
            output_text.append(pred_word)
        else:
            break  # Stop generation if no valid word is found
        
    return ' '.join(output_text)


In [109]:
import random
random_pick = random.randint(0,len(text_sequences))

In [135]:
random_seed_text = text_sequences[random_pick+1]
random_seed_text

['for',
 'cheap',
 'lodgings',
 'and',
 'the',
 'best',
 'of',
 'pea',
 'coffee',
 'it',
 'was',
 'a',
 'queer',
 'sort',
 'of',
 'place',
 'a',
 'gable',
 'ended',
 'old',
 'house',
 'one',
 'side',
 'palsied',
 'as',
 'it']

In [113]:
seed_text = ' '.join(random_seed_text)

In [115]:
generate_text(model,tokenizer,seq_len,seed_text,25)

"it were a hatchet faced baby one sally together in putting i to myself not a passenger did when what i s'pose you then that"