In [1]:
import math
import random
import numpy as np
import pandas as pd
import nltk

In [2]:
# Location of the raw corpus.
# NOTE(review): this is an absolute path on the author's machine; point
# DATA_PATH at your own copy of the text before running.
DATA_PATH = '/Users/adithyaram/Desktop/alice_in_wonderland.txt'

# Read the whole book into a single string. The encoding is passed explicitly
# so the result does not depend on the platform's default text encoding
# (assumes the file is UTF-8; plain ASCII text qualifies).
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    data = f.read()

# Quick sanity check: type, size, and the first/last 300 characters.
print("Data type:", type(data))
print("Number of letters:", len(data))
print("First 300 letters of the data")
print("-------")
display(data[0:300])
print("-------")

print("Last 300 letters of the data")
print("-------")
display(data[-300:])
print("-------")

Data type: <class 'str'>
Number of letters: 148574
First 300 letters of the data
-------


"Alice's Adventures in Wonderland\n\n                ALICE'S ADVENTURES IN WONDERLAND\n\n                          Lewis Carroll\n\n               THE MILLENNIUM FULCRUM EDITION 3.0\n\n\n\n\n                            CHAPTER I\n\n                      Down the Rabbit-Hole\n\n\n  Alice was beginning to get very tir"

-------
Last 300 letters of the data
-------


'eyes bright and eager\nwith many a strange tale, perhaps even with the dream of\nWonderland of long ago:  and how she would feel with all their\nsimple sorrows, and find a pleasure in all their simple joys,\nremembering her own child-life, and the happy summer days.\n\n                             THE END'

-------


In [3]:
def split(data):
    """Split raw text into non-empty, whitespace-trimmed lines.

    The text is cut at every newline character and each resulting line is
    treated as one "sentence". Leading and trailing whitespace of any kind
    (spaces, tabs, carriage returns) is removed, so lines that contain only
    whitespace are dropped instead of surviving as empty sentences.

    Args:
        data: the text to split (str).

    Returns:
        A list of the non-empty, stripped lines in their original order.
    """
    stripped_lines = (line.strip() for line in data.split('\n'))
    return [line for line in stripped_lines if line]

In [4]:
# test your code
# Sample text: four lines (one with a trailing space) and a final newline.
x = """I have a pen.\nI have an apple. \nA\nApple pen.\n"""
print(x)

# Last expression of the cell, so the notebook displays the returned list.
split(x)

I have a pen.
I have an apple. 
A
Apple pen.



['I have a pen.', 'I have an apple.', 'A', 'Apple pen.']

In [5]:
def tokenize(sentences):
    """Lower-case each sentence and break it into tokens.

    Args:
        sentences: iterable of strings.

    Returns:
        A list with one token list per input sentence, in input order, as
        produced by ``nltk.word_tokenize`` on the lower-cased text.
    """
    return [nltk.word_tokenize(text.lower()) for text in sentences]
    

In [6]:
# Three short example sentences to exercise tokenize(); the returned token
# lists are displayed as the cell output.
sentences = ["Sky is blue.", "Leaves are green.", "Roses are red."]
tokenize(sentences)

[['sky', 'is', 'blue', '.'],
 ['leaves', 'are', 'green', '.'],
 ['roses', 'are', 'red', '.']]

In [7]:
def tokenized_data(data):
    """Turn raw text into a list of token lists.

    Chains the two helpers: ``split`` breaks ``data`` into lines and
    ``tokenize`` converts each of those lines into tokens.

    Args:
        data: the raw text (str).

    Returns:
        Whatever ``tokenize`` returns for the lines produced by ``split``.
    """
    return tokenize(split(data))
    

In [8]:
# End-to-end check of split + tokenize on a three-line string.
x = "Sky is blue.\nLeaves are green\nRoses are red."
tokenized_data(x)

[['sky', 'is', 'blue', '.'],
 ['leaves', 'are', 'green'],
 ['roses', 'are', 'red', '.']]

In [9]:
# Split configuration, named so the values are easy to find and tune.
SEED = 9              # fixes the shuffle order so the split is repeatable
TRAIN_FRACTION = 0.8  # share of lines used for training; the rest is test data

# NOTE(review): `input` shadows Python's built-in input(); the name is kept
# because later cells read it.
input = tokenized_data(data)
random.seed(SEED)
random.shuffle(input)  # in-place shuffle of the tokenized lines

# The first TRAIN_FRACTION of the shuffled lines is the training set and the
# remainder is the test set.
train_size = int(len(input) * TRAIN_FRACTION)
train_data = input[:train_size]
test_data = input[train_size:]


In [10]:
# Report how many tokenized lines ended up in each part of the split.
split_sizes = [len(part) for part in (input, train_data, test_data)]
print("{} data is split into {} train and {} test data set".format(*split_sizes))

2726 data is split into 2180 train and 546 test data set


In [11]:
# Show one example from each split; sep="\n" puts the sample on its own line
# below its heading.
print("First training sample:", train_data[0], sep="\n")

print("First test sample:", test_data[0], sep="\n")

First training sample:
['lessons', 'in', 'the', 'schoolroom', ',', 'and', 'though', 'this', 'was', 'not', 'a', 'very', 'good']
First test sample:
['the', 'jury', ',', 'of', 'course', '--', '``', 'i', 'gave', 'her', 'one', ',', 'they', 'gave', 'him', 'two', '--', "''", 'why', ',']


In [12]:
def count(tokenized_sentences):
    """Count how often each token occurs across all sentences.

    Args:
        tokenized_sentences: iterable of token lists.

    Returns:
        A dict mapping each token to its number of occurrences; keys appear
        in order of first occurrence.
    """
    frequencies = {}
    for tokens in tokenized_sentences:
        for tok in tokens:
            # Missing tokens start from 0, so the first sighting stores 1.
            frequencies[tok] = frequencies.get(tok, 0) + 1
    return frequencies
                

In [13]:
# Small hand-made corpus to check count(): '.' occurs in all three sentences
# and 'are' in two of them.
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
count(tokenized_sentences)

{'sky': 1,
 'is': 1,
 'blue': 1,
 '.': 3,
 'leaves': 1,
 'are': 2,
 'green': 1,
 'roses': 1,
 'red': 1}

In [14]:
# Token frequencies computed over the training split only.
word_counts = count(train_data)
# Bare expression: the notebook displays the whole dict.
word_counts

{'lessons': 10,
 'in': 300,
 'the': 1310,
 'schoolroom': 1,
 ',': 1930,
 'and': 674,
 'though': 10,
 'this': 108,
 'was': 284,
 'not': 115,
 'a': 505,
 'very': 108,
 'good': 19,
 'alice': 325,
 'sighing': 3,
 '.': 778,
 'she': 429,
 'comes': 2,
 'to': 581,
 '--': 223,
 "'": 907,
 'at': 172,
 'moment': 25,
 'five': 7,
 'who': 51,
 'had': 143,
 'been': 33,
 'anxiously': 12,
 'grown': 6,
 'so': 120,
 'large': 23,
 'last': 27,
 'few': 9,
 'minutes': 11,
 'that': 262,
 "n't": 178,
 'bit': 13,
 'were': 68,
 'placed': 1,
 'along': 5,
 'course': 19,
 'here': 41,
 'there': 80,
 'no': 71,
 '`': 903,
 'one': 85,
 'further': 3,
 'off': 59,
 'from': 25,
 'england': 1,
 'nearer': 4,
 'is': 98,
 'france': 1,
 'sort': 14,
 'it': 470,
 ')': 44,
 'scratching': 1,
 'scrambling': 1,
 'about': 77,
 'chimney': 5,
 'close': 9,
 'turtle': 52,
 'angrily': 6,
 ':': 188,
 'really': 10,
 'you': 338,
 'are': 44,
 'dull': 3,
 '!': 360,
 'felt': 21,
 'could': 68,
 'be': 124,
 'denied': 2,
 'tried': 19,
 'another': 2

In [15]:
# Vocabulary: training tokens ordered from most to least frequent.
# The sort is stable, so tokens with equal counts keep their first-seen order.
vocabulary = [
    token
    for token, _freq in sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
]
vocabulary

[',',
 'the',
 "'",
 '`',
 '.',
 'and',
 'to',
 'a',
 'it',
 'i',
 'she',
 'of',
 'said',
 '!',
 'you',
 'alice',
 'in',
 'was',
 'that',
 '--',
 'as',
 'her',
 ':',
 "n't",
 'at',
 "'s",
 '?',
 'on',
 ';',
 'all',
 'had',
 'with',
 'but',
 'for',
 'be',
 'so',
 'what',
 'do',
 'they',
 'not',
 'this',
 'very',
 'he',
 'is',
 'little',
 'out',
 'one',
 'down',
 'there',
 'about',
 'then',
 'would',
 'up',
 'know',
 'his',
 'if',
 'went',
 'no',
 'them',
 'like',
 'were',
 'could',
 'herself',
 'have',
 'again',
 'did',
 'or',
 'queen',
 'off',
 'when',
 'thought',
 '*',
 'turtle',
 'time',
 'who',
 "''",
 'me',
 'see',
 "'m",
 'how',
 'king',
 'into',
 'mock',
 'well',
 'hatter',
 'an',
 'your',
 'some',
 'gryphon',
 'by',
 "'ll",
 'now',
 'say',
 'quite',
 'think',
 ')',
 'are',
 'began',
 'first',
 'their',
 'much',
 'way',
 '(',
 'here',
 'my',
 'more',
 '``',
 'just',
 'come',
 'go',
 'its',
 'head',
 'thing',
 'voice',
 'only',
 'round',
 'any',
 'rabbit',
 'duchess',
 'got',
 'tw

In [16]:
# Two lookup tables over the vocabulary: position in the frequency-sorted
# list <-> token. Numbering starts at 0, so the most frequent token gets id 0.
index_to_word = dict(enumerate(vocabulary))
word_to_index = {word: index for index, word in index_to_word.items()}

In [17]:
# Display the token -> index table.
word_to_index

{',': 0,
 'the': 1,
 "'": 2,
 '`': 3,
 '.': 4,
 'and': 5,
 'to': 6,
 'a': 7,
 'it': 8,
 'i': 9,
 'she': 10,
 'of': 11,
 'said': 12,
 '!': 13,
 'you': 14,
 'alice': 15,
 'in': 16,
 'was': 17,
 'that': 18,
 '--': 19,
 'as': 20,
 'her': 21,
 ':': 22,
 "n't": 23,
 'at': 24,
 "'s": 25,
 '?': 26,
 'on': 27,
 ';': 28,
 'all': 29,
 'had': 30,
 'with': 31,
 'but': 32,
 'for': 33,
 'be': 34,
 'so': 35,
 'what': 36,
 'do': 37,
 'they': 38,
 'not': 39,
 'this': 40,
 'very': 41,
 'he': 42,
 'is': 43,
 'little': 44,
 'out': 45,
 'one': 46,
 'down': 47,
 'there': 48,
 'about': 49,
 'then': 50,
 'would': 51,
 'up': 52,
 'know': 53,
 'his': 54,
 'if': 55,
 'went': 56,
 'no': 57,
 'them': 58,
 'like': 59,
 'were': 60,
 'could': 61,
 'herself': 62,
 'have': 63,
 'again': 64,
 'did': 65,
 'or': 66,
 'queen': 67,
 'off': 68,
 'when': 69,
 'thought': 70,
 '*': 71,
 'turtle': 72,
 'time': 73,
 'who': 74,
 "''": 75,
 'me': 76,
 'see': 77,
 "'m": 78,
 'how': 79,
 'king': 80,
 'into': 81,
 'mock': 82,
 'well': 

In [29]:
# Display the index -> token table.
index_to_word

{0: ',',
 1: 'the',
 2: "'",
 3: '`',
 4: '.',
 5: 'and',
 6: 'to',
 7: 'a',
 8: 'it',
 9: 'i',
 10: 'she',
 11: 'of',
 12: 'said',
 13: '!',
 14: 'you',
 15: 'alice',
 16: 'in',
 17: 'was',
 18: 'that',
 19: '--',
 20: 'as',
 21: 'her',
 22: ':',
 23: "n't",
 24: 'at',
 25: "'s",
 26: '?',
 27: 'on',
 28: ';',
 29: 'all',
 30: 'had',
 31: 'with',
 32: 'but',
 33: 'for',
 34: 'be',
 35: 'so',
 36: 'what',
 37: 'do',
 38: 'they',
 39: 'not',
 40: 'this',
 41: 'very',
 42: 'he',
 43: 'is',
 44: 'little',
 45: 'out',
 46: 'one',
 47: 'down',
 48: 'there',
 49: 'about',
 50: 'then',
 51: 'would',
 52: 'up',
 53: 'know',
 54: 'his',
 55: 'if',
 56: 'went',
 57: 'no',
 58: 'them',
 59: 'like',
 60: 'were',
 61: 'could',
 62: 'herself',
 63: 'have',
 64: 'again',
 65: 'did',
 66: 'or',
 67: 'queen',
 68: 'off',
 69: 'when',
 70: 'thought',
 71: '*',
 72: 'turtle',
 73: 'time',
 74: 'who',
 75: "''",
 76: 'me',
 77: 'see',
 78: "'m",
 79: 'how',
 80: 'king',
 81: 'into',
 82: 'mock',
 83: 'wel

In [18]:
# Encode every training sentence as a list of vocabulary indices.
sequences = [
    [word_to_index[word] for word in sentence]
    for sentence in train_data
]
sequences

[[302, 16, 1, 1271, 0, 5, 303, 40, 17, 39, 7, 41, 192],
 [15, 0, 5, 699, 4],
 [10, 911, 0, 6, 19, 2, 24, 40, 156, 400, 0, 74, 30, 127, 253],
 [445, 35, 169, 16, 1, 148, 320, 274, 18, 10, 17, 23, 7, 237],
 [60, 1272, 495, 1, 193, 0, 103, 5, 48, 4, 48, 17, 57, 3, 46, 0],
 [1, 700, 68, 157, 1273, 1, 578, 43, 6, 1274, 19],
 [228, 8, 17, 95, 1275, 5, 1276, 49, 16, 1, 496, 321],
 [72, 446, 22, 3, 304, 14, 96, 41, 701, 13, 2],
 [15, 177, 18, 40, 61, 39, 34, 912, 0, 35, 10, 194, 185],
 [322,
  302,
  6,
  497,
  13,
  57,
  0,
  9,
  128,
  186,
  52,
  104,
  401,
  49,
  8,
  28,
  55,
  9,
  78],
 [9,
  254,
  52,
  0,
  9,
  90,
  579,
  46,
  19,
  32,
  9,
  78,
  445,
  52,
  91,
  0,
  2,
  10,
  209,
  16,
  7],
 [149, 18, 21, 702, 60, 913, 6, 34, 149, 22, 29, 10, 61],
 [115, 4],
 [36, 8, 174, 914, 6, 402, 18, 36, 14, 60, 66, 174, 63],
 [3, 9, 134, 144, 14, 105, 195, 18, 0, 55, 14, 59, 0, 2, 12, 1],
 [3, 50, 14, 255, 403, 47, 0, 2, 1, 80, 170, 4],
 [404, 33, 580, 13, 9, 90, 275, 5, 92

In [19]:
max_sequence_length = 5  # number of context tokens in each model input

# Slide a fixed-size window over each encoded sentence: the window is the
# model input and the token right after it is the target. Sentences with
# max_sequence_length tokens or fewer yield no samples.
input_sequences, output_words = [], []
for sequence in sequences:
    for start in range(len(sequence) - max_sequence_length):
        stop = start + max_sequence_length
        input_sequences.append(sequence[start:stop])
        output_words.append(sequence[stop])

input_sequences

[[302, 16, 1, 1271, 0],
 [16, 1, 1271, 0, 5],
 [1, 1271, 0, 5, 303],
 [1271, 0, 5, 303, 40],
 [0, 5, 303, 40, 17],
 [5, 303, 40, 17, 39],
 [303, 40, 17, 39, 7],
 [40, 17, 39, 7, 41],
 [10, 911, 0, 6, 19],
 [911, 0, 6, 19, 2],
 [0, 6, 19, 2, 24],
 [6, 19, 2, 24, 40],
 [19, 2, 24, 40, 156],
 [2, 24, 40, 156, 400],
 [24, 40, 156, 400, 0],
 [40, 156, 400, 0, 74],
 [156, 400, 0, 74, 30],
 [400, 0, 74, 30, 127],
 [445, 35, 169, 16, 1],
 [35, 169, 16, 1, 148],
 [169, 16, 1, 148, 320],
 [16, 1, 148, 320, 274],
 [1, 148, 320, 274, 18],
 [148, 320, 274, 18, 10],
 [320, 274, 18, 10, 17],
 [274, 18, 10, 17, 23],
 [18, 10, 17, 23, 7],
 [60, 1272, 495, 1, 193],
 [1272, 495, 1, 193, 0],
 [495, 1, 193, 0, 103],
 [1, 193, 0, 103, 5],
 [193, 0, 103, 5, 48],
 [0, 103, 5, 48, 4],
 [103, 5, 48, 4, 48],
 [5, 48, 4, 48, 17],
 [48, 4, 48, 17, 57],
 [4, 48, 17, 57, 3],
 [48, 17, 57, 3, 46],
 [1, 700, 68, 157, 1273],
 [700, 68, 157, 1273, 1],
 [68, 157, 1273, 1, 578],
 [157, 1273, 1, 578, 43],
 [1273, 1, 578, 4

In [20]:
# Number of context windows built above.
len(input_sequences)

16977

In [21]:
# Number of target tokens; one is appended per context window, so this
# should equal len(input_sequences).
len(output_words)

16977

In [22]:
# NOTE(review): these imports live mid-notebook; consider moving them to the
# import cell at the top so dependencies are visible in one place.
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Model inputs: the context windows as one array, each row brought to
# max_sequence_length entries by pad_sequences.
X = pad_sequences(input_sequences, maxlen=max_sequence_length)
# Targets: one-hot rows with one column per vocabulary entry.
y = to_categorical(output_words, num_classes=len(vocabulary))

In [23]:
# Model hyperparameters used when the network is built below.
embedding_dim = 100  # second argument of the Embedding layer
lstm_units = 120  # argument of the LSTM layer

In [24]:
# Next-token model: embed the context token ids, run them through an LSTM,
# and output a softmax distribution with one entry per vocabulary token.
vocab_size = len(vocabulary)
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_length),
    LSTM(lstm_units),
    Dense(vocab_size, activation='softmax'),
])

In [25]:
# Categorical cross-entropy matches the one-hot targets in y; Adam optimizer.
# NOTE(review): no Keras seed is set in the visible cells, so training
# results may differ from run to run.
model.compile(loss='categorical_crossentropy', optimizer='adam')
# Last expression of the cell, so the returned History object is displayed.
model.fit(X, y, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x29b424710>

In [26]:
# Seed text for generation. It is tokenized with the same tokenize() helper
# used for the training data, so case, punctuation and contractions are
# handled exactly as they were when the vocabulary was built. A plain
# str.split() would leave tokens such as "rabbit," attached to their
# punctuation, which the vocabulary is unlikely to contain.
seed_text = "A white rabbit"
input_sequence = tokenize([seed_text])[0]
input_sequence

['a', 'white', 'rabbit']

In [27]:
desired_sentence_length = 6  # number of tokens to generate

# Greedy generation: repeatedly predict the most likely next token from the
# last max_sequence_length tokens and append it to the running sentence.
# NOTE(review): contexts shorter than max_sequence_length are padded by
# pad_sequences with its default value 0, which is also the id of the most
# frequent token in word_to_index. Reserving id 0 for padding would require
# rebuilding the encoding and the model.
# Words missing from word_to_index raise KeyError.
for _ in range(desired_sentence_length):
    context = input_sequence[-max_sequence_length:]
    encoded_sequence = [word_to_index[word] for word in context]
    padded_sequence = pad_sequences([encoded_sequence], maxlen=max_sequence_length)
    # Run the model once per step and reuse the result for both the printout
    # and the argmax, instead of predicting twice on the same input.
    predicted_probabilities = model.predict(padded_sequence)
    print(predicted_probabilities)
    # The batch holds a single context, so row 0 is its distribution.
    predicted_index = int(np.argmax(predicted_probabilities[0]))
    predicted_word = index_to_word[predicted_index]
    input_sequence.append(predicted_word)

[[1.3411840e-02 4.5835169e-04 1.1051152e-07 ... 5.1161447e-07
  5.8178831e-09 1.1947761e-04]]
[[4.0867174e-01 2.2845280e-03 3.0600157e-05 ... 1.3951039e-11
  1.2758006e-10 1.4339489e-05]]
[[5.5759768e-03 5.1866174e-03 4.5614168e-02 ... 2.2360370e-11
  9.1745571e-09 5.1044924e-05]]
[[1.5817737e-03 1.4931544e-02 3.8915316e-03 ... 6.5846628e-13
  8.1489021e-10 2.7280496e-06]]
[[8.0807665e-03 3.9681051e-02 1.3342162e-03 ... 6.8624363e-11
  5.1379097e-09 2.2616172e-07]]
[[2.11848971e-03 1.48229365e-05 7.83838273e-04 ... 1.33021842e-08
  1.42000225e-08 1.35807920e-06]]


In [28]:
# Join the seed words and the generated tokens with single spaces.
completed_sentence = ' '.join(input_sequence)
print("Completed sentence:", completed_sentence)

Completed sentence: a white rabbit read , ` and most of
