In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import tensorflow as tf


In [83]:
url = "https://en.wikipedia.org/wiki/Radiohead"
page = requests.get(url)
soup = BeautifulSoup(page.text,"html.parser")
paragraphs = soup.findAll("p")
text_list = [paragraph.get_text() for paragraph in paragraphs]
text_list

['\n',
 "Radiohead are an English rock band formed in Abingdon, Oxfordshire, in 1985. They comprise Thom Yorke (vocals, guitar, piano, keyboards); brothers Jonny Greenwood (guitar, keyboards, other instruments) and Colin Greenwood (bass); Ed O'Brien (guitar, backing vocals); and Philip Selway (drums, percussion). They have worked with the producer Nigel Godrich and the cover artist Stanley Donwood since 1994. Radiohead's experimental approach is credited with advancing the sound of alternative rock.\n",
 'Radiohead signed to EMI in 1991 and released their debut album, Pablo Honey, in 1993. Their debut single, "Creep", was a worldwide hit, and their popularity and critical standing rose with The Bends in 1995. Their third album, OK Computer (1997), is acclaimed as a landmark record and one of the greatest albums in popular music, with complex production and themes of modern alienation. Their fourth album, Kid A (2000), marked a dramatic change in style, incorporating influences from ele

In [84]:
full_text = ''.join(text_list)
print(full_text)


Radiohead are an English rock band formed in Abingdon, Oxfordshire, in 1985. They comprise Thom Yorke (vocals, guitar, piano, keyboards); brothers Jonny Greenwood (guitar, keyboards, other instruments) and Colin Greenwood (bass); Ed O'Brien (guitar, backing vocals); and Philip Selway (drums, percussion). They have worked with the producer Nigel Godrich and the cover artist Stanley Donwood since 1994. Radiohead's experimental approach is credited with advancing the sound of alternative rock.
Radiohead signed to EMI in 1991 and released their debut album, Pablo Honey, in 1993. Their debut single, "Creep", was a worldwide hit, and their popularity and critical standing rose with The Bends in 1995. Their third album, OK Computer (1997), is acclaimed as a landmark record and one of the greatest albums in popular music, with complex production and themes of modern alienation. Their fourth album, Kid A (2000), marked a dramatic change in style, incorporating influences from electronic music,

In [85]:
import re
text = full_text.lower()
text = re.sub("[^a-z ]"," ",text)
text = text.replace("  ","")


In [86]:
from tensorflow.keras.preprocessing.text import Tokenizer
Tokenizer = Tokenizer()
Tokenizer.fit_on_texts([text])

In [87]:
import pickle
with open("token.pkl","wb")as file:
    pickle.dump(Tokenizer,file)

In [88]:
sequence_data = Tokenizer.texts_to_sequences([text])[0]
print(sequence_data[:10])

[6, 131, 31, 420, 28, 26, 229, 4, 716, 29]


In [89]:
Tokenizer.word_index

{'the': 1,
 'and': 2,
 'of': 3,
 'in': 4,
 'a': 5,
 'radiohead': 6,
 'to': 7,
 's': 8,
 'for': 9,
 'their': 10,
 'was': 11,
 'on': 12,
 'with': 13,
 'by': 14,
 'as': 15,
 'released': 16,
 'were': 17,
 'it': 18,
 'album': 19,
 'that': 20,
 'music': 21,
 'yorke': 22,
 'from': 23,
 'greenwood': 24,
 'at': 25,
 'band': 26,
 'had': 27,
 'rock': 28,
 'they': 29,
 'first': 30,
 'an': 31,
 'jonny': 32,
 'his': 33,
 'one': 34,
 'new': 35,
 'best': 36,
 'albums': 37,
 'all': 38,
 'not': 39,
 'said': 40,
 'have': 41,
 'is': 42,
 'kid': 43,
 'more': 44,
 'he': 45,
 'emi': 46,
 'recorded': 47,
 'us': 48,
 'songs': 49,
 'number': 50,
 'work': 51,
 'time': 52,
 'early': 53,
 'brien': 54,
 'uk': 55,
 'also': 56,
 'recording': 57,
 'tour': 58,
 'began': 59,
 'described': 60,
 'record': 61,
 'king': 62,
 'solo': 63,
 'most': 64,
 'them': 65,
 'song': 66,
 'has': 67,
 'godrich': 68,
 'since': 69,
 'chart': 70,
 'its': 71,
 'you': 72,
 'played': 73,
 'bands': 74,
 'such': 75,
 'performed': 76,
 'several':

In [90]:
vocab_size = len(Tokenizer.word_index) + 1
vocab_size

2616

In [91]:
# now let's get 4 words in sequence
sequence = [] 

for i in range(3,len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequence.append(words)


In [92]:
sequence[:10]

[[6, 131, 31, 420],
 [131, 31, 420, 28],
 [31, 420, 28, 26],
 [420, 28, 26, 229],
 [28, 26, 229, 4],
 [26, 229, 4, 716],
 [229, 4, 716, 29],
 [4, 716, 29, 717],
 [716, 29, 717, 292],
 [29, 717, 292, 718]]

In [93]:
X,y = [],[]
for i in sequence:
    X.append(i[0:3])
    y.append(i[3])

In [94]:
X[:10]

[[6, 131, 31],
 [131, 31, 420],
 [31, 420, 28],
 [420, 28, 26],
 [28, 26, 229],
 [26, 229, 4],
 [229, 4, 716],
 [4, 716, 29],
 [716, 29, 717],
 [29, 717, 292]]

In [95]:
X = np.array(X)
y= np.array(y)

In [96]:
y[:10]

array([420,  28,  26, 229,   4, 716,  29, 717, 292, 718])

In [97]:
from keras.utils import to_categorical
y = to_categorical(y,num_classes=vocab_size)

In [98]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential()

model.add(Embedding(vocab_size,10,input_length=3))
model.add(Dropout(0.2))
model.add(LSTM(1000,return_sequences=True,dropout=0.3, recurrent_dropout = 0.3))
model.add(LSTM(1000,dropout=0.3,recurrent_dropout = 0.3))

model.add(Dense(1000, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(vocab_size, activation="softmax"))
model.summary()

In [99]:
#EarlyStopping: Callback to stop training when a certain condition is met
early_stop = EarlyStopping(monitor="loss",patience=3)
model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])

In [100]:
training = model.fit(X,y,epochs=500,callbacks=[early_stop])

Epoch 1/500
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 104ms/step - accuracy: 0.0492 - loss: 7.1036
Epoch 2/500
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 100ms/step - accuracy: 0.0590 - loss: 6.5088
Epoch 3/500
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 99ms/step - accuracy: 0.0588 - loss: 6.4687
Epoch 4/500
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 98ms/step - accuracy: 0.0604 - loss: 6.3005
Epoch 5/500
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 98ms/step - accuracy: 0.0597 - loss: 6.1408
Epoch 6/500
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 99ms/step - accuracy: 0.0683 - loss: 6.0574
Epoch 7/500
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 98ms/step - accuracy: 0.0697 - loss: 5.8712
Epoch 8/500
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 99ms/step - accuracy: 0.0807 - loss: 5.6923
Epoch 9/500
[

In [101]:
model.save("model.h5")



In [102]:
from keras.models import load_model
model = load_model('model.h5')



In [110]:
model.evaluate(X,y)

[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.4900 - loss: 2.1628


[2.2018537521362305, 0.49050062894821167]

In [103]:
def predict_next_word(model,Tokenizer,text):
    seq = Tokenizer.texts_to_sequences([text])
    seq = np.array(seq)
    preds = np.argmax(model.predict(seq))
    for key, value in Tokenizer.word_index.items():
        if value==preds:
            predicted_word = key
            break
    print(predicted_word)
    return predicted_word

In [105]:
predict_next_word(model,Tokenizer,"Radiohead are an")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
cataloguesand


'cataloguesand'