In [None]:
# LSTM CODE BY https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, CuDNNLSTM

In [2]:
# Remote CSV that is loaded into the video DataFrame.
FILE_LOCATION = "https://raw.githubusercontent.com/alexandre-lavoie/youtube-bot/master/data/US_viewCount.csv"
# Number of encoded characters in every input window given to the model.
# NOTE: despite the name, this notebook applies it to descriptions, not titles.
MIN_TITLE_LENGTH = 31
# How many sample texts the generation cell prints.
NUMBER_OF_DESCRIPTIONS = 10
# Number of characters predicted for each generated sample.
DESCRIPTION_LENGTH = 500
# Number of training epochs passed to model.fit.
EPOCHS = 20
# Mini-batch size passed to model.fit.
BATCH_SIZE = 128

In [3]:
# Load the raw video table from the CSV referenced by FILE_LOCATION.
video_database = pd.read_csv(filepath_or_buffer=FILE_LOCATION)

In [4]:
# All patterns are raw strings so that `\w` and `\.` reach the regex engine
# untouched instead of being parsed as (invalid) Python string escapes.

# Remove every run of non-ASCII characters.
cleanup_database = video_database.replace(r'[^\x00-\x7F]+', '', regex=True)
# Collapse each http(s) link into the literal token 'URL'.
cleanup_database = cleanup_database.replace(r'(http|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', 'URL', regex=True)
# Drop every ' ...' sequence.
cleanup_database = cleanup_database.replace(r' \.\.\.', '', regex=True)
# Keep only the entries of the "description" column that are strings.
description_database = [
    description
    for description in cleanup_database["description"]
    if isinstance(description, str)
]

In [5]:
# Vocabulary: every ASCII character from '!' through 'Z', followed by six extra symbols.
chars = list(map(chr, range(ord('!'), ord('Z') + 1))) + ['|', ' ', '[', ']', '_', '~']
# Lookup tables between a character and its position in the vocabulary.
char_to_int = {symbol: position for position, symbol in enumerate(chars)}
int_to_char = dict(enumerate(chars))
# Vocabulary size; later cells use it to scale inputs.
n_vocabs = len(chars)

In [6]:
# Encode each description as a list of vocabulary indices. The text is
# upper-cased first because the vocabulary holds no lower-case letters.
# Characters missing from char_to_int (for example newlines, '{' or '`') are
# skipped: looking them up directly would raise KeyError and abort the cell.
description_int = [
    [char_to_int[letter] for letter in description.upper() if letter in char_to_int]
    for description in description_database
]

In [7]:
# Training pairs: each input is a window of MIN_TITLE_LENGTH consecutive
# encoded characters and the target is the character right after the window.
dataX = []
dataY = []

for encoded in description_int:
    window_count = len(encoded) - MIN_TITLE_LENGTH
    # A description no longer than the window yields an empty range, so it is skipped.
    for begin in range(window_count):
        end = begin + MIN_TITLE_LENGTH
        dataX.append(encoded[begin:end])
        dataY.append(encoded[end])

n_patterns = len(dataX)

In [8]:
# Report the vocabulary size, window length and number of training windows.
for label, value in (
    ("Vocab Length: ", n_vocabs),
    ("Pattern Length: ", MIN_TITLE_LENGTH),
    ("Number of patterns: ", n_patterns),
):
    print(label + str(value))

Vocab Length: 64
Pattern Length: 31
Number of patterns: 198929


In [9]:
# Inputs: reshape to (n_patterns, MIN_TITLE_LENGTH, 1) and divide by the vocabulary size.
window_shape = (n_patterns, MIN_TITLE_LENGTH, 1)
X = np.reshape(dataX, window_shape) / float(n_vocabs)
# Targets: one-hot rows picked from an identity matrix by the index of the next character.
y = np.eye(n_vocabs)[dataY]

In [10]:
# Two stacked 256-unit CuDNNLSTM layers, each followed by 20% dropout, then a
# softmax layer with one output per column of y.
model = Sequential([
    # return_sequences=True hands the full sequence to the second recurrent layer.
    CuDNNLSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    CuDNNLSTM(256),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [11]:
# Train the network on the prepared windows.
# NOTE(review): the saved output below reports 10 epochs while EPOCHS is 20 -
# it appears to come from an earlier run; re-run the notebook to refresh it.
model.fit(x=X, y=y, batch_size=BATCH_SIZE, epochs=EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c1166d8>

In [12]:
# Generate NUMBER_OF_DESCRIPTIONS texts. Each one starts from a randomly chosen
# training window and is extended one predicted character at a time.
for _ in range(NUMBER_OF_DESCRIPTIONS):
    # randint excludes its upper bound, so len(dataX) makes every window selectable.
    start = np.random.randint(0, len(dataX))
    # Work on a copy: appending to dataX[start] itself would lengthen the stored
    # training window and leave dataX ragged for any later re-run.
    patt = list(dataX[start])
    text = "".join(int_to_char[value] for value in patt)

    for i in range(DESCRIPTION_LENGTH):
        # Same shaping and scaling as the training inputs.
        xx = np.reshape(patt, (1, len(patt), 1))
        xx = xx / float(n_vocabs)
        prediction = model.predict(xx, verbose=0)
        # Greedy decoding: take the most probable next character.
        index = int(np.argmax(prediction))
        text += int_to_char[index]
        # Slide the window: drop the oldest character, add the newest.
        patt = patt[1:] + [index]

    print(text)

 2019 | SUBSCRIBE  URL | JASON SUDEIKIS MOVIE TRAILER | RELEASE: 16 AUG 2019 | MORE - HO NOON BYHALS INOS: PRL INST LRONEC @PD EATE ANM AND HERED LRLD CARS RREMELELE, THENN LIKKA PIN OFFICIAL  TUBSCRIBE FOR MORE VIDEOS: URL #PEPPA #PEPPAPIGGIEEEEEEEBB.BOMMU MRODB VRLES BROE SNE ERAZIT FOEANEEDST, INOMOUAN, (2) PINK PONAHAR -MEEE OARIR" DRTNOO TT SSEPI? PR LEFES NESEH AUDANIBL ANB SQIDLE TIT INR HALOE TO TEE TFINES EOR TEE LATSR. IN WHE SALKES  OR DENLNETS ON AALKETEE CIMMAT OOSA PRLDUCE CONMA IOSEL,WLEARNNENY ANEUSEFEG (VAVB 
S HEATER BREAKS, PROMPTING BIG NOSE TO ATTEMPT TO STEAL PINK PANTHER'S HEATER. (2) A PINKER TOMORROWOTOSOME INON THE OAWTRETE OOOTINS, CNONE STACLEL SADUNE TEAKE, TAECHONA THE AILUELSE PIE ALD TNANE SF HLALIT LISED UALKN TETT ES WIT AIC TT AACK GURL THE OATTEE TYOE TURECMED LO WHA  AREZSO GET PHEAREDLL, THEN TOIN AMAZINLLLA PIATES B SESTLES IN SPATEH AVESIABLE OU IT BACK TETHON TO CIL FTIRNTATTES A DOLM ARD OUBSPIME AOD COGLNEAN SI HU TEE ALAZESS HOL SAINNED HOARE