## Title generation

In [48]:
import pandas as pd
import string
import numpy as np
import json

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku
import random

import tensorflow as tf
tf.random.set_seed(2)
from numpy.random import seed
seed(1)



In [2]:
import gensim
from gensim.summarization import summarize
import re
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
from collections import Counter

In [3]:
#load the datasets that mainly have english titles
df1 = pd.read_csv('archive/USvideos.csv')
df2 = pd.read_csv('archive/CAvideos.csv')
df3 = pd.read_csv('archive/GBvideos.csv')


def _load_json(path):
    """Parse the JSON file at ``path`` and close the file handle afterwards.

    The file is opened as UTF-8 (the standard JSON encoding) so the result
    does not depend on the platform's default encoding.
    """
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


#load the datasets containing the category names
data1 = _load_json('archive/US_category_id.json')
data2 = _load_json('archive/CA_category_id.json')
data3 = _load_json('archive/GB_category_id.json')


In [9]:
def category_extractor(data):
    """Build an ``{id: title}`` lookup from a category payload.

    Parameters
    ----------
    data : dict
        Mapping with an ``'items'`` list; every item provides ``item['id']``
        and ``item['snippet']['title']``.

    Returns
    -------
    dict
        Category id (converted with ``int``) mapped to its title. If an id
        occurs more than once, the last title wins.
    """
    return {
        int(entry['id']): entry['snippet']["title"]
        for entry in data['items']
    }

#add a readable category name to each regional frame by looking up its category_id
df1['category_title'] = df1['category_id'].map(category_extractor(data1))
df2['category_title'] = df2['category_id'].map(category_extractor(data2))
df3['category_title'] = df3['category_id'].map(category_extractor(data3))

#stack the three frames under a fresh 0..n-1 index, then keep only the first
#row seen for every video_id (surviving rows keep their index labels)
df = pd.concat([df1, df2, df3], ignore_index=True).drop_duplicates('video_id')

#remove punctuations and convert text to lowercase
def clean_text(text):
    """Normalise a title: strip punctuation, lowercase, keep ASCII only.

    Characters listed in ``string.punctuation`` are deleted (not replaced by a
    space), the remainder is lowercased, and any character that is not plain
    ASCII is dropped.
    """
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    lowered = without_punctuation.lower()

    # round-trip through UTF-8 bytes and decode as ASCII, ignoring the rest
    return lowered.encode('utf8').decode('ascii', 'ignore')



In [10]:
#titles of entertainment videos, as a plain list, then cleaned into a corpus
entertainment = df.loc[df['category_title'] == 'Entertainment', 'title'].tolist()
corpus_entertainment = list(map(clean_text, entertainment))

#same two steps for news & politics videos
news = df.loc[df['category_title'] == 'News & Politics', 'title'].tolist()
corpus_news = list(map(clean_text, news))

In [8]:
#number of rows per category in the combined, de-duplicated frame
df['category_title'].value_counts()


Entertainment            9730
News & Politics          3415
People & Blogs           3071
Music                    2479
Sports                   2422
Comedy                   2305
Howto & Style            1780
Film & Animation         1431
Gaming                    966
Science & Technology      900
Education                 763
Pets & Animals            325
Autos & Vehicles          310
Travel & Events           247
Shows                     106
Nonprofits & Activism      14
Movies                      1
Name: category_title, dtype: int64

In [15]:
#shared tokenizer; generate_text reads this same object when decoding predictions
tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus, tok=None):
    """Fit a tokenizer on ``corpus`` and expand every text into n-gram prefixes.

    Parameters
    ----------
    corpus : list of str
        Cleaned texts, one per training example.
    tok : Tokenizer, optional
        Tokenizer to fit and use. When omitted, the module-level ``tokenizer``
        is used, which is the original behaviour.

    Returns
    -------
    input_sequences : list of list of int
        For every text with token ids ``t``, the prefixes ``t[:2]``, ``t[:3]``,
        ..., ``t`` (texts with fewer than two tokens contribute nothing).
    total_words : int
        ``len(tok.word_index) + 1``.

    Notes
    -----
    WARNING: calling this more than once with the same tokenizer refits it on
    each new corpus. Keras' ``fit_on_texts`` keeps the word counts from earlier
    calls and rebuilds ``word_index``, so the ids an earlier model was trained
    on may no longer match the tokenizer afterwards. Pass a dedicated ``tok``
    per corpus, and use that same tokenizer when decoding, to keep each
    model's vocabulary stable.
    """
    if tok is None:
        tok = tokenizer

    #get tokens
    tok.fit_on_texts(corpus)
    total_words = len(tok.word_index) + 1

    #convert to sequence of tokens (one batched call instead of one per line)
    input_sequences = []
    for token_list in tok.texts_to_sequences(corpus):
        # prefixes of length 2..len(token_list); the last id of each is the label
        for end in range(2, len(token_list) + 1):
            input_sequences.append(token_list[:end])

    return input_sequences, total_words


In [16]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Left-pad the n-gram sequences and split them into inputs and one-hot labels.

    Parameters
    ----------
    input_sequences : list of list of int
        Token-id sequences, e.g. as returned by ``get_sequence_of_tokens``.
    num_classes : int, optional
        Width of the one-hot label vectors. When omitted, the module-level
        ``total_words`` is used (the original behaviour); pass it explicitly
        to avoid depending on whichever corpus was tokenised last.

    Returns
    -------
    predictors : numpy.ndarray
        All columns of the padded array except the last one.
    label : numpy.ndarray
        One-hot encoding of the last column.
    max_sequence_len : int
        Length of the longest input sequence (the padded width).

    Raises
    ------
    ValueError
        If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; nothing to pad")
    if num_classes is None:
        # legacy fallback to the global assigned by the training cells
        num_classes = total_words

    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # the final token of every padded row is the word to predict
    predictors, label = padded[:, :-1], padded[:, -1]
    # keras.utils.to_categorical is the public entry point; the np_utils
    # submodule is not available in newer Keras releases
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len


In [20]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    Parameters
    ----------
    max_sequence_len : int
        Padded sequence length including the label position; the network
        input length is ``max_sequence_len - 1``.
    total_words : int
        Vocabulary size, used for both the embedding input dimension and the
        width of the softmax output.
    embedding_dim : int, optional
        Size of the embedding vectors (default 10, the original value).
    lstm_units : int, optional
        Number of LSTM units (default 100, the original value).
    dropout_rate : float, optional
        Dropout rate applied after the LSTM (default 0.1, the original value).

    Returns
    -------
    Sequential
        Model compiled with categorical cross-entropy loss and the Adam optimizer.
    """
    input_len = max_sequence_len - 1
    model = Sequential()

    # Add Input Embedding Layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Add Hidden Layer 1 — LSTM Layer
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model


In [21]:
#Model for entertainment
#get_sequence_of_tokens fits the module-level `tokenizer` on this corpus;
#generate_padded_sequences reads the module-level `total_words` assigned on the next line.
inp_sequences, total_words = get_sequence_of_tokens(corpus_entertainment)
predictors, label, max_sequence_len_entertainment = generate_padded_sequences(inp_sequences)
model_entertainment = create_model(max_sequence_len_entertainment, total_words)
#train for 20 epochs; the value returned by fit() is what the cell displays
model_entertainment.fit(predictors, label, epochs=20, verbose=5)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2017539cb80>

In [22]:
#Model for news and politics
#This rebinds inp_sequences, total_words, predictors and label from the previous cell.
#NOTE(review): this call fits the same module-level `tokenizer` a second time; if
#fit_on_texts accumulates counts (it appears to), word ids used by model_entertainment
#may no longer match the tokenizer after this cell — confirm.
inp_sequences, total_words = get_sequence_of_tokens(corpus_news)
predictors, label, max_sequence_len_news = generate_padded_sequences(inp_sequences)
model_news = create_model(max_sequence_len_news, total_words)
#train for 20 epochs; the value returned by fit() is what the cell displays
model_news.fit(predictors, label, epochs=20, verbose=5)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x203561479a0>

In [23]:
def generate_text(seed_text, next_words, model, max_sequence_len, tok=None):
    """Extend ``seed_text`` by ``next_words`` greedily predicted words.

    Parameters
    ----------
    seed_text : str
        Starting text; each predicted word is appended after a space.
    next_words : int
        Number of words to generate.
    model : keras model
        Trained next-word model; ``model.predict`` must return one score per
        vocabulary id for each input row.
    max_sequence_len : int
        Padded sequence length used in training; inputs are padded to
        ``max_sequence_len - 1``.
    tok : Tokenizer, optional
        Tokenizer used to encode the text and decode predictions. When
        omitted, the module-level ``tokenizer`` is used (original behaviour).
        It should be the tokenizer state the model was trained with.

    Returns
    -------
    str
        The extended text in title case.
    """
    if tok is None:
        tok = tokenizer

    # build the id -> word lookup once instead of scanning word_index per word
    index_to_word = {index: word for word, index in tok.word_index.items()}

    for _ in range(next_words):
        token_list = tok.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1,  padding='pre')

        # argmax over the class scores replaces predict_classes, which is not
        # available in recent TensorFlow/Keras releases
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        # ids without a word (e.g. the padding id 0) contribute an empty string
        output_word = index_to_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [27]:
#full rows (not just titles) of each category, indexed by their labels in df
df_entertainment = df.loc[df['category_title'].eq('Entertainment')]
df_news = df.loc[df['category_title'].eq('News & Politics')]

In [28]:
#English stop words as a set; get_top_frequecy skips any word found here
#NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded already — confirm
stopWords = set(stopwords.words("english")) 

In [41]:
def get_top_frequecy(text):
    """Return the three most frequent content words of ``text``.

    URLs are stripped, the text is tokenised with ``word_tokenize``, and each
    token is lowercased. Tokens found in ``stopWords``, or containing any of
    the characters ``- ' & ? ’ # \\ ; ( ) : ! . , |``, are skipped.

    Parameters
    ----------
    text : str
        Free text such as a video description. A non-string value (for
        example a missing description, which pandas typically represents as
        NaN) returns an empty list instead of raising ``TypeError``.

    Returns
    -------
    list of (str, int)
        Up to three ``(word, count)`` pairs, most frequent first.
    """
    if not isinstance(text, str):
        return []

    URLless_string = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)

    # a token containing any of these characters is treated as noise
    rejected_chars = set("-'&?’#\\;():!.,|")

    counts = Counter()
    for word in word_tokenize(URLless_string):
        word = word.lower()
        if word in stopWords:
            continue
        if rejected_chars.intersection(word):
            continue
        counts[word] += 1

    return counts.most_common(3)

In [83]:
#Demo: entertainment video with index label 4
row_num = 4
des = df_entertainment.loc[row_num]['description']
title = df_entertainment.loc[row_num]['title']
print(f'Actual title: {title}')
print()
print('Model Generate 3 Titles:\n')
#seed one title with each of the three most frequent description words;
#each title gets a random number (3 to 8) of generated words
top_three = get_top_frequecy(des)
for w in top_three:
    print(generate_text(w[0], random.randint(3, 8), model_entertainment, max_sequence_len_entertainment))
print()
#seed a title with a hand-picked first word
print('When first word is provided:')
print(generate_text('Bald', random.randint(3, 8), model_entertainment, max_sequence_len_entertainment))

Actual title: I Dare You: GOING BALD!?

Model Generate 3 Titles:

Know At To A Whatcha And A 1St Nhn
Since Rich Gi Episode Graham Hum
Show In His Visit On Month The Jessica You

When first word is provided:
Bald Hoon A Goes To Tmz


In [84]:
#Demo: entertainment video with index label 75
row_num = 75
des = df_entertainment.loc[row_num]['description']
title = df_entertainment.loc[row_num]['title']
print(f'Actual title: {title}')
print()
print('Model Generate 3 Titles:\n')
#seed one title with each of the three most frequent description words;
#each title gets a random number (3 to 8) of generated words
top_three = get_top_frequecy(des)
for w in top_three:
    print(generate_text(w[0], random.randint(3, 8), model_entertainment, max_sequence_len_entertainment))
print()
#seed a title with a hand-picked first word
print('When first word is provided:')
print(generate_text('Trump', random.randint(3, 8), model_entertainment, max_sequence_len_entertainment))

Actual title: Rosie O’Donnell On Donald Trump’s Hostility Toward Her | WWHL

Model Generate 3 Titles:

Bravo Sunday Twitter Engtha20180430 To Rockets
Wwhl Official History Minaj The The Last
Donald Khi Dial Explained Haders Voice Teach

When first word is provided:
Trump 10 Down Roadies Kard


In [86]:
#Demo: entertainment video with index label 6
row_num = 6
des = df_entertainment.loc[row_num]['description']
title = df_entertainment.loc[row_num]['title']
print(f'Actual title: {title}')
print()
print('Model Generate 3 Titles:\n')
#seed one title with each of the three most frequent description words;
#each title gets a random number (3 to 8) of generated words
top_three = get_top_frequecy(des)
for w in top_three:
    print(generate_text(w[0], random.randint(3, 8), model_entertainment, max_sequence_len_entertainment))
print()
#seed a title with a hand-picked first word
print('When first word is provided:')
print(generate_text('Jeff', random.randint(3, 8), model_entertainment, max_sequence_len_entertainment))

Actual title: Roy Moore & Jeff Sessions Cold Open - SNL

Model Generate 3 Titles:

Snl 10 Football Teaser 10 With
Embattled Challenge To Les
Alabama Juju Virtue The Gets Meghan About The

When first word is provided:
Jeff Official Newshour 34 What The Last


In [None]:
#NOTE(review): unexecuted scratch cell; presumably candidate df_news index labels for
#the demos (9, 41 and 51 are used in the cells that follow) — confirm or delete
9,13,28,41,51

In [95]:
#Demo: news video with index label 9
row_num = 9
des = df_news.loc[row_num]['description']
title = df_news.loc[row_num]['title']
print(f'Actual title: {title}')
print()
print('Model Generate 3 Titles:\n')
#seed one title with each of the three most frequent description words;
#each title gets a random number (3 to 8) of generated words
top_three = get_top_frequecy(des)
for w in top_three:
    print(generate_text(w[0], random.randint(3, 8), model_news, max_sequence_len_news))
print()
#seed a title with a hand-picked first word
print('When first word is provided:')
print(generate_text('robots', random.randint(3, 8), model_news, max_sequence_len_news))

Actual title: Why the rise of the robots won’t mean the end of work

Model Generate 3 Titles:

Work The Memo Is The
Time Live House National For Friday
Least Gonzalezs Live Trump National For Friday

When first word is provided:
Robots Is A Truth To The Queen Releasing


In [99]:
#Demo: news video with index label 41
row_num = 41
des = df_news.loc[row_num]['description']
title = df_news.loc[row_num]['title']
print(f'Actual title: {title}')
print()
print('Model Generate 3 Titles:\n')
#seed one title with each of the three most frequent description words;
#each title gets a random number (3 to 8) of generated words
top_three = get_top_frequecy(des)
for w in top_three:
    print(generate_text(w[0], random.randint(3, 8), model_news, max_sequence_len_news))
print()
#seed a title with a hand-picked first word
print('When first word is provided:')
print(generate_text('region', random.randint(3, 8), model_news, max_sequence_len_news))

Actual title: Iraq-Iran earthquake: Deadly tremor hits border region - BBC News

Model Generate 3 Titles:

Least Gonzalezs Live Trump National
Strong Franken On The White House Correspondents Dinner The
Earthquake Ansari Dad Mom Spend The The

When first word is provided:
Region Live With Drshahid Masood 05December2017 Nawaz Sharif Asif


In [101]:
#Demo: news video with index label 51
row_num = 51
des = df_news.loc[row_num]['description']
title = df_news.loc[row_num]['title']
print(f'Actual title: {title}')
print()
print('Model Generate 3 Titles:\n')
#seed one title with each of the three most frequent description words;
#each title gets a random number (3 to 8) of generated words
top_three = get_top_frequecy(des)
for w in top_three:
    print(generate_text(w[0], random.randint(3, 8), model_news, max_sequence_len_news))
print()
#seed a title with a hand-picked first word
print('When first word is provided:')
print(generate_text('Puerto', random.randint(3, 8), model_news, max_sequence_len_news))

Actual title: Lin-Manuel Miranda's next act: Helping rebuild Puerto Rico

Model Generate 3 Titles:

Cbs Conway Oliver Steps For The The World The
Sunday Jones Day For Larry Nassar In
Morning Joe January 14 2018 2018 President Trump Breaking

When first word is provided:
Puerto Farrow Is The The


References:

https://thecleverprogrammer.com/2020/10/05/title-generator-with-machine-learning/

https://www.machinelearningplus.com/nlp/text-summarization-approaches-nlp-example/