In [1]:
from __future__ import print_function
import pandas as pd
from keras.models import Model, save_model, load_model
from keras.layers import Input, LSTM, Dense
import numpy as np
import re
from pickle import dump, load
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import seaborn as sns
import math

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arnavgarg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import sys
print(sys.version)

3.7.3 (default, Mar 27 2019, 16:54:48) 
[Clang 4.0.1 (tags/RELEASE_401/final)]


### Importing Dataset

In [3]:
#https://github.com/mahnazkoupaee/WikiHow-Dataset
wikihow = pd.read_csv('wikihowSep.csv')
wikihow = wikihow.dropna()

In [4]:
wikihow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1383743 entries, 0 to 1585694
Data columns (total 5 columns):
overview        1383743 non-null object
headline        1383743 non-null object
text            1383743 non-null object
sectionLabel    1383743 non-null object
title           1383743 non-null object
dtypes: object(5)
memory usage: 63.3+ MB


### Attribute Descriptions
**Title** - the title of the article as it appears on the WikiHow knowledge base <br/>
**Overview** - the introduction section of the WikiHow articles represented before the paragraphs corresponding to procedures <br/>
**Headline** - the bold line (the summary sentence) of the paragraph to serve as the reference summary <br/>
**Text** - the paragraph (except the bold line) to generate the article to be summarized <br/>

In [5]:
wikihow = wikihow.drop(['title','overview','sectionLabel'],1)

In [6]:
wikihow = wikihow.rename(index=str, columns={'headline': "summary"})
wikihow.head(10)

Unnamed: 0,summary,text
0,\nSell yourself first.,"Before doing anything else, stop and sum up y..."
1,\nRead the classics before 1600.,Reading the classics is the very first thing ...
2,\nJoin online artist communities.,Depending on what scale you intend to sell yo...
3,\nMake yourself public.,Get yourself out there as best as you can by ...
4,\nBlog about your artwork.,"Given the hundreds of free blogging websites,..."
5,\nCreate a mailing list.,This could be your most effective tool if man...
6,\nTake good pictures.,"Like they say, ""a picture's worth a thousand ..."
7,\nBe sure to properly license your art.,Licensing art is a way of proving what belong...
8,\nConsider the option of creating your own site.,Having your own site means that you can optim...
9,\nExpect this to be a gradual process and don'...,An online art business needs to be built up l...


In [7]:
wikihow['text'][50]

' It might look like a lot, thus why it\'s very important to look at the seating chart. This will give you a preview of what specific seat you are paying for, the specific price for it, etc. You need to know that every ticket reviewed as a "timer" in the bottom right hand corner. This is the amount of time you have to make a decision if you want the seat(s) or change your mind. You can either "Continue" on to purchase it if you\'re satisfied or "Search Again". Searching again will then bring you back to the main ticket page for the event and place your old review back into the queue for anyone else to grab.\n\n\nKnow that when the general public is allowed to purchase tickets and depending on the demand/popularity of the event, you may/may not receive a better ticket than the last, so be careful how much time you spend on each ticket review.\n\n'

In [8]:
wikihow['summary'][50]

'\nReview the seating ticket on the next screen.'

### Text Processing

In [9]:
contractions = { 
    "ain't": "am not / are not / is not / has not / have not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he shall / he will",
    "he'll've": "he shall have / he will have",
    "he's": "he has / he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is / how does",
    "I'd": "I had / I would",
    "I'd've": "I would have",
    "I'll": "I shall / I will",
    "I'll've": "I shall have / I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have"
}

In [10]:
stops = set(stopwords.words("english"))
def clean_text(text, remove_stopwords=True):
    #Convert word to lower case
    text = text.lower()
    text = text.split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)
    #Getting rid of any potential html tags
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', '', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    if remove_stopwords:
        text = text.split()
        text = [w for w in text if not w in stops]
        text = " ".join(text)
    return text

### Clean Text and Summaries 

In [11]:
clean_texts = []
print("Beginning to clean text samples...")
for text in wikihow.text:
    clean_texts.append(clean_text(text, remove_stopwords=False))
print("Cleaned text extracts!")

Beginning to clean text samples...
Cleaned text extracts!


In [12]:
clean_summaries = []
print("Beginning to clean summaries...")
for summary in wikihow.summary:
    clean_summaries.append(clean_text(summary, remove_stopwords=False))
print("Cleaned summaries!")

Beginning to clean summaries...
Cleaned summaries!


In [13]:
wikihow.text[0]

" Before doing anything else, stop and sum up yourself as an artist. Now, think about how to translate that to an online profile. Be it the few words, Twitter allows you or an entire page of indulgence that your own website would allow you. Bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. Make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. If you're not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.;\n"

In [14]:
clean_texts[0]

'before doing anything else stop and sum up yourself as an artist now think about how to translate that to an online profile be it the few words twitter allows you or an entire page of indulgence that your own website would allow you bring out the most salient features of your creativity your experience your passion and your reasons for painting make it clear to readers why you are an artist who loves art produces high quality art and is a true champion of art if you are not great with words find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability'

#### Save processed text and summaries 

In [15]:
data = list()
for i,text in enumerate(clean_texts):
    data.append({'text': text, 'summary': clean_summaries[i]})
dump(data, open('processed_dataset.pkl', 'wb'))

### Load saved processed data

In [16]:
data = load(open('processed_dataset.pkl', 'rb'))
print('Loaded {} samples from processed_dataset.pkl'.format(len(data)))
print(type(data))

Loaded 1383743 samples from processed_dataset.pkl
<class 'list'>


### Vectorizing data

In [17]:
batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.

In [18]:
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
for obj in data[:min(num_samples, len(data)-1)]:
    input_text = obj['text']
    #\t serves as starting sequence character and \n is ending sequence character
    target_text = '\t' + obj['summary'] + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

In [19]:
list(input_characters)[60:70]

['s', '–', '®', '⅛', 'j', '“', 'a', 'g', 'v', ' ']

In [20]:
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
encoder_seq_length = [len(txt) for txt in input_texts]
decoder_seq_length = [len(txt) for txt in target_texts]
max_encoder_seq_length = max(encoder_seq_length)
max_decoder_seq_length = max(decoder_seq_length)

In [21]:
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 10000
Number of unique input tokens: 75
Number of unique output tokens: 50
Max sequence length for inputs: 3805
Max sequence length for outputs: 1237


In [None]:
length_of_extracts = [len(txt) for txt in input_texts][0:1000]
counts = list(range(0, len(input_texts)))[0:1000]
sns.set()
sns.barplot(x=counts, y=length_of_extracts, palette='rocket')

### Model Construction

In [None]:
def create_models(n_input, n_output, n_units):
    # define training encoder
    encoder_inputs = Input(shape=(None, n_input))
    encoder = LSTM(n_units, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]
    # define training decoder
    decoder_inputs = Input(shape=(None, n_output))
    decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(n_output, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    # define inference encoder
    encoder_model = Model(encoder_inputs, encoder_states)
    # define inference decoder
    decoder_state_input_h = Input(shape=(n_units,))
    decoder_state_input_c = Input(shape=(n_units,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs,  initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
    # return all models
    return model, encoder_model, decoder_model

In [None]:
model, encoder_model, decoder_model = create_models(num_encoder_tokens, num_decoder_tokens, latent_dim)

In [None]:
average_input_length = math.floor(sum(encoder_seq_length)/len(encoder_seq_length))
average_input_length

### Training 

In [None]:
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

In [None]:
input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])

In [None]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.

In [None]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], 
          decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

In [None]:
save_model(model, 'summarizer.h5')

In [None]:
#model = load_model('summarizer.h5')

### Inference 

In [None]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [None]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char
        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        # Update states
        states_value = [h, c]
    return decoded_sentence

In [None]:
#See output predictions for unseen data
for seq_index in range(100000,100050):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)