Installing Metaphor library

In [None]:
!pip install metaphor-python

# Text Summarization 
Text Summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.

### WikiHow
is a new large-scale dataset using the online WikiHow (http://www.wikihow.com) knowledge base. It contains ~2000 articles which have title, text, and headlines(summary).

I have created Attention Based GRU model along with the implementation of teacher forcing and dot product attention mechanism.

After training the dataset for 150000 steps I tested the dataset that I created from the text I obtained using METAPHOR's api. I have stored the text and its predicted summary.


In [3]:
#importing wikiHow dataset
import pandas as pd
df = pd.read_csv("wikihowAll.csv")

In [4]:
import numpy as np
import os
import re
import warnings
warnings.filterwarnings("ignore")
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
df = df.astype(str)

In [6]:
df.shape

(215365, 3)

In [7]:
df = df[df['text'].isnull()==False]
df = df[df['headline'].isnull()==False]
df = df[df['text']!="nan"]
df = df[df['headline']!="nan"]
df.drop_duplicates(inplace=True)

In [8]:
df.shape

(214294, 3)

In [9]:
from stop_words import get_stop_words

In [10]:
stop_words = set(get_stop_words('en')) 

def text_cleaner(text,num):
  str = text.lower()
  str = BeautifulSoup(str, features="lxml").text
  str = re.sub(r'\([^)]*\)', '', str)
  str = re.sub('"','', str)
  str = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in str.split(" ")])    
  str = re.sub(r"'s\b","",str)
  str = re.sub("[^a-zA-Z]", " ", str) 
  str = re.sub('[m]{2,}', 'mm', str)
  if(num==0):
    str = re.sub(r'\.',' . ',str)
  if(num==0):
      tokens = [w for w in str.split() if not w in stop_words]
      
  else:
      tokens=str.split()
  long_words=[]
  for i in tokens:
      if len(i)>1:          #removing short words
          long_words.append(i)
  return (" ".join(long_words)).strip()

In [11]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not","didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not","he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is","I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would","i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would","it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam","mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have","mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have","she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is","should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as","this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would","there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have","they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have","wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are","we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are","what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is","where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have","why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have","would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all","y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have","you're": "you are", "you've": "you have"}

In [14]:
#clean the dataset
clean_text = []
for t in df['text']:
    clean_text.append(text_cleaner(t,0))

In [12]:
#clean the dataset
clean_summary = []
for t in df['headline']:
    clean_summary.append(text_cleaner(t,0))

In [13]:
df['text']=clean_text
df['headline']=clean_summary
# Droping the null rows
df.replace('', np.nan, inplace=True)
df.dropna(axis=0,inplace=True)

In [14]:
# From the graph
# We can fix maximum length of text = 150 since most of the reviews have a length of 150 and maximum headline length of 50, since maximum headlines are of size 50
max_len_text= 150
max_len_headline=50

In [16]:
text1 =np.array(df['text'])
headline1=np.array(df['headline'])

short_text=[]
short_summary=[]

for i in range(len(text1)):
    if(len(headline1[i].split())<=max_len_text and len(text1[i].split())<=max_len_headline):
        short_text.append(text1[i])
        short_summary.append(headline1[i])
        
df=pd.DataFrame({'text':short_text,'summary':short_summary})

In [18]:
print(df['text'][50],df['summary'][50],sep='\n')

even just split seconds will put guy little pickle asked likes girl even know try friend looking knock one books desk right front guy step away quickly jerk will pick now will know talking ask likes friend stalk showering questions will probably little freaked turn lunch go sit table friends one ask reply give time ask next week might little embarrassed asked front friends might laugh something ask see private make sure sound casual otherwise will think like guys will say know maybe gotta think press much just try get definite answer says yes tell everyone school humongous turn knows trying set two doesn say anything even know setting make sure goes talks arrange date humbly take million thank yous gives
get friend dream guy notice casual try ask private yes don yell everyone says tell friend says yes tell friend


In [19]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df['text'],df['summary'],test_size=0.2,random_state=0,shuffle=True) 

In [20]:
print(len(x_train))
print(len(x_test))

65369
16343


In [21]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [23]:
def readLangs(text, summary, reverse=False):
    print("Reading lines...")
    
    # Split every line into pairs and normalize
    text=np.array(text)
    summary=np.array(summary)
    pairs = [[text[i],summary[i]] for i in range(len(text))]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(summary)
        output_lang = Lang(text)
    else:
        input_lang = Lang(text)
        output_lang = Lang(summary)

    return input_lang, output_lang, pairs

In [24]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [25]:
input_lang, output_lang, pairs = prepareData(x_train, y_train , False)
print(random.choice(pairs))

Reading lines...
Read 65369 sentence pairs
Counting words...
Counted words:
['want break scary illusion atmosphere either play day keep bright lights night sound basically half scariness atmosphere game read faq guide panic fighting solving puzzles watch videos game footage can expect coming generally merrier will makes feel better calm'
 'can find settings app one home screens may folder labeled utilities will see fifth group options account set default notes account notes created siri will added account notes displayed notes widget today view will account well can add additional accounts iphone support notes will able sync notes notes app can find fifth group options will see list accounts already added iphone previously added account enable notes can tap enable notes switch will display list accounts can add account listed tap add manually accounts will just need enter email address password iphone will attempt connect account using login information provided switch enabled will abl

In [29]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [30]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [31]:
MAX_LENGTH = 150

In [32]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [33]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [34]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [35]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [36]:
teacher_forcing_ratio = 0.5
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [55]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    print("Training....")
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(84000, n_iters + 1):
        if iter% 1000 == 0:
            print(iter,"/",n_iters + 1)
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        input_length = input_tensor.size(0)
        if(input_length > 150):
          #print(input_length)
          continue

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))
            torch.save(encoder, "encoder"+str(iter)+".pt")
            torch.save(decoder, "decoder"+str(iter)+".pt")
        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [38]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]


In [48]:
enc = torch.load("encoder84000.pt")
dec = torch.load("decoder84000.pt")

In [None]:
hidden_size = 300
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

In [56]:
trainIters(enc, dec, 150000, print_every=1000)

Training....
84000 / 150001
0m 12s (- 0m 9s) (84000 56%) 0.0092
85000 / 150001
2m 9s (- 1m 38s) (85000 56%) 5.2804
86000 / 150001
4m 7s (- 3m 4s) (86000 57%) 5.3114
87000 / 150001
6m 6s (- 4m 25s) (87000 57%) 5.2891
88000 / 150001
8m 3s (- 5m 40s) (88000 58%) 5.3175
89000 / 150001
9m 59s (- 6m 51s) (89000 59%) 5.3471
90000 / 150001
11m 59s (- 7m 59s) (90000 60%) 5.1097
91000 / 150001
13m 56s (- 9m 2s) (91000 60%) 5.2207
92000 / 150001
15m 55s (- 10m 2s) (92000 61%) 5.1956
93000 / 150001
17m 56s (- 10m 59s) (93000 62%) 5.2913
94000 / 150001
19m 56s (- 11m 53s) (94000 62%) 5.1533
95000 / 150001
21m 55s (- 12m 41s) (95000 63%) 5.4039
96000 / 150001
23m 56s (- 13m 28s) (96000 64%) 5.2610
97000 / 150001
25m 54s (- 14m 9s) (97000 64%) 5.3113
98000 / 150001
27m 56s (- 14m 49s) (98000 65%) 5.3845
99000 / 150001
29m 54s (- 15m 24s) (99000 66%) 5.2648
100000 / 150001
31m 52s (- 15m 56s) (100000 66%) 5.2637
101000 / 150001
33m 52s (- 16m 26s) (101000 67%) 5.2289
102000 / 150001
35m 51s (- 16m 52s

In [73]:
torch.save(encoder1, "encoder.pt")
torch.save(attn_decoder1, "decoder.pt")


In [75]:
def text_cleaner_metaphor(text,num):
  str = re.sub("[^a-zA-Z.]", " ", text)  
  #str = re.sub("div", " ", text)  
  tokens=str.split()
    
  long_words=[]
  for i in tokens:
      if(i=="div"):
          continue
      if len(i)>1:          #removing short words
          long_words.append(i)
  return (" ".join(long_words)).strip()

In [13]:
from metaphor_python import Metaphor

In [14]:
metaphor = Metaphor("23c84584-6ebc-4159-9fa1-b3cebf43a8a1")

In [15]:
response = metaphor.search("iphone 12 reviews", num_results = 10, use_autoprompt=True)

In [16]:
res=response.get_contents()

In [None]:
ress = res.contents[0]

In [76]:
import random
clean_text = []
for i in range(500): #maximum requests
    index = random.randrange(20000)
    title = df["title"][index]
    response = metaphor.search(title, num_results = 10, use_autoprompt=True,)
    t = response.get_contents().contents[1].extract
    clean=text_cleaner(t,0)
    clean_text.append(clean)

Exception: Request failed with status code 429. Message: {"error":"API key usage limit reached: 1009/1000"}

In [80]:
pred_df_GRU = pd.DataFrame()
pred_df_GRU["text"] = clean_text

In [None]:
pred_summary=[]
for i in range(len(pred_df_GRU["text"])):
    output_words, attentions = evaluate(enc, dec,pred_df_GRU["text"][i])
    pred_summary+=[output_words]

In [84]:
pred_df_GRU["pred_summary"] = pred_summary

Unnamed: 0,text
0,As partner to woman in labor your biggest role...
1,href https www.trustpilot.com review yeastinfe...
2,The first day of kindergarten is busy busy bus...
3,strong Learn How to Get Better at Soccer with ...
4,article How to relieve tattoo pain or how to m...


In [92]:
pred_df_GRU

Unnamed: 0,text,pred_summary
0,As partner to woman in labor your biggest role...,As partner to woman in labor your biggest role...
1,href https www.trustpilot.com review yeastinfe...,em What is the best yeast infection prevention...
2,The first day of kindergarten is busy busy bus...,The first day of kindergarten is busy busy bus...
3,strong Learn How to Get Better at Soccer with ...,
4,article How to relieve tattoo pain or how to m...,article How to relieve tattoo pain or how to m...
...,...,...
70,DIY Cat Condo href http .bp.blogspot.com lZoVJ...,DIY Cat Condo href http .bp.blogspot.com lZoVJ...
71,Vagina is one of the organs in woman body char...,Vagina is one of the organs in woman body char...
72,main section header header article figure figc...,main section header header article figure figc...
73,So you want to know how to tell if guy likes y...,So you want to know how to tell if guy likes y...


In [83]:
pred_df_GRU.to_csv("pred_df.csv")

In [93]:
html_table = pred_df_GRU.to_html(index=False)

# Save the HTML table to a file
with open('my-table.html', 'w') as f:
    f.write(html_table)