<a href="https://colab.research.google.com/github/sayarghoshroy/Summarization/blob/master/summarization_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pickle
import spacy
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neural_network import MLPRegressor as mlp

import warnings
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# change to path to dataset

## for fasttext embeddings use this
#file_name = "/content/drive/My Drive/NLA project/fasttext.pkl"

## for infersent embeddings use this
file_name = "/content/drive/My Drive/NLA project/infersent_emb.pkl"

stories = pickle.load(open(file_name, 'rb'))

In [0]:
# basic embeddings using averaged glove vectors
# using Spacy's large language model
def get_embedding(text):
    extract = embedder(text)
    total_sum = np.zeros(300)
    count = 0
    for token in extract:
        count += 1
        total_sum += np.asarray(token.vector)
    return total_sum / count

In [0]:
EMB = 'infersent'
#EMB = 'fasttext_emb'

In [0]:
def get_document_embedding(data):
  ## for infersent use 4096, for fasttext and glove use 300
  num_dimensions = 4096
  total_sum = np.zeros(num_dimensions)
  for vector in data[EMB]:
    total_sum += np.asarray(vector)
  return total_sum/len(data[EMB])  


In [0]:
# creating the inputs and expected outputs
X_train = []
y_train = []
count = 0
for data in stories:
    count += 1
    doc_emb = get_document_embedding(data)
    # use the function of choice to generate the document embedding

    index = 0
    for sentence in data[EMB]:
        sent_emb = sentence
        # use the function of choice to generate the sentence embedding

        x = np.concatenate((sent_emb, doc_emb))
        y = data['scores'][index] 
        index += 1

        X_train.append(x)
        y_train.append(y)

    if count > 100:
        break

X_train = np.asmatrix(X_train)
y_train = np.asarray(y_train)

In [0]:
def train(X, y):
    model = mlp(hidden_layer_sizes = (2048, 2048, 1024, 512, 256), max_iter = 100)
    model.fit(X, y)
    return model

def get_values(X, model):
    return model.predict(X)

In [0]:
m = train(X_train, 1000 * y_train)

In [0]:
filename = 'infersent_model.sav'
pickle.dump(m, open(filename, 'wb'))

In [0]:
# Hyperparameter for similarity threshold
theta = 0.95

def similarity(A, B):
    similarity =  (A @ B.T) / (np.linalg.norm(A) * np.linalg.norm(B))
    return similarity

def get_top_num(X_doc, y, num):
    order = np.flip(np.argsort(y))
    sentence_set = []
    for sent_id in order:
        if sentence_set == []:
            sentence_set.append(order[0])
            continue

        consider = X_doc[sent_id, :]
        flag = 1
        for consider_id in sentence_set:
            if similarity(X_doc[consider_id, :], consider) > theta:
                flag = 0
                break

        if flag == 1:
            sentence_set.append(sent_id)
    return sentence_set[0: min(num, len(sentence_set))]

In [34]:
# evaluation
# testing out each document iteratively
# test set: document 950 onwards

doc_id = 955
doc_count = len(stories)

# set the number of documents for testing
limit = 965

while doc_id < min(doc_count, limit):
    X_doc = []
    y_doc = []
    data = stories[doc_id]
    doc_emb = get_document_embedding(data)

    index = 0
    for sentence in data[EMB]:
        sent_emb = sentence

        x = np.concatenate((sent_emb, doc_emb))
        y = data['scores'][index] 

        index += 1

        X_doc.append(x)
        y_doc.append(y)

    X_doc = np.asmatrix(X_doc)
    y_doc = np.asarray(y_doc)

    sentence_predicted_scores = get_values(X_doc, m)

    loss = np.linalg.norm(sentence_predicted_scores - y_doc)

    # Uncomment to view the test_loss on the sample  
    # print(loss)

    print("Document ID:", doc_id, ", Top 5 Sentences:", get_top_num(X_doc, sentence_predicted_scores,5))

    # Uncomment to view the top 10 sentences based on Gold Labels
    print("Top 10 sentences based on Gold Label", np.flip(np.argsort(y_doc))[0:10])
    doc_id += 1

Document ID: 955 , Top 5 Sentences: [6, 3, 2, 11, 10]
Top 10 sentences based on Gold Label [ 1 13  0  5 12  3 14 11 10  9]
Document ID: 956 , Top 5 Sentences: [4, 0, 1, 7, 5]
Top 10 sentences based on Gold Label [1 0 6 4 3 5 7 2]
Document ID: 957 , Top 5 Sentences: [3, 2, 8, 12, 26]
Top 10 sentences based on Gold Label [ 3 18 26  4 23  2  0 22  6 19]
Document ID: 958 , Top 5 Sentences: [0, 6, 4, 9, 8]
Top 10 sentences based on Gold Label [0 8 9 7 1 4 6 5 3 2]
Document ID: 959 , Top 5 Sentences: [7, 3, 1, 6, 0]
Top 10 sentences based on Gold Label [4 1 6 2 5 7 3 0]
Document ID: 960 , Top 5 Sentences: [9, 8, 14, 7, 12]
Top 10 sentences based on Gold Label [ 1  2  9 17  7 10  0  8 16 11]
Document ID: 961 , Top 5 Sentences: [2, 0, 5, 3, 1]
Top 10 sentences based on Gold Label [2 0 5 3 4 1]
Document ID: 962 , Top 5 Sentences: [5, 0, 4, 2, 3]
Top 10 sentences based on Gold Label [2 3 7 0 8 6 5 4 1]
Document ID: 963 , Top 5 Sentences: [21, 35, 33, 28, 5]
Top 10 sentences based on Gold Label [

In [0]:
# ^_^ Thank You

In [0]:
loaded_model = pickle.load(open('infersent_model.sav', 'rb'))

In [73]:
doc_id = 955
doc_count = len(stories)

# set the number of documents for testing
limit = 965

while doc_id < min(doc_count, limit):
    X_doc = []
    y_doc = []
    data = stories[doc_id]
    doc_emb = get_document_embedding(data)

    index = 0
    for sentence in data[EMB]:
        sent_emb = sentence

        x = np.concatenate((sent_emb, doc_emb))
        y = data['scores'][index] 

        index += 1

        X_doc.append(x)
        y_doc.append(y)

    X_doc = np.asmatrix(X_doc)
    y_doc = np.asarray(y_doc)

    sentence_predicted_scores = get_values(X_doc, loaded_model)

    loss = np.linalg.norm(sentence_predicted_scores - y_doc)

    # Uncomment to view the test_loss on the sample  
    # print(loss)

    print("Document ID:", doc_id, ", Top 5 Sentences:", get_top_num(X_doc, sentence_predicted_scores,3))

    # Uncomment to view the top 10 sentences based on Gold Labels
    print("Top 10 sentences based on Gold Label", np.flip(np.argsort(y_doc))[0:10])
    doc_id += 1

Document ID: 955 , Top 5 Sentences: [6, 3, 2]
Top 10 sentences based on Gold Label [ 1 13  0  5 12  3 14 11 10  9]
Document ID: 956 , Top 5 Sentences: [4, 0, 1]
Top 10 sentences based on Gold Label [1 0 6 4 3 5 7 2]
Document ID: 957 , Top 5 Sentences: [3, 2, 8]
Top 10 sentences based on Gold Label [ 3 18 26  4 23  2  0 22  6 19]
Document ID: 958 , Top 5 Sentences: [0, 6, 4]
Top 10 sentences based on Gold Label [0 8 9 7 1 4 6 5 3 2]
Document ID: 959 , Top 5 Sentences: [7, 3, 1]
Top 10 sentences based on Gold Label [4 1 6 2 5 7 3 0]
Document ID: 960 , Top 5 Sentences: [9, 8, 14]
Top 10 sentences based on Gold Label [ 1  2  9 17  7 10  0  8 16 11]
Document ID: 961 , Top 5 Sentences: [2, 0, 5]
Top 10 sentences based on Gold Label [2 0 5 3 4 1]
Document ID: 962 , Top 5 Sentences: [5, 0, 4]
Top 10 sentences based on Gold Label [2 3 7 0 8 6 5 4 1]
Document ID: 963 , Top 5 Sentences: [21, 35, 33]
Top 10 sentences based on Gold Label [14 20  1  0 31 13 22 21 19 18]
Document ID: 964 , Top 5 Sent

In [41]:
!pip install rouge

Collecting rouge
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Installing collected packages: rouge
Successfully installed rouge-1.0.0


In [0]:
from rouge import Rouge 

In [0]:
file_name = "/content/drive/My Drive/NLA project/sent2.pkl"

original_summaries = pickle.load(open(file_name, 'rb'))

In [75]:
av = 0
for data in original_summaries:
  av+=len(data["highlights"])
print(av/1000) 

3.558


In [0]:
gold_summaries = []
for data in original_summaries:
  summary = " ".join(data["highlights"])
  gold_summaries.append(summary)

In [93]:
gold_summaries[0]

"haleh esfandiari says she read, walked, wrote a book in her mind while in prison scholar arrived home thursday after iran forbade her to leave for eight months iranian government never said why they released esfandiari from jail last month wilson center: ayatollah's letter marked first-ever response to american leader"

In [0]:
predicted_summaries = []
exclude = []

doc_id = 0
doc_count = len(stories)

limit = doc_count

while doc_id < min(doc_count, limit):
    try:
      X_doc = []
      y_doc = []
      data = stories[doc_id]
      doc_emb = get_document_embedding(data)

      index = 0
      for sentence in data[EMB]:
          sent_emb = sentence

          x = np.concatenate((sent_emb, doc_emb))
          y = data['scores'][index] 

          index += 1

          X_doc.append(x)
          y_doc.append(y)

      X_doc = np.asmatrix(X_doc)
      y_doc = np.asarray(y_doc)

      sentence_predicted_scores = get_values(X_doc, loaded_model)

      loss = np.linalg.norm(sentence_predicted_scores - y_doc)

      # Uncomment to view the test_loss on the sample  
      # print(loss)

      ids = get_top_num(X_doc, sentence_predicted_scores,4)
      summary = ''
      for idx in ids:
        summary += original_summaries[doc_id]['story'][idx]
      predicted_summaries.append(summary)  
    except:
      exclude.append(doc_id)
    doc_id+=1


  

In [0]:
for ind in exclude:
  del gold_summaries[ind]

In [96]:
print(len(gold_summaries),len(predicted_summaries))

998 998


In [0]:
rouge = Rouge()
scores = []
for i in range(0,len(gold_summaries)):
  scores.append(rouge.get_scores(predicted_summaries[i], gold_summaries[i]))

In [0]:
average_rouge = {
    'rouge_1':{'f' : 0, 'p' : 0, 'r' : 0},
    'rouge_2':{'f' : 0, 'p' : 0, 'r' : 0},
    'rouge_l':{'f' : 0, 'p' : 0, 'r' : 0},
}
for sc in scores:
  score = sc[0]
  average_rouge['rouge_1']['f'] += score['rouge-1']['f']
  average_rouge['rouge_1']['p'] += score['rouge-1']['p']
  average_rouge['rouge_1']['r'] += score['rouge-1']['r']
  average_rouge['rouge_2']['f'] += score['rouge-2']['f']
  average_rouge['rouge_2']['p'] += score['rouge-2']['p']
  average_rouge['rouge_2']['r'] += score['rouge-2']['r']
  average_rouge['rouge_l']['f'] += score['rouge-l']['f']
  average_rouge['rouge_l']['p'] += score['rouge-l']['p']
  average_rouge['rouge_l']['r'] += score['rouge-l']['r']

average_rouge['rouge_1']['f'] /= len(scores)
average_rouge['rouge_1']['p'] /= len(scores)
average_rouge['rouge_1']['r'] /= len(scores)
average_rouge['rouge_2']['f'] /= len(scores)
average_rouge['rouge_2']['p'] /= len(scores)
average_rouge['rouge_2']['r'] /= len(scores)
average_rouge['rouge_l']['f'] /= len(scores)
average_rouge['rouge_l']['p'] /= len(scores)
average_rouge['rouge_l']['r'] /= len(scores)

In [70]:
print("For top five sentences")
for key,value in average_rouge.items():
  print(key)
  for metric, val in value.items():
    print(metric+"\t"+str(val)) 
  print("--------------------------") 

For top five sentences
rouge_1
f	0.21218549949989918
p	0.13743780821078694
r	0.514433460338619
--------------------------
rouge_2
f	0.07780463190895977
p	0.05039379155868287
r	0.18894813470729832
--------------------------
rouge_l
f	0.20578917643633968
p	0.13901717381078943
r	0.427076310748189
--------------------------


In [99]:
print("For top four sentences")
for key,value in average_rouge.items():
  print(key)
  for metric, val in value.items():
    print(metric+"\t"+str(val)) 
  print("--------------------------") 

For top four sentences
rouge_1
f	0.2263082742007271
p	0.15504706773578525
r	0.4615344756485788
--------------------------
rouge_2
f	0.07889476867782748
p	0.05397341665789107
r	0.16101318569672227
--------------------------
rouge_l
f	0.2087232440138955
p	0.14863005286219036
r	0.3777621191330641
--------------------------


In [91]:
print("For top three sentences")
for key,value in average_rouge.items():
  print(key)
  for metric, val in value.items():
    print(metric+"\t"+str(val)) 
  print("--------------------------")

For top three sentences
rouge_1
f	0.23837762616307634
p	0.17870970342922088
r	0.3948210652772677
--------------------------
rouge_2
f	0.0782952265255066
p	0.0586625199432189
r	0.1290943302874871
--------------------------
rouge_l
f	0.20978198592978672
p	0.16219584658887584
r	0.32059361189593455
--------------------------
