<a href="https://colab.research.google.com/github/sayarghoshroy/Summarization/blob/master/summarization_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pickle
import spacy
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neural_network import MLPRegressor as mlp

import warnings
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# change to path to dataset
file_name = "/content/drive/My Drive/NLA project/fasttext.pkl"
stories = pickle.load(open(file_name, 'rb'))

In [0]:
# basic embeddings using averaged glove vectors
# using Spacy's large language model
def get_embedding(text):
    extract = embedder(text)
    total_sum = np.zeros(300)
    count = 0
    for token in extract:
        count += 1
        total_sum += np.asarray(token.vector)
    return total_sum / count

In [0]:
def get_document_embedding(data):
  total_sum = np.zeros(300)
  for vector in data["fasttext_emb"]:
    total_sum += np.asarray(vector)
  return total_sum/len(data["fasttext_emb"])  


In [0]:
# creating the inputs and expected outputs
X_train = []
y_train = []
count = 0
for data in stories:
    count += 1
    doc_emb = get_document_embedding(data)
    # use the function of choice to generate the document embedding

    index = 0
    for sentence in data['fasttext_emb']:
        sent_emb = sentence
        # use the function of choice to generate the sentence embedding

        x = np.concatenate((sent_emb, doc_emb))
        y = data['scores'][index] 
        index += 1

        X_train.append(x)
        y_train.append(y)

    if count > 100:
        break

X_train = np.asmatrix(X_train)
y_train = np.asarray(y_train)

In [0]:
def train(X, y):
    model = mlp(hidden_layer_sizes = (1024, 2048, 1024, 512, 256), max_iter = 100)
    model.fit(X, y)
    return model

def get_values(X, model):
    return model.predict(X)

In [0]:
m = train(X_train, 1000 * y_train)

In [0]:
filename = 'fasttext_model.sav'
pickle.dump(m, open(filename, 'wb'))

In [0]:
# Hyperparameter for similarity threshold
theta = 0.95

def similarity(A, B):
    similarity =  (A @ B.T) / (np.linalg.norm(A) * np.linalg.norm(B))
    return similarity

def get_top_num(X_doc, y,num):
    order = np.flip(np.argsort(y))
    sentence_set = []
    for sent_id in order:
        if sentence_set == []:
            sentence_set.append(order[0])
            continue

        consider = X_doc[sent_id, :]
        flag = 1
        for consider_id in sentence_set:
            if similarity(X_doc[consider_id, :], consider) > theta:
                flag = 0
                break

        if flag == 1:
            sentence_set.append(sent_id)
    return sentence_set[0: min(num, len(sentence_set))]

In [11]:
# evaluation
# testing out each document iteratively
# test set: document 950 onwards

doc_id = 980
doc_count = len(stories)

# set the number of documents for testing
limit = 990

while doc_id < min(doc_count, limit):
    X_doc = []
    y_doc = []
    data = stories[doc_id]
    doc_emb = get_document_embedding(data)

    index = 0
    for sentence in data['fasttext_emb']:
        sent_emb = sentence

        x = np.concatenate((sent_emb, doc_emb))
        y = data['scores'][index] 

        index += 1

        X_doc.append(x)
        y_doc.append(y)

    X_doc = np.asmatrix(X_doc)
    y_doc = np.asarray(y_doc)

    sentence_predicted_scores = get_values(X_doc, m)

    loss = np.linalg.norm(sentence_predicted_scores - y_doc)

    # Uncomment to view the test_loss on the sample  
    # print(loss)

    print("Document ID:", doc_id, ", Top 5 Sentences:", get_top_num(X_doc, sentence_predicted_scores,3))

    # Uncomment to view the top 10 sentences based on Gold Labels
    print("Top 10 sentences based on Gold Label", np.flip(np.argsort(y_doc))[0:10])
    doc_id += 1

Document ID: 980 , Top 5 Sentences: [7, 3, 4, 11, 1]
Top 10 sentences based on Gold Label [12 11  1  0  5  2 14  4  6  7]
Document ID: 981 , Top 5 Sentences: [49, 48, 37, 12, 33]
Top 10 sentences based on Gold Label [ 2 25 14 17 31 13 22 21 20 19]
Document ID: 982 , Top 5 Sentences: [3, 9, 11, 2, 13]
Top 10 sentences based on Gold Label [ 7  1 19  4 11  0 16 12 17 20]
Document ID: 983 , Top 5 Sentences: [10, 3, 9, 7, 5]
Top 10 sentences based on Gold Label [ 9  4 10  1  7  0 13 12 11  8]
Document ID: 984 , Top 5 Sentences: [51, 38, 46, 50, 47]
Top 10 sentences based on Gold Label [ 1 54 57  0 53  2 27 13  3 16]
Document ID: 985 , Top 5 Sentences: [3, 13, 4, 6, 10]
Top 10 sentences based on Gold Label [ 6  0 15 14 13 12 11 10  9  8]
Document ID: 986 , Top 5 Sentences: [8, 14, 15, 13, 1]
Top 10 sentences based on Gold Label [ 0  1  2  4 12  9  3  5 15 14]
Document ID: 987 , Top 5 Sentences: [3, 34, 32, 37, 23]
Top 10 sentences based on Gold Label [15  0  2 26  8  5 11 17 16 14]
Document 

In [0]:
# ^_^ Thank You

In [0]:
loaded_model = pickle.load(open('fasttext_model.sav', 'rb'))

In [45]:
!pip install rouge



In [0]:
from rouge import Rouge 

In [0]:
file_name = "/content/drive/My Drive/NLA project/sent2.pkl"

original_summaries = pickle.load(open(file_name, 'rb'))

In [0]:
gold_summaries = []
for data in original_summaries:
  summary = " ".join(data["highlights"])
  gold_summaries.append(summary)

In [87]:
gold_summaries[0]

"haleh esfandiari says she read, walked, wrote a book in her mind while in prison scholar arrived home thursday after iran forbade her to leave for eight months iranian government never said why they released esfandiari from jail last month wilson center: ayatollah's letter marked first-ever response to american leader"

In [0]:
predicted_summaries = []
exclude = []

doc_id = 0
doc_count = len(stories)

limit = doc_count

while doc_id < min(doc_count, limit):
    try:
      X_doc = []
      y_doc = []
      data = stories[doc_id]
      doc_emb = get_document_embedding(data)

      index = 0
      for sentence in data["fasttext_emb"]:
          sent_emb = sentence

          x = np.concatenate((sent_emb, doc_emb))
          y = data['scores'][index] 

          index += 1

          X_doc.append(x)
          y_doc.append(y)

      X_doc = np.asmatrix(X_doc)
      y_doc = np.asarray(y_doc)

      sentence_predicted_scores = get_values(X_doc, loaded_model)

      loss = np.linalg.norm(sentence_predicted_scores - y_doc)

      # Uncomment to view the test_loss on the sample  
      # print(loss)

      ids = get_top_num(X_doc, sentence_predicted_scores,4)
      summary = ''
      for idx in ids:
        summary += original_summaries[doc_id]['story'][idx]
      predicted_summaries.append(summary) 
    except:
      exclude.append(doc_id)

    doc_id+=1

In [107]:
exclude

[269, 691]

In [0]:
for ind in exclude:
  del gold_summaries[ind]

In [113]:
print(len(gold_summaries),len(predicted_summaries))

998 998


In [0]:
rouge = Rouge()
scores = []
for i in range(0,len(gold_summaries)):
  scores.append(rouge.get_scores(predicted_summaries[i], gold_summaries[i]))

In [115]:
scores[0][0]

{'rouge-1': {'f': 0.2087912048061829, 'p': 0.14393939393939395, 'r': 0.38},
 'rouge-2': {'f': 0.05555555159321016,
  'p': 0.03816793893129771,
  'r': 0.10204081632653061},
 'rouge-l': {'f': 0.17391303903381655,
  'p': 0.13043478260869565,
  'r': 0.2608695652173913}}

In [0]:
average_rouge = {
    'rouge_1':{'f' : 0, 'p' : 0, 'r' : 0},
    'rouge_2':{'f' : 0, 'p' : 0, 'r' : 0},
    'rouge_l':{'f' : 0, 'p' : 0, 'r' : 0},
}
for sc in scores:
  score = sc[0]
  average_rouge['rouge_1']['f'] += score['rouge-1']['f']
  average_rouge['rouge_1']['p'] += score['rouge-1']['p']
  average_rouge['rouge_1']['r'] += score['rouge-1']['r']
  average_rouge['rouge_2']['f'] += score['rouge-2']['f']
  average_rouge['rouge_2']['p'] += score['rouge-2']['p']
  average_rouge['rouge_2']['r'] += score['rouge-2']['r']
  average_rouge['rouge_l']['f'] += score['rouge-l']['f']
  average_rouge['rouge_l']['p'] += score['rouge-l']['p']
  average_rouge['rouge_l']['r'] += score['rouge-l']['r']

average_rouge['rouge_1']['f'] /= len(scores)
average_rouge['rouge_1']['p'] /= len(scores)
average_rouge['rouge_1']['r'] /= len(scores)
average_rouge['rouge_2']['f'] /= len(scores)
average_rouge['rouge_2']['p'] /= len(scores)
average_rouge['rouge_2']['r'] /= len(scores)
average_rouge['rouge_l']['f'] /= len(scores)
average_rouge['rouge_l']['p'] /= len(scores)
average_rouge['rouge_l']['r'] /= len(scores)

In [105]:
print("for top 3 sentences")
for key,value in average_rouge.items():
  print(key)
  for metric, val in value.items():
    print(metric+"\t"+str(val)) 
  print("--------------------------")  

for top 3 sentences
rouge_1
f	0.147817491063985
p	0.1191565027549791
r	0.21764581086768142
--------------------------
rouge_2
f	0.032386031740500275
p	0.025221901049095097
r	0.048653199042364106
--------------------------
rouge_l
f	0.12486465817585539
p	0.10309968285602499
r	0.17325227575193802
--------------------------


In [117]:
print("for top 4 sentences")
for key,value in average_rouge.items():
  print(key)
  for metric, val in value.items():
    print(metric+"\t"+str(val)) 
  print("--------------------------")  

for top 4 sentences
rouge_1
f	0.23079145635431822
p	0.16910484132552217
r	0.407750460829676
--------------------------
rouge_2
f	0.07574577683375168
p	0.05437105405999294
r	0.1374281904099026
--------------------------
rouge_l
f	0.2050603948271745
p	0.15447521739850095
r	0.33267502866279164
--------------------------


In [96]:
print("for top 5 sentences")
for key,value in average_rouge.items():
  print(key)
  for metric, val in value.items():
    print(metric+"\t"+str(val)) 
  print("--------------------------")  

for top 5 sentences
rouge_1
f	0.22238089529088254
p	0.15288740384962163
r	0.45631344462363566
--------------------------
rouge_2
f	0.07475935402793302
p	0.05056098905490247
r	0.15767976167084463
--------------------------
rouge_l
f	0.2044236926545439
p	0.14525485867624255
r	0.3750272634696634
--------------------------
