<a href="https://colab.research.google.com/github/sayarghoshroy/Summarization/blob/master/summarization_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pickle
import spacy
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neural_network import MLPRegressor as mlp

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Mount Google Drive at /content/drive so the dataset pickle stored under
# "My Drive" can be read by the loading cell.
# NOTE: `google.colab` is only importable inside a Colab runtime, and mounting
# prompts for interactive authorization.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# change to path to dataset
file_name = "/content/drive/My Drive/NLA project/fasttext.pkl"

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving an open handle for the garbage collector to reclaim.
with open(file_name, 'rb') as dataset_file:
    stories = pickle.load(dataset_file)

In [0]:
# basic embeddings using averaged glove vectors
# using Spacy's large language model
def get_embedding(text, nlp=None):
    """Return the average of the token vectors of `text` as a 300-dim array.

    Args:
        text: Input handed to the embedding pipeline.
        nlp: Optional callable that maps `text` to an iterable of tokens, each
            exposing a `.vector` attribute. When omitted, the notebook-level
            `embedder` is used.
            NOTE(review): `embedder` is not defined in any visible cell; it
            must be created (presumably a loaded spaCy model) before calling
            this function without `nlp`.

    Returns:
        np.ndarray of shape (300,). A zero vector is returned when the
        pipeline yields no tokens, instead of the NaN array that 0/0 used to
        produce silently (warnings are suppressed in this notebook).
    """
    pipeline = embedder if nlp is None else nlp

    total_sum = np.zeros(300)
    count = 0
    for token in pipeline(text):
        total_sum += np.asarray(token.vector)
        count += 1

    if count == 0:
        # No tokens: total_sum is still all zeros; avoid dividing by zero.
        return total_sum
    return total_sum / count

In [0]:
def get_document_embedding(data):
    """Return the mean of a document's sentence vectors as a 300-dim array.

    Args:
        data: Mapping with a "fasttext_emb" entry holding one vector per
            sentence; each vector must be addable to a length-300 array.

    Returns:
        np.ndarray of shape (300,). A zero vector is returned for a document
        with no sentence vectors, instead of the NaN array that dividing by
        a zero length used to produce silently.
    """
    sentence_vectors = data["fasttext_emb"]
    total_sum = np.zeros(300)

    if len(sentence_vectors) == 0:
        # Nothing to average; avoid 0/0.
        return total_sum

    for vector in sentence_vectors:
        total_sum += np.asarray(vector)
    return total_sum / len(sentence_vectors)


In [0]:
# creating the inputs and expected outputs
# Each training row is [sentence embedding | document embedding]; the target
# is the matching entry of data['scores'].

# Number of leading documents used for training. The earlier `count > 100`
# cut-off admitted 101 documents, so that size is kept here.
N_TRAIN_DOCS = 101

X_train = []
y_train = []
for count, data in enumerate(stories, start=1):
    doc_emb = get_document_embedding(data)
    # use the function of choice to generate the document embedding

    for index, sent_emb in enumerate(data['fasttext_emb']):
        # use the function of choice to generate the sentence embedding
        X_train.append(np.concatenate((sent_emb, doc_emb)))
        y_train.append(data['scores'][index])

    if count >= N_TRAIN_DOCS:
        break

# Plain ndarrays, not np.matrix: scikit-learn >= 1.2 raises a TypeError when an
# estimator is given an np.matrix, and NumPy no longer recommends the class.
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [0]:
def train(X, y, hidden_layer_sizes=(1024, 2048, 1024, 512, 256), max_iter=100,
          random_state=None):
    """Fit an MLP regressor on (X, y) and return the fitted model.

    Args:
        X: 2-D feature array-like, one row per sample.
        y: Target values, one per row of X.
        hidden_layer_sizes: Width of each hidden layer; the default is the
            architecture this notebook has always used.
        max_iter: Maximum number of training iterations passed to the model.
        random_state: Seed forwarded to the regressor. The default None keeps
            the previous unseeded behaviour; pass an int for repeatable runs.

    Returns:
        The fitted regressor.
    """
    model = mlp(
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=max_iter,
        random_state=random_state,
    )
    # np.asarray makes np.matrix inputs acceptable: scikit-learn >= 1.2
    # rejects np.matrix with a TypeError.
    model.fit(np.asarray(X), np.asarray(y))
    return model

def get_values(X, model):
    """Return `model.predict` evaluated on the feature rows in X.

    X is converted with np.asarray first, so an np.matrix argument reaches the
    model as a plain ndarray (scikit-learn >= 1.2 rejects np.matrix inputs).
    """
    return model.predict(np.asarray(X))

In [0]:
# The targets are multiplied by a constant before fitting, so everything the
# model predicts is on that scaled range; any comparison with the raw scores
# has to apply the same factor.
# NOTE(review): the reason for scaling is not stated in the notebook —
# presumably the raw scores are small; confirm.
TARGET_SCALE = 1000
m = train(X_train, TARGET_SCALE * y_train)

In [0]:
# Hyperparameter for similarity threshold:
# get_top_5 skips a candidate sentence when its similarity() to any sentence
# already selected is greater than theta.
theta = 0.95

def similarity(A, B):
    """Cosine similarity between two vectors, returned as a plain float.

    Args:
        A, B: Vectors of equal length. 1-D arrays, single-row 2-D arrays and
            np.matrix rows are all accepted; each is flattened first.

    Returns:
        float: dot(A, B) / (||A|| * ||B||), or 0.0 when either vector has zero
        norm (the unguarded division produced NaN in that case).
    """
    a = np.asarray(A, dtype=float).ravel()
    b = np.asarray(B, dtype=float).ravel()

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero vector has no direction; report it as not similar.
        return 0.0
    return float(a @ b / denominator)

def get_top_5(X_doc, y, k=5):
    """Greedily pick up to `k` high-scoring, mutually dissimilar sentences.

    Sentences are visited from highest to lowest score. A sentence is kept
    unless its similarity() to an already kept sentence exceeds the global
    threshold `theta`.

    Args:
        X_doc: 2-D array (or matrix) with one feature row per sentence.
        y: Score per sentence, aligned with the rows of X_doc.
        k: Maximum number of sentences to return (default 5).

    Returns:
        list[int]: Selected row indices, best score first; shorter than `k`
        when fewer sentences pass the similarity filter.
    """
    # NOTE(review): similarity is computed on whole rows of X_doc. If the rows
    # also contain a document-level part shared by every sentence, the scores
    # are pushed towards 1 — consider comparing only the sentence part.
    order = np.flip(np.argsort(y))
    sentence_set = []
    for sent_id in order:
        if len(sentence_set) >= k:
            # Only the first k selections are returned, so stop scanning.
            break

        consider = X_doc[sent_id, :]
        is_redundant = any(
            similarity(X_doc[chosen_id, :], consider) > theta
            for chosen_id in sentence_set
        )
        if not is_redundant:
            # Plain ints keep the printed result readable on every NumPy version.
            sentence_set.append(int(sent_id))
    return sentence_set

In [22]:
# evaluation
# testing out each document iteratively
# test range: documents start_doc_id .. limit - 1, capped at the dataset size

start_doc_id = 980
doc_count = len(stories)

# exclusive upper bound on the document ids used for testing
limit = 990

# The model was fit on targets multiplied by 1000, so gold scores must be put
# on the same scale before they are compared with its predictions.
TARGET_SCALE = 1000

for doc_id in range(start_doc_id, min(doc_count, limit)):
    data = stories[doc_id]
    sentence_embs = data['fasttext_emb']
    if len(sentence_embs) == 0:
        # Nothing to rank; an empty feature array cannot be passed to predict.
        continue

    doc_emb = get_document_embedding(data)

    # One row per sentence: [sentence embedding | document embedding].
    # Plain ndarrays are used because scikit-learn >= 1.2 rejects np.matrix.
    X_doc = np.asarray(
        [np.concatenate((sent_emb, doc_emb)) for sent_emb in sentence_embs]
    )
    y_doc = np.asarray(
        [data['scores'][index] for index in range(len(sentence_embs))]
    )

    sentence_predicted_scores = get_values(X_doc, m)

    # L2 distance between predictions and gold scores on the training scale.
    loss = np.linalg.norm(sentence_predicted_scores - TARGET_SCALE * y_doc)

    # Uncomment to view the test_loss on the sample
    # print(loss)

    print("Document ID:", doc_id, ", Top 5 Sentences:", get_top_5(X_doc, sentence_predicted_scores))

    # Top 10 sentences based on Gold Labels, shown for comparison
    print("Top 10 sentences based on Gold Label", np.flip(np.argsort(y_doc))[0:10])

Document ID: 980 , Top 5 Sentences: [11, 7, 4, 1, 5]
Top 10 sentences based on Gold Label [12 11  1  0  5  2 14  4  6  7]
Document ID: 981 , Top 5 Sentences: [49, 20, 14, 37, 9]
Top 10 sentences based on Gold Label [ 2 25 14 17 31 13 22 21 20 19]
Document ID: 982 , Top 5 Sentences: [3, 12, 17, 1, 2]
Top 10 sentences based on Gold Label [ 7  1 19  4 11  0 16 12 17 20]
Document ID: 983 , Top 5 Sentences: [9, 12, 10, 5, 7]
Top 10 sentences based on Gold Label [ 9  4 10  1  7  0 13 12 11  8]
Document ID: 984 , Top 5 Sentences: [50, 31, 38, 11, 51]
Top 10 sentences based on Gold Label [ 1 54 57  0 53  2 27 13  3 16]
Document ID: 985 , Top 5 Sentences: [4, 0, 6, 5, 12]
Top 10 sentences based on Gold Label [ 6  0 15 14 13 12 11 10  9  8]
Document ID: 986 , Top 5 Sentences: [0, 8, 15, 13, 14]
Top 10 sentences based on Gold Label [ 0  1  2  4 12  9  3  5 15 14]
Document ID: 987 , Top 5 Sentences: [34, 3, 37, 28, 9]
Top 10 sentences based on Gold Label [15  0  2 26  8  5 11 17 16 14]
Document ID

In [0]:
# ^_^ Thank You