# Automated Document Summarization
## subsection of _Text Summarization and Topic Models_

* Automated Document Summarization
    1. Text Wrangling
    2. Text Representation with Feature Engineering
    3. Latent Semantic Analysis
    4. TextRank

In [None]:
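# download document.txt and other packages; this cell is expected to load the
# file's text into DOCUMENT (a str) and fetch the NLTK stop-word and tokenizer
# data that the later cells use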

In [None]:
# preprocessing
import re

# Flatten the text onto a single line: each newline / carriage return becomes
# a space, runs of spaces collapse to one, and both ends are trimmed.
DOCUMENT = re.sub(r' +', ' ', re.sub(r'\n|\r', ' ', DOCUMENT)).strip()

In [None]:
# implement document summarization using Gensim's summarization module
# NOTE(review): the `gensim.summarization` package was removed in Gensim 4.0,
# so this import presumably requires gensim<4.0 -- confirm the pinned version.
from gensim.summarization import summarize

# ratio=0.2 presumably requests a summary of about 20% of the document and
# split=False a single string rather than a list -- confirm in the Gensim docs.
print(summarize(DOCUMENT, ratio=0.2, split=False))

In [None]:
# limit summarization based on word count instead of proportions
# word_count=75 presumably caps the summary at about 75 words -- confirm in
# the Gensim docs. `summarize` is the function imported in the previous cell.
print(summarize(DOCUMENT, word_count=75, split=False))

## Text Wrangling

In [None]:
import nltk
import numpy as np
import re

stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize one sentence/document for bag-of-words feature extraction.

    Removes every character that is not an ASCII letter or whitespace,
    lower-cases and strips the text, tokenizes it with NLTK and drops the
    English stop words.

    Args:
        doc: Raw text of a single sentence or document.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # `flags` has to be given by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing the flags positionally would cap the
    # number of substitutions instead of setting the matching mode.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

# Vectorized wrapper so normalize_document is applied element-wise to a whole
# sequence of sentences. The name has to match the call further down; binding
# it to a different name leaves that call failing with a NameError.
normalize_corpus = np.vectorize(normalize_document)

# get sentences in the document
sentences = nltk.sent_tokenize(DOCUMENT)

# normalize each sentence in the document
norm_sentences = normalize_corpus(sentences)
norm_sentences[:3]

## Text Representation with Feature Engineering

In [None]:
# vectorize normalized sentences using TF-IDF feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# One TF-IDF row per normalized sentence. As float proportions, min_df=0. and
# max_df=1. place no lower or upper bound on a term's document frequency.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
dt_matrix = tv.fit_transform(norm_sentences)
dt_matrix = dt_matrix.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older releases; list() keeps
# `vocab` the same type (a list of terms) on either code path.
if hasattr(tv, 'get_feature_names_out'):
    vocab = list(tv.get_feature_names_out())
else:
    vocab = tv.get_feature_names()
# transpose to a term-document layout: rows are terms, columns are sentences
td_matrix = dt_matrix.T
print(td_matrix.shape)
pd.DataFrame(np.round(td_matrix, 2), index=vocab).head(10)

## Latent Semantic Analysis

In [None]:
# select number of sentences n that summary will contain
# perform low-rank SVD
num_sentences = 8
num_topics = 3


def low_rank_svd(matrix, singular_count=2):
    """Return a truncated SVD of `matrix` with `singular_count` components.

    Args:
        matrix: 2-D float array to decompose (here terms x sentences).
        singular_count: Number of singular values/vectors to keep; svds
            requires it to be smaller than both dimensions of `matrix`.

    Returns:
        Tuple (u, s, vt): left singular vectors, singular values and right
        singular vectors (transposed), exactly as produced by svds. The cells
        that consume them only use the maximum of `s` and a dot product over
        the component axis, so they do not depend on the component order.
    """
    # Imported locally so this cell does not depend on a file-level import.
    from scipy.sparse.linalg import svds

    u, s, vt = svds(matrix, k=singular_count)
    return u, s, vt


u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)
print(u.shape, s.shape, vt.shape)
# descriptive aliases for the three factors
term_topic_mat, singular_values, topic_document_mat = u, s, vt

In [None]:
# Remove weak topics: every singular value below sv_threshold (one half) of
# the largest singular value is set to zero. The masked assignment edits the
# array in place.
sv_threshold = 0.5
min_sigma_value = sv_threshold * max(singular_values)
singular_values[min_sigma_value > singular_values] = 0

In [None]:
# Salience score of each sentence (one column of topic_document_mat): the
# square root of the sum over topics of
# (singular value)^2 * (that sentence's topic weight)^2.
salience_scores = np.sqrt(
    np.dot(singular_values ** 2, topic_document_mat ** 2)
)
salience_scores

In [None]:
# Take the num_sentences sentences with the highest salience score, then put
# their indices back in ascending order so the summary follows the order of
# the original document.
top_sentence_indices = np.sort(np.argsort(-salience_scores)[:num_sentences])
# display the summary, one selected sentence per line
print('\n'.join(np.array(sentences)[top_sentence_indices]))

## TextRank

In [None]:
# Reuse the document-term TF-IDF matrix built for LSA. Entry (i, j) of the
# result is the dot product of the TF-IDF rows of sentences i and j.
# NOTE(review): this equals cosine similarity only if the rows are
# L2-normalized (presumably TfidfVectorizer's default) -- confirm.
similarity_matrix = dt_matrix @ dt_matrix.T
print(similarity_matrix.shape)
similarity_matrix.round(3)

In [None]:
# plot connected graph among all sentences from document
import networkx
# build similarity graph
# The square sentence-similarity matrix is handed to networkx as an adjacency
# matrix. NOTE(review): presumably each non-zero entry becomes a weighted edge
# (including self-loops from the diagonal) -- confirm in the networkx docs.
similarity_graph = networkx.from_numpy_array(similarity_matrix)
similarity_graph

In [None]:
# view the similarity graph
import matplotlib.pyplot as plt
# IPython magic (not plain Python): show figures inline in the notebook.
%matplotlib inline

# open a figure with figsize (12, 6) and draw the sentence graph on it with
# lime-coloured nodes
plt.figure(figsize=(12,6))
networkx.draw_networkx(similarity_graph, node_color='lime')

In [None]:
# Run PageRank on the sentence graph, then build (score, sentence index)
# pairs ordered from the highest score to the lowest.
scores = networkx.pagerank(similarity_graph)
ranked_sentences = sorted(
    zip(scores.values(), scores.keys()),
    reverse=True,
)
ranked_sentences[:10]

In [None]:
# get the top sentence indices for our summary
# Slicing, rather than indexing over range(num_sentences), avoids an
# IndexError when the document has fewer than num_sentences sentences.
# Sorting the indices restores the sentences' original document order.
top_sentence_indices = sorted(
    index for _score, index in ranked_sentences[:num_sentences]
)

# construct the document summary
print('\n'.join(np.array(sentences)[top_sentence_indices]))