# Summarizer Model with NumPy

In [36]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import nltk
import string
import pandas as pd
import numpy as np
from unidecode import unidecode

# Download the required NLTK assets (uncomment and run once per environment)
# nltk.download('stopwords')
# nltk.download('punkt')

def tfidf(corpus):
    '''
    Computes the TF-IDF (term frequency - inverse document frequency) matrix

    Every sentence is tokenized with word_tokenize and lower-cased. English
    stop words and tokens that consist only of punctuation characters are
    dropped before counting.

    Args
    - corpus: a list of n sentences (documents) that need to be summarized

    Returns
    - tfidf_vec: an m x n matrix, m = number of unique terms kept,
      n = number of documents. Row i belongs to the i-th word obtained by
      iterating over vocab, column j belongs to corpus[j]. Each entry is
      (count of the term in the document) * log(n / document frequency)
    - vocab: set of all the unique words used in the corpus, excluding stop
      words and punctuation

    https://nlp.stanford.edu/IR-book/html/htmledition/inverse-document-frequency-1.html
    '''
    num_docs = len(corpus)
    # sets give constant-time membership tests in the token filter below
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    word_sentence = []
    vocab = set()

    # break up each sentence into individual lower-cased words
    for doc in corpus:
        # issuperset(word) is true when every character of the token is
        # punctuation, so multi-character tokens such as '``', "''" and '...'
        # are removed too (a substring test against string.punctuation
        # would let them through)
        tokenized = [
            word
            for word in (token.lower() for token in word_tokenize(doc))
            if word not in stop_words and not punctuation.issuperset(word)
        ]
        word_sentence.append(tokenized)
        vocab.update(tokenized)

    # row index of every term; the order is the iteration order of vocab
    word_ind = {word: i for i, word in enumerate(vocab)}

    # term frequency: raw count of each term (row) in each document (column)
    tf = np.zeros((len(vocab), num_docs))
    for i, words in enumerate(word_sentence):
        for word in words:
            tf[word_ind[word], i] += 1

    # document frequency: number of documents containing the term. It is
    # never 0 because every vocab word was taken from some document.
    dft = np.count_nonzero(tf, axis=1)
    idf = np.log(num_docs / dft)
    tfidf_vec = tf * idf[:, np.newaxis]

    return tfidf_vec, vocab

def svd(doc_term_matrix, n_components=10):
    '''
    Gives the truncated singular value decomposition of an m x n matrix.
    A ~= U * diag(sigma) * V^t, keeping only the r largest singular values.

    The decomposition is computed with np.linalg.svd, so it is deterministic
    and needs nothing beyond NumPy.

    Args
    - doc_term_matrix: a dense m x n matrix (array-like).
      m = number of documents or sentences, n = number of terms
    - n_components: maximum number of topics to keep (default 10)

    Returns
    - u: an m x r matrix of left singular vectors (document-topic table).
      r = number of topics = min(n_components, m, n)
    - sigma: a vector of the r singular values in decreasing order
    - v_t: an n x r matrix of right singular vectors (term-topic table)

    Raises
    - ValueError if the input is not 2-dimensional or n_components < 1
    '''
    matrix = np.asarray(doc_term_matrix, dtype=float)
    if matrix.ndim != 2:
        raise ValueError('doc_term_matrix must be a 2-dimensional matrix')
    if n_components < 1:
        raise ValueError('n_components must be at least 1')

    # reduced SVD: u is m x p, sigma has p entries, vh is p x n, p = min(m, n)
    u, sigma, vh = np.linalg.svd(matrix, full_matrices=False)

    # never ask for more topics than the matrix can provide
    r = min(n_components, sigma.shape[0])

    # vh holds V^t; transpose the kept rows so terms index the rows
    return u[:, :r], sigma[:r], vh[:r].T

def weigh_sentence_importance(u, sigma):
    '''
    Weighs every sentence by the length of its vector in the reduced topic
    space (the LSA enhancement described by Josef Steinberg, et al.)

    Only the leading topics are used: those whose singular value is at least
    half of the largest singular value.

    For sentence k: s_k = sqrt(sum(u_ki^2 * sigma_i^2) for the kept topics i)

    Args
    - u: matrix with one row per sentence and one column per topic
    - sigma: vector of singular values, sorted in descending order, not empty

    Returns
    - Vector with one weight per sentence (1 x m)
    '''

    # binary search for the first position whose singular value falls below
    # half of the largest one; everything before it is kept
    threshold = sigma[0] / 2
    lo, hi = 0, len(sigma)
    while lo < hi:
        middle = lo + (hi - lo) // 2
        if sigma[middle] < threshold:
            hi = middle
            continue
        lo = middle + 1
    kept_topics = lo

    # square the kept columns of u and scale each by its squared singular value
    squared_u = np.square(u[:, :kept_topics])
    squared_sigma = np.square(np.diag(sigma[:kept_topics]))
    weighted = np.matmul(squared_u, squared_sigma)

    # length of each sentence vector = root of the row sums
    row_totals = np.sum(weighted, axis=1)
    return np.sqrt(row_totals).T

def get_important_sentences(u, sigma):
    '''
    Ranks the sentences by the weight computed in weigh_sentence_importance,
    from the most important sentence to the least important one

    Args
    - U, sigma matrices from SVD

    Returns
    - Vector of sentence indices in descending order of weight (1 x m)
    '''

    weights = weigh_sentence_importance(u, sigma)
    # argsort is ascending, so sort the negated weights to get the heaviest first
    return np.argsort(-weights)

def create_word_to_sentence_map(corpus):
    '''
    Builds a lookup from each word to the sentences of the corpus that use it.

    Punctuation characters are deleted from a sentence before it is
    tokenized, words are lower-cased, and English stop words are left out.

    Args
    - corpus of sentences used in this summary

    Returns
    - dictionary: word -> list of indices of the sentences containing that
      word (each index at most once, in ascending order)
    '''

    ignored = set(stopwords.words('english'))
    strip_punctuation = str.maketrans('', '', string.punctuation)
    index_by_word = {}

    for position, sentence in enumerate(corpus):
        tokens = word_tokenize(sentence.translate(strip_punctuation))
        # the set keeps every word once per sentence
        for token in {raw.lower() for raw in tokens}:
            if token in ignored:
                continue
            index_by_word.setdefault(token, []).append(position)

    return index_by_word

def extract_summary(u, sigma, k, corpus):
    '''
    Helper method to get the text summary.

    The summary consists of the k highest-ranked sentences according to
    get_important_sentences(), listed from most to least important.

    Args
    - U, sigma from SVD
    - k: number of sentences to include in summary
    - corpus: the list of sentences

    Returns
    - the list of strings for the summary
    '''

    ranked = get_important_sentences(u, sigma)
    summary = []
    for sentence_index in ranked[:k]:
        summary.append(corpus[sentence_index])
    return summary

In [2]:
def preprocess(block_text):
    '''
    Turns the raw text into the list of sentences used for summarization.

    The text is passed through unidecode, split into sentences with
    sent_tokenize, and every newline inside a sentence is replaced by a space.

    Args
    - block_text: text to be summarized

    Returns
    - list of sentences that can be used to create a summary
    '''

    ascii_text = unidecode(block_text)
    sentences = []
    for sentence in sent_tokenize(ascii_text):
        sentences.append(sentence.replace('\n', ' '))
    return sentences

In [35]:
text = '''
The list of businesses impacted by a lockdown beginning Monday in Toronto and Peel Region were not clearly communicated, the owner of a Toronto massage spa says.

While the Ontario government offered a partial list of what would remain open after the COVID-19 shutdown begins at 12:01 a.m., Kate Armstrong, owner and director of Bahn Thai Spa, told the Star she was unsure whether her business would be impacted.

The Ontario government’s late-afternoon announcement on Friday stated that personal services, such as nail and hair salons, would now be closed. Missing, however, were details of all services included in the shutdown.

However, on Sunday, the Ministry of Health confirmed to the Star that “regulated health professionals including dentists, optometrists, chiropractic services, ophthalmologists, physical and occupational therapists and podiatrists will be able to operate.”

A spokesperson said that “under lockdown, regulated health professionals, including massage therapists, will be able to operate. Regulated health professionals such as registered massage therapists were not impacted and therefore not referenced.”

Working “in partnership with the chief medical officer of health and our local medical officers of health, we continue to closely monitor the evolving situation to advise if and when public health measures need to be adjusted,” the spokesperson also said.

In Ontario’s first lockdown last spring, physiotherapy, chiropractic services and massage therapists were among those to close their doors, which left some confused about what is happening this time around.

“We have to continue to communicate with clients that are calling and saying, ‘Are we seeing you on Monday or not?’ We’re having to say we’ll call you as soon as we know something more,” Armstrong said.

“It’s not like a haircut,” she said, adding that people are often seeking massage to treat physical pain or for mental health care.

To Armstrong, massage has been as important as mental healthcare for Ontarians during the months-long pandemic. “I see the fatigue setting in on everyone’s faces ... The stress is so high … right now, (with) people not being able to be with their families. It’s so important to have human touch.”

The Ontario Physiotherapy Association shared the news that physiotherapy services would be able to continue operations, said Shafiq Bhanji, president of Athlete’s Care Sports Medicine Centres.

“We received direction from our respective colleges and professional associations on Friday and over the weekend via email indicating that our services would not be impacted the upcoming lockdown,” Bhanji said in an email to the Star.

While Bhanji was able to confirm that Athlete’s Care could continue offering services and communicate that to clients via email, patients are still reaching out to verify whether they can keep their appointments.

“It seems there was a fair bit of confusion in the general public about whether or not these services would be impacted,” Bhanji said. “... We are fortunate that our colleges and professional associations acted quickly to inform their members.”
'''
# Split the sample article into a list of sentences; each sentence is one document
corpus = preprocess(text)

# Build the TF-IDF matrix: one row per vocabulary word, one column per sentence
tfidf_vec, vocab = tfidf(corpus)
# Show the matrix as the cell output (display is presumably the notebook's built-in helper)
display(tfidf_vec)

21

array([1., 1., 1., 1., 1., 1., 2., 1., 1., 2., 1., 1., 1., 1., 5., 6., 1.,
       2., 2., 1., 1., 2., 5., 1., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1.,
       2., 5., 1., 1., 3., 1., 1., 1., 1., 1., 1., 4., 1., 2., 1., 1., 1.,
       3., 1., 1., 1., 2., 1., 1., 1., 1., 2., 1., 1., 4., 1., 1., 1., 1.,
       3., 9., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 1., 1., 9.,
       1., 1., 1., 1., 1., 2., 4., 1., 2., 1., 2., 1., 2., 1., 1., 1., 2.,
       1., 1., 1., 1., 1., 1., 1., 2., 1., 7., 3., 1., 2., 2., 1., 1., 1.,
       1., 1., 2., 1., 1., 1., 1., 3., 1., 1., 1., 1., 1., 8., 7., 3., 1.,
       1., 1., 2., 2., 1., 1., 1., 1., 1., 4., 1., 1., 2., 2., 1., 1., 1.,
       1., 2., 1., 1., 4., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 3., 1., 1., 5., 1., 1., 1., 1., 1., 1., 1.])

array([3.04452244, 3.04452244, 3.04452244, 3.04452244, 3.04452244,
       3.04452244, 2.35137526, 3.04452244, 3.04452244, 2.35137526,
       3.04452244, 3.04452244, 3.04452244, 3.04452244, 1.43508453,
       1.25276297, 3.04452244, 2.35137526, 2.35137526, 3.04452244,
       3.04452244, 2.35137526, 1.43508453, 3.04452244, 3.04452244,
       3.04452244, 3.04452244, 3.04452244, 3.04452244, 2.35137526,
       3.04452244, 3.04452244, 3.04452244, 3.04452244, 2.35137526,
       1.43508453, 3.04452244, 3.04452244, 1.94591015, 3.04452244,
       3.04452244, 3.04452244, 3.04452244, 3.04452244, 3.04452244,
       1.65822808, 3.04452244, 2.35137526, 3.04452244, 3.04452244,
       3.04452244, 1.94591015, 3.04452244, 3.04452244, 3.04452244,
       2.35137526, 3.04452244, 3.04452244, 3.04452244, 3.04452244,
       2.35137526, 3.04452244, 3.04452244, 1.65822808, 3.04452244,
       3.04452244, 3.04452244, 3.04452244, 1.94591015, 0.84729786,
       3.04452244, 2.35137526, 3.04452244, 3.04452244, 3.04452

array([[0.        , 3.04452244, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        3.04452244],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 3.04452244, 0.        ,
        0.        ]])