# Summarizer Model

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import nltk
import string
import pandas as pd
import numpy as np

#download assets from nltk
#nltk.download('stopwords')
#nltk.download('punkt')

def tfidf(corpus):
    '''
    Computes the TF-IDF (term frequency - inverse document frequency) matrix

    Args
    - corpus: a list of sentences (documents) that need to be summarized (m x n matrix)
    m = number of different terms used in the documents, n = number of documents 

    Returns
    - tfidf_vec: an m x n matrix of the corpus
    - vocab: all the unique words used in the corpus, excluding stop words
    '''

    vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'))
    #vectorizer = CountVectorizer(stop_words='english')
    tfidf_vec = vectorizer.fit_transform(corpus)
    vocab = vectorizer.get_feature_names()
        
    return tfidf_vec, vocab

def svd(doc_term_matrix):
    '''
    Gives the singular value decomposition of an m x n matrix.
    A = U * sigma * V^t
    
    Args
    - doc_term_matrix: an m x n matrix. m = number of documents or sentences, n = number of terms

    Returns
    - u: an m x r matrix of left singular values (document-topic table). r = number of topics
    - sigma: an r x r diagonal matrix of singular values in decreasing order across the diagonal
    - v_t: an n x r matrix of right singular values (term-topic table)
    '''

    lsa = TruncatedSVD(n_components = 10, n_iter=20)
    u = lsa.fit_transform(doc_term_matrix)
    sigma = lsa.singular_values_
    v_t = lsa.components_.T

    return u, sigma, v_t

def weigh_sentence_importance(u, sigma):
    '''
    Uses the LSA enhancement described by Josef Steinberg, et al. to weigh
    sentence importance from topics
    Takes all topics that have singular values > half of the largest singular value

    Compute s_k = sqrt(sum(v_ki^2 * sigma_i^2) from i = 1 to n) for all sentences
    s_k is the length of the vector of the kth sentence
    n is the number of topics 

    Args
    - U, sigma matrices from SVD

    Returns
    - Vector of each sentence weight as calculated above (1 x m)
    '''

    #look for the sigma value range that we need to consider using binary search
    #sigma array is sorted in descending order and will never be empty
    l, r, target = 0, len(sigma), sigma[0]/2
    while l < r:
        mid = l + (r-l)//2

        if sigma[mid] < target:
            r = mid
        else:
            l = mid + 1
    sigma_bound = l

    u_slice = u[:, :sigma_bound]
    sigma_slice = sigma[:sigma_bound]
    u_sq = np.square(u_slice)
    sig_sq = np.square(np.diag(sigma_slice))
    prod = np.matmul(u_sq, sig_sq)
    s = np.sqrt(np.sum(prod, axis = 1)).T

    return s

def get_important_sentences(u, sigma):
    '''
    Based on the sentence importance results, sort the indices to return indices that correspond to the
    most importance sentence to least important

    Args
    - U, sigma matrices from SVD

    Returns
    - Vector of sentence indices in descending order of weight (1 x m)
    '''

    return (-weigh_sentence_importance(u, sigma)).argsort()

def create_word_to_sentence_map(corpus):
    '''
    Creates a dictionary that maps a word from the vocab to all sentences with that word in the corpus.

    Args
    - corpus of sentences used in this summary

    Returns
    - the dictionary described
    '''
    
    word_to_sentence = {}
    stop_words = set(stopwords.words('english'))

    for i, doc in enumerate(corpus):
        #remove punctuation while preserving contractions in text
        sanitize_text = doc.translate(str.maketrans('', '', string.punctuation))
        tokenized = word_tokenize(sanitize_text)
        #remove duplicate words
        tokenized = list(set([word.lower() for word in tokenized]))

        for word in tokenized:
            if word not in stop_words:
                if word not in word_to_sentence:
                    word_to_sentence[word] = [i]
                else:
                    word_to_sentence[word].append(i)
    
    return word_to_sentence

def extract_summary(u, sigma, k, corpus):
    '''
    Helper method to get the text summary.

    Summary will be taken from the top k sentences from getImportantSentences()
    for each topic.

    Args
    - U, sigma from SVD
    - k: number of sentences to include in summary
    - corpus: the list of sentences

    Returns
    - the list of strings for the summary
    '''

    return [corpus[i] for i in get_important_sentences(u, sigma)[:k]]

In [3]:
def preprocess(block_text):
    '''
    Preprocesses the original text to be summarized by tokenizing the sentences and removing
    unnecessary characters.

    Args
    - block_text: text to be summarized

    Returns
    - list of sentences that can be used to create a summary
    '''

    tokenized = sent_tokenize(block_text) 
    return [token.replace('\n',' ') for token in tokenized]

In [18]:
def test_similarity(summary, vt_orig, sigma_orig):
    '''
    Tests similarity by looking at the term significance of the original text and summary.
    Uses cosine similarity to do this.

    Args
    - summary: a list of strings that make up the summary
    - vt_orig: the vt matrix from SVD of the original text (n x r)
    - sigma_orig: the sigma matrix from SVD of the original text (1 x n)

    Returns
    - cosine similarity
    '''

    summary_corpus, _ = tfidf(summary)
    u_summary, sigma_summary, vt_summary = svd(summary_corpus)
    s_summary = weigh_sentence_importance(vt_summary, sigma_summary)
    s_orig = weigh_sentence_importance(vt_orig, sigma_orig)

    # summary will always be shorter vector than the original so scale down original
    s_orig = s_orig[:s_summary.shape[0]]

    # normalize both vectors (both should have non-zero magnitude)
    s_summary_norm = s_summary / np.linalg.norm(s_summary)
    s_orig_norm = s_orig / np.linalg.norm(s_orig)

    # dot product 2 normalized vectors = cosine similarity
    return np.dot(s_summary_norm, s_orig_norm)

# Testing Model

In [4]:
text = u'''
The list of businesses impacted by a lockdown beginning Monday in Toronto and Peel Region were not clearly communicated, the owner of a Toronto massage spa says.

While the Ontario government offered a partial list of what would remain open after the COVID-19 shutdown begins at 12:01 a.m., Kate Armstrong, owner and director of Bahn Thai Spa, told the Star she was unsure whether her business would be impacted.

The Ontario government’s late-afternoon announcement on Friday stated that personal services, such as nail and hair salons, would now be closed. Missing, however, were details of all services included in the shutdown.

However, on Sunday, the Ministry of Health confirmed to the Star that “regulated health professionals including dentists, optometrists, chiropractic services, ophthalmologists, physical and occupational therapists and podiatrists will be able to operate.”

A spokesperson said that “under lockdown, regulated health professionals, including massage therapists, will be able to operate. Regulated health professionals such as registered massage therapists were not impacted and therefore not referenced.”

Working “in partnership with the chief medical officer of health and our local medical officers of health, we continue to closely monitor the evolving situation to advise if and when public health measures need to be adjusted,” the spokesperson also said.

In Ontario’s first lockdown last spring, physiotherapy, chiropractic services and massage therapists were among those to close their doors, which left some confused about what is happening this time around.

“We have to continue to communicate with clients that are calling and saying, ‘Are we seeing you on Monday or not?’ We’re having to say we’ll call you as soon as we know something more,” Armstrong said.

“It’s not like a haircut,” she said, adding that people are often seeking massage to treat physical pain or for mental health care.

To Armstrong, massage has been as important as mental healthcare for Ontarians during the months-long pandemic. “I see the fatigue setting in on everyone’s faces ... The stress is so high … right now, (with) people not being able to be with their families. It’s so important to have human touch.”

The Ontario Physiotherapy Association shared the news that physiotherapy services would be able to continue operations, said Shafiq Bhanji, president of Athlete’s Care Sports Medicine Centres.

“We received direction from our respective colleges and professional associations on Friday and over the weekend via email indicating that our services would not be impacted the upcoming lockdown,” Bhanji said in an email to the Star.

While Bhanji was able to confirm that Athlete’s Care could continue offering services and communicate that to clients via email, patients are still reaching out to verify whether they can keep their appointments.

“It seems there was a fair bit of confusion in the general public about whether or not these services would be impacted,” Bhanji said. “... We are fortunate that our colleges and professional associations acted quickly to inform their members.”
'''

corpus = preprocess(text)
display(corpus)
tfidf_vec, vocab = tfidf(corpus)
word_to_sentence = create_word_to_sentence_map(corpus)
print(vocab)
print(word_to_sentence)
print(tfidf_vec)
print('----------------------------------------------------------')

u, sigma, vt = svd(tfidf_vec)
num_topics = u.shape[1] + 1

df_SVD = pd.DataFrame(u, columns=[f'topic{str(i)}' for i in range(1, num_topics)])
doc_col = pd.DataFrame({'Documents': corpus})
df_SVD = pd.concat([doc_col, df_SVD], axis = 1)

display(df_SVD)
print('----------------------------------------------------------')
print(sigma)

print('----------------------------------------------------------')

df_vt = pd.DataFrame(vt, columns=[f'topic{str(i)}' for i in range(1, num_topics)])
vocab_col = pd.DataFrame({'Terms': vocab})
df_vt = pd.concat([vocab_col, df_vt], axis = 1)

display(df_vt)

for i in range(1, num_topics):
    df_vt_sorted = df_vt.sort_values(by=f'topic{i}', ascending=False)
    display(df_vt_sorted[['Terms', f'topic{i}']])
print('----------------------------------------------------------')


#df = pd.DataFrame()
#print(df)

[' The list of businesses impacted by a lockdown beginning Monday in Toronto and Peel Region were not clearly communicated, the owner of a Toronto massage spa says.',
 'While the Ontario government offered a partial list of what would remain open after the COVID-19 shutdown begins at 12:01 a.m., Kate Armstrong, owner and director of Bahn Thai Spa, told the Star she was unsure whether her business would be impacted.',
 'The Ontario government’s late-afternoon announcement on Friday stated that personal services, such as nail and hair salons, would now be closed.',
 'Missing, however, were details of all services included in the shutdown.',
 'However, on Sunday, the Ministry of Health confirmed to the Star that “regulated health professionals including dentists, optometrists, chiropractic services, ophthalmologists, physical and occupational therapists and podiatrists will be able to operate.”  A spokesperson said that “under lockdown, regulated health professionals, including massage th

['01', '12', '19', 'able', 'acted', 'adding', 'adjusted', 'advise', 'afternoon', 'also', 'among', 'announcement', 'appointments', 'armstrong', 'around', 'association', 'associations', 'athlete', 'bahn', 'beginning', 'begins', 'bhanji', 'bit', 'business', 'businesses', 'call', 'calling', 'care', 'centres', 'chief', 'chiropractic', 'clearly', 'clients', 'close', 'closed', 'closely', 'colleges', 'communicate', 'communicated', 'confirm', 'confirmed', 'confused', 'confusion', 'continue', 'could', 'covid', 'dentists', 'details', 'direction', 'director', 'doors', 'email', 'everyone', 'evolving', 'faces', 'fair', 'families', 'fatigue', 'first', 'fortunate', 'friday', 'general', 'government', 'hair', 'haircut', 'happening', 'health', 'healthcare', 'high', 'however', 'human', 'impacted', 'important', 'included', 'including', 'indicating', 'inform', 'kate', 'keep', 'know', 'last', 'late', 'left', 'like', 'list', 'local', 'lockdown', 'long', 'massage', 'measures', 'medical', 'medicine', 'members',

Unnamed: 0,Documents,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10
0,The list of businesses impacted by a lockdown...,0.212205,-0.05689006,-0.3586889,-0.2520224,0.4926658,-0.04060439,0.0112423,0.0,-0.4564118,0.2536252
1,While the Ontario government offered a partial...,0.3251265,-0.3045537,-0.4457429,-0.2780163,0.1583704,-0.06354932,0.2703197,3.959315e-14,0.00851148,-0.01706644
2,The Ontario government’s late-afternoon announ...,0.2498981,-0.28455,-0.3005991,-0.06441886,-0.2800185,0.1884545,0.132041,3.62259e-14,0.0925174,-0.3320697
3,"Missing, however, were details of all services...",0.1778134,-0.06339399,-0.241858,-0.01358887,-0.2859783,-0.1154145,0.1217411,-4.445495e-14,0.5442687,0.6684633
4,"However, on Sunday, the Ministry of Health con...",0.493334,0.5457239,-0.151673,0.2101868,-0.09260668,-0.1139804,-0.05998296,-1.290947e-14,-0.02324718,0.09459912
5,Regulated health professionals such as registe...,0.4245538,0.5480899,-0.147992,0.2085594,0.06538647,-0.2952794,-0.08397827,1.566248e-15,0.03125534,-0.1955789
6,"In Ontario’s first lockdown last spring, physi...",0.3080669,0.07774877,-0.2027386,-0.0965463,-0.2934623,0.3960914,-0.5022268,-7.587733e-14,-0.3051934,0.189672
7,“We have to continue to communicate with clien...,0.2297365,-0.07188601,0.3553376,-0.3225288,0.331336,-0.3115954,-0.242117,-4.728977e-14,0.06209495,0.1830396
8,"“It’s not like a haircut,” she said, adding th...",0.3455652,0.4270226,0.1673223,-0.0736158,0.1607353,0.2307038,0.2743913,4.065708e-14,0.1467646,-0.1413578
9,"To Armstrong, massage has been as important as...",0.1741465,0.1298045,0.0873588,-0.4016126,0.3514651,0.4545098,-0.07794496,-1.746067e-14,0.4568746,-0.1138566


----------------------------------------------------------
[1.34140476 1.17559305 1.07635552 1.07191523 1.04617646 1.02389161
 1.00825    1.         0.99087419 0.98976066]
----------------------------------------------------------


Unnamed: 0,Terms,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10
0,01,0.036643,-0.044690,-0.078024,-0.049069,0.029344,-0.012293,0.053926,7.834637e-15,0.001758,-0.003533
1,12,0.036643,-0.044690,-0.078024,-0.049069,0.029344,-0.012293,0.053926,7.963248e-15,0.001758,-0.003533
2,19,0.036643,-0.044690,-0.078024,-0.049069,0.029344,-0.012293,0.053926,8.031117e-15,0.001758,-0.003533
3,able,0.172775,0.077704,0.166173,-0.003857,-0.124969,0.034739,0.154254,1.921032e-14,-0.137313,0.109589
4,acted,0.025809,-0.059410,0.018332,0.201459,0.123673,0.124447,-0.021196,-8.913148e-15,0.034105,0.057151
...,...,...,...,...,...,...,...,...,...,...,...
178,via,0.110120,-0.102794,0.106428,0.077074,0.013866,-0.035120,-0.011916,-2.162421e-15,-0.013192,0.019352
179,weekend,0.065728,-0.072308,0.009260,0.095635,0.037229,0.019172,-0.001119,7.055010e-18,-0.001974,-0.003710
180,whether,0.141052,-0.123791,0.012931,-0.042098,-0.010943,-0.132909,0.070457,1.518787e-14,0.003789,-0.072353
181,working,0.038733,0.065103,-0.020970,0.029797,0.009807,-0.046237,-0.013561,2.936308e-16,0.005226,-0.032774


Unnamed: 0,Terms,topic1
66,health,2.718219e-01
150,services,2.334297e-01
141,said,2.173264e-01
182,would,2.095178e-01
21,bhanji,1.896066e-01
...,...,...
151,setting,6.810919e-17
146,see,6.810919e-17
52,everyone,6.810919e-17
54,faces,6.810919e-17


Unnamed: 0,Terms,topic2
66,health,0.428262
137,regulated,0.169830
129,professionals,0.169830
168,therapists,0.163780
88,massage,0.158429
...,...,...
60,friday,-0.115597
180,whether,-0.123791
21,bhanji,-0.150247
51,email,-0.165932


Unnamed: 0,Terms,topic3
37,communicate,0.176537
32,clients,0.176537
3,able,0.166173
43,continue,0.165356
27,care,0.161612
...,...,...
84,list,-0.137110
158,spa,-0.137110
115,owner,-0.137110
154,shutdown,-0.151285


Unnamed: 0,Terms,topic4
16,associations,0.259417
128,professional,0.259417
36,colleges,0.259417
59,fortunate,0.201459
76,inform,0.201459
...,...,...
67,healthcare,-0.127561
117,pandemic,-0.127561
93,mental,-0.128097
72,important,-0.150312


Unnamed: 0,Terms,topic5
172,toronto,0.229712
13,armstrong,0.183996
96,monday,0.177470
93,mental,0.140641
16,associations,0.140496
...,...,...
95,missing,-0.119196
3,able,-0.124969
109,ontario,-0.132532
125,physiotherapy,-0.184455


Unnamed: 0,Terms,topic6
93,mental,0.195562
72,important,0.178015
125,physiotherapy,0.165452
117,pandemic,0.158222
87,long,0.158222
...,...,...
130,public,-0.125957
37,communicate,-0.127636
32,clients,-0.127636
180,whether,-0.132909


Unnamed: 0,Terms,topic7
122,people,0.321587
165,stress,0.287657
68,high,0.287657
56,families,0.287657
140,right,0.287657
...,...,...
14,around,-0.128392
50,doors,-0.128392
33,close,-0.128392
82,left,-0.128392


Unnamed: 0,Terms,topic8
146,see,4.472136e-01
52,everyone,4.472136e-01
54,faces,4.472136e-01
151,setting,4.472136e-01
57,fatigue,4.472136e-01
...,...,...
69,however,-1.994656e-14
95,missing,-2.073740e-14
47,details,-2.073740e-14
73,included,-2.073740e-14


Unnamed: 0,Terms,topic9
47,details,0.252880
73,included,0.252880
95,missing,0.252880
154,shutdown,0.222345
69,however,0.217423
...,...,...
56,families,-0.154684
165,stress,-0.154684
68,high,-0.154684
140,right,-0.154684


Unnamed: 0,Terms,topic10
95,missing,0.311283
73,included,0.311283
47,details,0.311283
69,however,0.285622
154,shutdown,0.268722
...,...,...
42,confusion,-0.114720
61,general,-0.114720
149,seems,-0.114720
130,public,-0.128789


----------------------------------------------------------


In [21]:
sentences = extract_summary(u, sigma, 5, corpus)
summary = '\n\n'.join(sentences)
print('Summary Generated')
print('----------------------------------------------------------\n')
print(summary)


Summary Generated
----------------------------------------------------------

“I see the fatigue setting in on everyone’s faces ...

Missing, however, were details of all services included in the shutdown.

However, on Sunday, the Ministry of Health confirmed to the Star that “regulated health professionals including dentists, optometrists, chiropractic services, ophthalmologists, physical and occupational therapists and podiatrists will be able to operate.”  A spokesperson said that “under lockdown, regulated health professionals, including massage therapists, will be able to operate.

Regulated health professionals such as registered massage therapists were not impacted and therefore not referenced.”  Working “in partnership with the chief medical officer of health and our local medical officers of health, we continue to closely monitor the evolving situation to advise if and when public health measures need to be adjusted,” the spokesperson also said.

The stress is so high … right 

In [22]:
cosine_similarity = test_similarity(sentences, vt, sigma)
print(f'The cosine similarity, in terms of term significance, between the summary and the actual text is {cosine_similarity}')

The cosine similarity, in terms of term significance, between the summary and the actual text is 0.8164463562154632
