In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import nltk
import string
import pandas as pd
import numpy as np

#download assets from nltk
#nltk.download('stopwords')
#nltk.download('punkt')

def tfidf(corpus):
    '''
    Computes the TF-IDF (term frequency - inverse document frequency) matrix

    Args
    - corpus: a list of documents

    Returns
    - tfidfVec: an m x n matrix of the corpus. m = number of different terms used in the documents, n = number of documents 
    - vocab: all the unique words used in the corpus, excluding stop words
    '''

    #vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'))
    vectorizer = CountVectorizer(stop_words='english')
    tfidfVec = vectorizer.fit_transform(corpus)
    vocab = vectorizer.get_feature_names()
        
    return tfidfVec, vocab

def svd(tfidfVec):
    '''
    Gives the singular value decomposition of an m x n matrix.
    A = U * sigma * V^t
    
    Args
    - tfidfVec: an m x n matrix. m = number of documents or sentences, n = number of terms

    Returns
    - U: an m x r matrix of left singular values (document-topic table). r = number of topics
    - sigma: an r x r diagonal matrix of singular values in decreasing order across the diagonal
    - V^t: an n x r matrix of right singular values (term-topic table)
    '''

    lsa = TruncatedSVD(n_components = 10, n_iter=20)
    u = lsa.fit_transform(tfidfVec)
    sigma = lsa.singular_values_
    vt = lsa.components_.T

    return u, sigma, vt

def getImportantSentences(u, sigma):
    '''
    Uses the LSA enhancement described by Josef Steinberg, et al.
    Take all topics that have singular values > half of the largest singular value

    Compute sk = sqrt(sum(v_ki^2 * sigma_i^2) from i = 1 to n)
    sk is the length of the vector of the kth sentence
    n is the number of topics 
    '''
    #look for the sigma value range that we need to consider using binary search
    #sigma array is sorted in descending order and will never be empty
    l, r, target = 0, len(sigma), sigma[0]/2
    while l < r:
        mid = l + (r-l)//2

        if sigma[mid] < target:
            r = mid
        else:
            l = mid + 1
    sigmaBound = l

    uSlice = u[:, :sigmaBound]
    sigmaSlice = sigma[:sigmaBound]
    uSq = np.square(uSlice)
    sigSq = np.square(np.diag(sigmaSlice))
    prod = np.matmul(uSq, sigSq)
    result = np.sqrt(np.sum(prod, axis = 1)).T

    return (-result).argsort()

def createWordToSentenceMap(corpus):
    wordToSentence = {}
    stopWords = set(stopwords.words('english'))

    for i, doc in enumerate(corpus):
        #remove punctuation while preserving contractions in text
        sanitizeText = doc.translate(str.maketrans('', '', string.punctuation))
        tokenized = word_tokenize(sanitizeText)
        #remove duplicate words
        tokenized = list(set([word.lower() for word in tokenized]))

        for word in tokenized:
            if word not in stopWords:
                if word not in wordToSentence:
                    wordToSentence[word] = [i]
                else:
                    wordToSentence[word].append(i)
    
    return wordToSentence

def extractSummary(u, sigma, k, corpus):
    '''
    Summary will be taken from the top k sentences from getImportantSentences()
    for each topic.
    '''
    return [corpus[i] for i in getImportantSentences(u, sigma)[:k]]


In [30]:
def preProcess(blockText):
    tokenized = sent_tokenize(blockText) 
    return [token.replace('\n',' ') for token in tokenized]

In [31]:
text = u'''
The list of businesses impacted by a lockdown beginning Monday in Toronto and Peel Region were not clearly communicated, the owner of a Toronto massage spa says.

While the Ontario government offered a partial list of what would remain open after the COVID-19 shutdown begins at 12:01 a.m., Kate Armstrong, owner and director of Bahn Thai Spa, told the Star she was unsure whether her business would be impacted.

The Ontario government’s late-afternoon announcement on Friday stated that personal services, such as nail and hair salons, would now be closed. Missing, however, were details of all services included in the shutdown.

However, on Sunday, the Ministry of Health confirmed to the Star that “regulated health professionals including dentists, optometrists, chiropractic services, ophthalmologists, physical and occupational therapists and podiatrists will be able to operate.”

A spokesperson said that “under lockdown, regulated health professionals, including massage therapists, will be able to operate. Regulated health professionals such as registered massage therapists were not impacted and therefore not referenced.”

Working “in partnership with the chief medical officer of health and our local medical officers of health, we continue to closely monitor the evolving situation to advise if and when public health measures need to be adjusted,” the spokesperson also said.

In Ontario’s first lockdown last spring, physiotherapy, chiropractic services and massage therapists were among those to close their doors, which left some confused about what is happening this time around.

“We have to continue to communicate with clients that are calling and saying, ‘Are we seeing you on Monday or not?’ We’re having to say we’ll call you as soon as we know something more,” Armstrong said.

“It’s not like a haircut,” she said, adding that people are often seeking massage to treat physical pain or for mental health care.

To Armstrong, massage has been as important as mental healthcare for Ontarians during the months-long pandemic. “I see the fatigue setting in on everyone’s faces ... The stress is so high … right now, (with) people not being able to be with their families. It’s so important to have human touch.”

The Ontario Physiotherapy Association shared the news that physiotherapy services would be able to continue operations, said Shafiq Bhanji, president of Athlete’s Care Sports Medicine Centres.

“We received direction from our respective colleges and professional associations on Friday and over the weekend via email indicating that our services would not be impacted the upcoming lockdown,” Bhanji said in an email to the Star.

While Bhanji was able to confirm that Athlete’s Care could continue offering services and communicate that to clients via email, patients are still reaching out to verify whether they can keep their appointments.

“It seems there was a fair bit of confusion in the general public about whether or not these services would be impacted,” Bhanji said. “... We are fortunate that our colleges and professional associations acted quickly to inform their members.”
'''

corpus = preProcess(text)
display(corpus)
tfidfVec, vocab = tfidf(corpus)
wordToSentence = createWordToSentenceMap(corpus)
print(vocab)
print(wordToSentence)
print(tfidfVec)
print('----------------------------------------------------------')

u, sigma, vt = svd(tfidfVec)
numTopics = u.shape[1] + 1

dfSVD = pd.DataFrame(u, columns=[f'topic{str(i)}' for i in range(1, numTopics)])
docCol = pd.DataFrame({'Documents': corpus})
dfSVD = pd.concat([docCol, dfSVD], axis = 1)

display(dfSVD)
print('----------------------------------------------------------')
print(sigma)

print('----------------------------------------------------------')

dfVt = pd.DataFrame(vt, columns=[f'topic{str(i)}' for i in range(1, numTopics)])
vocabCol = pd.DataFrame({'Terms': vocab})
dfVt = pd.concat([vocabCol, dfVt], axis = 1)

display(dfVt)

for i in range(1, numTopics):
    dfVtSort = dfVt.sort_values(by=f'topic{i}', ascending=False)
    display(dfVtSort[['Terms', f'topic{i}']])
print('----------------------------------------------------------')


#df = pd.DataFrame()
#print(df)

[' The list of businesses impacted by a lockdown beginning Monday in Toronto and Peel Region were not clearly communicated, the owner of a Toronto massage spa says.',
 'While the Ontario government offered a partial list of what would remain open after the COVID-19 shutdown begins at 12:01 a.m., Kate Armstrong, owner and director of Bahn Thai Spa, told the Star she was unsure whether her business would be impacted.',
 'The Ontario government’s late-afternoon announcement on Friday stated that personal services, such as nail and hair salons, would now be closed.',
 'Missing, however, were details of all services included in the shutdown.',
 'However, on Sunday, the Ministry of Health confirmed to the Star that “regulated health professionals including dentists, optometrists, chiropractic services, ophthalmologists, physical and occupational therapists and podiatrists will be able to operate.”  A spokesperson said that “under lockdown, regulated health professionals, including massage th

['01', '12', '19', 'able', 'acted', 'adding', 'adjusted', 'advise', 'afternoon', 'announcement', 'appointments', 'armstrong', 'association', 'associations', 'athlete', 'bahn', 'beginning', 'begins', 'bhanji', 'bit', 'business', 'businesses', 'calling', 'care', 'centres', 'chief', 'chiropractic', 'clearly', 'clients', 'close', 'closed', 'closely', 'colleges', 'communicate', 'communicated', 'confirm', 'confirmed', 'confused', 'confusion', 'continue', 'covid', 'dentists', 'details', 'direction', 'director', 'doors', 'email', 'evolving', 'faces', 'fair', 'families', 'fatigue', 'fortunate', 'friday', 'general', 'government', 'hair', 'haircut', 'happening', 'having', 'health', 'healthcare', 'high', 'human', 'impacted', 'important', 'included', 'including', 'indicating', 'inform', 'kate', 'know', 'late', 'left', 'like', 'list', 'll', 'local', 'lockdown', 'long', 'massage', 'measures', 'medical', 'medicine', 'members', 'mental', 'ministry', 'missing', 'monday', 'monitor', 'months', 'nail', 'ne

Unnamed: 0,Documents,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10
0,The list of businesses impacted by a lockdown...,0.5733549,0.8396878,1.675249,0.9779961,0.6480998,3.351836,-0.2314502,-0.1803172,-0.8540333,-0.7734972
1,While the Ontario government offered a partial...,0.418112,1.884095,3.571864,2.109817,-1.259232,-1.414688,0.4204671,-0.4004819,0.09520558,0.149129
2,The Ontario government’s late-afternoon announ...,0.2451232,1.041237,0.1966595,0.07512446,0.04942955,-0.7973867,-1.289564,2.378686,0.3045175,-1.614288
3,"Missing, however, were details of all services...",0.1654312,0.4533973,0.01429418,0.04687537,0.04012813,-0.2055919,-0.1677731,0.245434,-0.04758368,-0.09171092
4,"However, on Sunday, the Ministry of Health con...",6.068741,0.10838,-2.187905,2.620222,-0.1279322,-0.2063247,0.4429118,-0.134197,-0.2737936,-0.05990454
5,Regulated health professionals such as registe...,5.373233,-2.193546,2.210515,-2.441951,0.006425418,-0.2558828,-0.4330012,0.1085281,-0.3447424,0.0654171
6,"In Ontario’s first lockdown last spring, physi...",1.005183,1.226459,-0.3430399,0.3456052,-0.3179,0.8269286,-1.777434,1.303354,0.360481,1.688418
7,“We have to continue to communicate with clien...,0.432997,0.7554828,0.3575215,-0.7246776,-0.104379,0.5975294,2.552526,1.660226,0.1657622,1.295077
8,"“It’s not like a haircut,” she said, adding th...",1.280971,0.1328839,0.0131676,-0.3708873,-0.1904938,0.7271533,0.4299136,-0.4808293,2.842603,-0.9264928
9,"To Armstrong, massage has been as important as...",0.2770382,0.2568594,0.242728,0.003078552,-0.3491427,0.5982232,0.1811334,0.08735893,1.398263,0.3558566


----------------------------------------------------------
[8.50185931 5.73182996 5.20315806 5.02358551 4.39296778 4.07909479
 3.90983219 3.54351125 3.49529473 3.29637809]
----------------------------------------------------------


Unnamed: 0,Terms,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10
0,01,0.005784,0.057348,0.131935,0.083602,-0.065251,-0.085022,0.027505,-0.031894,0.007793,0.013724
1,12,0.005784,0.057348,0.131935,0.083602,-0.065251,-0.085022,0.027505,-0.031894,0.007793,0.013724
2,19,0.005784,0.057348,0.131935,0.083602,-0.065251,-0.085022,0.027505,-0.031894,0.007793,0.013724
3,able,0.195827,0.169287,-0.228551,0.082371,-0.114621,-0.021380,0.130064,-0.090008,-0.124838,-0.146238
4,acted,0.000619,0.008378,0.001913,-0.002492,0.046556,-0.015214,-0.007887,-0.031172,0.019633,0.045290
...,...,...,...,...,...,...,...,...,...,...,...
161,unsure,0.005784,0.057348,0.131935,0.083602,-0.065251,-0.085022,0.027505,-0.031894,0.007793,0.013724
162,upcoming,0.013272,0.069406,0.012165,-0.014315,0.175333,-0.043812,-0.019156,-0.047345,0.027599,0.043269
163,verify,0.009545,0.057824,-0.022467,-0.045602,0.014070,-0.011487,0.102210,0.034737,-0.067476,-0.109774
164,weekend,0.013272,0.069406,0.012165,-0.014315,0.175333,-0.043812,-0.019156,-0.047345,0.027599,0.043269


Unnamed: 0,Terms,topic1
60,health,5.669509e-01
155,therapists,2.561633e-01
120,professionals,2.422568e-01
128,regulated,2.422568e-01
132,said,2.194100e-01
...,...,...
52,fortunate,6.194217e-04
4,acted,6.194217e-04
140,setting,7.953368e-23
51,fatigue,7.953368e-23


Unnamed: 0,Terms,topic2
139,services,0.340848
18,bhanji,0.254725
116,physiotherapy,0.234614
100,ontario,0.225013
46,email,0.196635
...,...,...
31,closely,-0.066767
127,registered,-0.066767
165,working,-0.066767
82,medical,-0.133534


Unnamed: 0,Terms,topic3
64,impacted,0.297994
106,owner,0.193815
75,list,0.193815
146,spa,0.193815
82,medical,0.163301
...,...,...
26,chiropractic,-0.093486
139,services,-0.119753
102,operate,-0.161631
67,including,-0.161631


Unnamed: 0,Terms,topic4
67,including,0.207654
102,operate,0.207654
150,star,0.173114
78,lockdown,0.141960
155,therapists,0.124586
...,...,...
116,physiotherapy,-0.152355
132,said,-0.157258
18,bhanji,-0.166513
82,medical,-0.193526


Unnamed: 0,Terms,topic5
46,email,0.364737
32,colleges,0.221890
119,professional,0.221890
13,associations,0.221890
78,lockdown,0.185815
...,...,...
83,medicine,-0.106726
3,able,-0.114621
65,important,-0.124818
100,ontario,-0.185889


Unnamed: 0,Terms,topic6
158,toronto,0.402889
80,massage,0.303019
88,monday,0.237356
34,communicated,0.201444
112,peel,0.201444
...,...,...
53,friday,-0.091734
143,shutdown,-0.097378
46,email,-0.099110
55,government,-0.132945


Unnamed: 0,Terms,topic7
33,communicate,0.269186
28,clients,0.269186
11,armstrong,0.206330
39,continue,0.195390
22,calling,0.166976
...,...,...
156,time,-0.116273
78,lockdown,-0.121596
139,services,-0.151378
116,physiotherapy,-0.207214


Unnamed: 0,Terms,topic8
139,services,0.199143
8,afternoon,0.189439
9,announcement,0.189439
151,stated,0.189439
133,salons,0.189439
...,...,...
64,impacted,-0.088912
150,star,-0.089927
23,care,-0.089948
3,able,-0.090008


Unnamed: 0,Terms,topic9
85,mental,0.347126
80,massage,0.256099
113,people,0.247616
5,adding,0.232674
107,pain,0.232674
...,...,...
128,regulated,-0.073039
14,athlete,-0.094958
39,continue,-0.109608
3,able,-0.124838


Unnamed: 0,Terms,topic10
116,physiotherapy,0.183437
11,armstrong,0.165658
58,happening,0.155384
73,left,0.155384
37,confused,0.155384
...,...,...
30,closed,-0.148562
133,salons,-0.148562
72,late,-0.148562
9,announcement,-0.148562


----------------------------------------------------------


In [33]:
sentences = extractSummary(u, sigma, 5, corpus)
summary = '\n\n'.join(sentences)
print(summary)


However, on Sunday, the Ministry of Health confirmed to the Star that “regulated health professionals including dentists, optometrists, chiropractic services, ophthalmologists, physical and occupational therapists and podiatrists will be able to operate.”  A spokesperson said that “under lockdown, regulated health professionals, including massage therapists, will be able to operate.

Regulated health professionals such as registered massage therapists were not impacted and therefore not referenced.”  Working “in partnership with the chief medical officer of health and our local medical officers of health, we continue to closely monitor the evolving situation to advise if and when public health measures need to be adjusted,” the spokesperson also said.

It’s so important to have human touch.”  The Ontario Physiotherapy Association shared the news that physiotherapy services would be able to continue operations, said Shafiq Bhanji, president of Athlete’s Care Sports Medicine Centres.

Wh