In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
import string
import pandas as pd

#download assets from nltk (uncomment and run once if the 'stopwords' / 'punkt' data are not installed yet)
#nltk.download('stopwords')
#nltk.download('punkt')

def tfidf(corpus):
    '''
    Computes the TF-IDF (term frequency - inverse document frequency) matrix

    Args
    - corpus: a list of documents (strings)

    Returns
    - tfidfVec: a sparse m x n matrix of the corpus. m = number of documents, n = number of different terms used in the documents
    - vocab: a list of all the unique terms used in the corpus, excluding English stop words. vocab[j] is the term for column j of tfidfVec
    '''

    vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'))
    tfidfVec = vectorizer.fit_transform(corpus)

    #get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
    #so prefer its replacement and only fall back on old installations
    if hasattr(vectorizer, 'get_feature_names_out'):
        #tolist() keeps the return type a plain list, as before
        vocab = vectorizer.get_feature_names_out().tolist()
    else:
        vocab = vectorizer.get_feature_names()

    return tfidfVec, vocab

def svd(tfidfVec, n_components=10, n_iter=20, random_state=None):
    '''
    Gives the truncated singular value decomposition (LSA) of an m x n matrix.
    A ~= U * sigma * V^t, keeping only the r largest singular values

    Args
    - tfidfVec: an m x n matrix. m = number of documents or sentences, n = number of terms
    - n_components: requested number of topics r (default 10, the value that used to be hard-coded)
    - n_iter: number of iterations for the randomized solver (default 20, as before)
    - random_state: seed for the randomized solver. Pass an int for reproducible results; None (default) keeps the previous unseeded behaviour

    Returns
    - u: an m x r document-topic table. This is the output of fit_transform, i.e. the documents projected onto the topics (U * sigma), not the bare left singular vectors
    - sigma: a 1-D array of the r singular values (not an r x r diagonal matrix)
    - vt: an n x r term-topic table. It is the transpose of components_, so despite the name it holds V rather than V^t

    NOTE: scikit-learn may return fewer than n_components topics for a small
    input, so read r from u.shape[1] instead of assuming it equals n_components.
    '''

    lsa = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=random_state)
    u = lsa.fit_transform(tfidfVec)
    sigma = lsa.singular_values_
    #components_ is r x n (topics x terms); transpose it so rows line up with the vocabulary
    vt = lsa.components_.T

    return u, sigma, vt

def createWordToSentenceMap(corpus):
    '''
    Builds a lookup from each non-stop word to the documents that contain it

    Args
    - corpus: a list of documents (strings)

    Returns
    - wordToSentence: a dict mapping each lower-cased word to the list of indices (positions in corpus) of the documents it occurs in.
      Indices are in ascending order and appear at most once per word. Keys are in order of first appearance in the corpus.
    '''
    wordToSentence = {}
    stopWords = set(stopwords.words('english'))
    #the table does not depend on the document, so build it once
    stripPunctuation = str.maketrans('', '', string.punctuation)

    for i, doc in enumerate(corpus):
        #remove all punctuation; apostrophes go too, so a contraction like "don't" becomes "dont"
        sanitizeText = doc.translate(stripPunctuation)
        #remove duplicate words; dict.fromkeys keeps first-seen order, whereas set()
        #iterates in a hash-dependent order that can change between kernel restarts
        uniqueWords = dict.fromkeys(word.lower() for word in word_tokenize(sanitizeText))

        for word in uniqueWords:
            if word not in stopWords:
                wordToSentence.setdefault(word, []).append(i)

    return wordToSentence

In [24]:
#sample corpus: three sentences, each treated as one document
corpus = [
    'In this paper, we propose two generic text summarization methods that create text summaries by ranking and extracting sentences from the original documents.',
    'The first method uses standard IR methods to rank sentence relevances, while the second method uses the latent semantic analysis technique to identify semantically important sentences, for summary creations.',
    'Both methods strive to select sentences that are highly ranked and different from each other.'
]

#visual divider printed between the sections of the output
SEPARATOR = '----------------------------------------------------------'

#TF-IDF matrix and the word -> document-index lookup
tfidfVec, vocab = tfidf(corpus)
wordToSentence = createWordToSentenceMap(corpus)

print(wordToSentence)
print(tfidfVec)
print(SEPARATOR)

svdVec, sigma, vt = svd(tfidfVec)
#exclusive upper bound for the 1-based topic labels (number of topics + 1)
numTopics = svdVec.shape[1] + 1
#column labels shared by the document-topic and term-topic tables
topicCols = [f'topic{i}' for i in range(1, numTopics)]

#document-topic table: one row per document, one column per topic
docCol = pd.DataFrame({'Documents': corpus})
df = pd.concat([docCol, pd.DataFrame(svdVec, columns=topicCols)], axis = 1)

display(df)
print(SEPARATOR)
print(sigma)

print(SEPARATOR)

#term-topic table: one row per vocabulary term, one column per topic
vocabCol = pd.DataFrame({'Terms': vocab})
dfVt = pd.concat([vocabCol, pd.DataFrame(vt, columns=topicCols)], axis = 1)

display(dfVt)

#for every topic, list the terms from highest to lowest weight
for topicCol in topicCols:
    dfVtSort = dfVt.sort_values(by=topicCol, ascending=False)
    display(dfVtSort[['Terms', topicCol]])
print(SEPARATOR)

{'two': [0], 'methods': [0, 1, 2], 'sentences': [0, 1, 2], 'paper': [0], 'summarization': [0], 'ranking': [0], 'original': [0], 'extracting': [0], 'summaries': [0], 'generic': [0], 'text': [0], 'propose': [0], 'documents': [0], 'create': [0], 'identify': [1], 'second': [1], 'rank': [1], 'sentence': [1], 'semantically': [1], 'uses': [1], 'relevances': [1], 'summary': [1], 'semantic': [1], 'analysis': [1], 'creations': [1], 'first': [1], 'standard': [1], 'important': [1], 'latent': [1], 'ir': [1], 'method': [1], 'technique': [1], 'different': [2], 'select': [2], 'strive': [2], 'highly': [2], 'ranked': [2]}
  (0, 4)	0.2523960843862897
  (0, 15)	0.2523960843862897
  (0, 27)	0.14906919332483493
  (0, 5)	0.2523960843862897
  (0, 20)	0.2523960843862897
  (0, 30)	0.2523960843862897
  (0, 1)	0.2523960843862897
  (0, 14)	0.14906919332483493
  (0, 31)	0.2523960843862897
  (0, 34)	0.5047921687725794
  (0, 7)	0.2523960843862897
  (0, 35)	0.2523960843862897
  (0, 17)	0.2523960843862897
  (0, 16)	0.2

Unnamed: 0,Documents,topic1,topic2,topic3
0,"In this paper, we propose two generic text sum...",0.606851,-0.576719,-0.546925
1,The first method uses standard IR methods to r...,0.539111,0.787951,-0.297477
2,Both methods strive to select sentences that a...,0.674182,-0.110965,0.730182


----------------------------------------------------------
[1.05519232 0.98274323 0.95957538]
----------------------------------------------------------


Unnamed: 0,Terms,topic1,topic2,topic3
0,analysis,0.097429,0.164169,-0.065008
1,create,0.137563,-0.150718,-0.149917
2,creations,0.097429,0.164169,-0.065008
3,different,0.253668,-0.048135,0.332219
4,documents,0.137563,-0.150718,-0.149917
5,extracting,0.137563,-0.150718,-0.149917
6,first,0.097429,0.164169,-0.065008
7,generic,0.137563,-0.150718,-0.149917
8,highly,0.253668,-0.048135,0.332219
9,identify,0.097429,0.164169,-0.065008


Unnamed: 0,Terms,topic1
27,sentences,0.28861
14,methods,0.28861
34,text,0.275126
23,select,0.253668
3,different,0.253668
29,strive,0.253668
19,ranked,0.253668
8,highly,0.253668
13,method,0.194858
36,uses,0.194858


Unnamed: 0,Terms,topic2
36,uses,0.328338
13,method,0.328338
12,latent,0.164169
33,technique,0.164169
32,summary,0.164169
28,standard,0.164169
26,sentence,0.164169
25,semantically,0.164169
24,semantic,0.164169
22,second,0.164169


Unnamed: 0,Terms,topic3
23,select,0.332219
3,different,0.332219
19,ranked,0.332219
29,strive,0.332219
8,highly,0.332219
27,sentences,0.069275
14,methods,0.069275
18,rank,-0.065008
28,standard,-0.065008
24,semantic,-0.065008


----------------------------------------------------------
