In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd

def tfidf(corpus):
    '''
    Computes the TF-IDF (term frequency - inverse document frequency) matrix

    Args
    - corpus: a list of documents (one string per document)

    Returns
    - tfidfVec: an m x n matrix of the corpus. m = number of documents, n = number of different terms used in the documents
    - vocab: a list of all the unique words used in the corpus, excluding English stop words.
      vocab[j] is the term that column j of tfidfVec refers to
    '''

    # Swap in CountVectorizer(stop_words='english') here to get raw term counts instead of TF-IDF weights.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidfVec = vectorizer.fit_transform(corpus)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Prefer get_feature_names_out() and only fall back on older releases.
    # list(...) keeps the return type a plain list, as callers of the old API expect.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = list(vectorizer.get_feature_names())
    return tfidfVec, vocab

def svd(tfidfVec, n_components=10, n_iter=20, random_state=None):
    '''
    Runs a truncated singular value decomposition (LSA) on an m x n matrix.
    A ~= U * sigma * V^t

    Args
    - tfidfVec: an m x n matrix. m = number of documents or sentences, n = number of terms
    - n_components: number of topics requested from the decomposition (default 10)
    - n_iter: number of iterations handed to TruncatedSVD (default 20)
    - random_state: optional seed forwarded to TruncatedSVD; pass an int to make
      repeated runs reproducible (default None keeps the solver's own randomness)

    Returns
    - u: an m x r document-topic table as returned by TruncatedSVD.fit_transform, i.e. the
      documents projected onto the topics (left singular vectors scaled by the singular values).
      r = number of topics actually produced, which can be smaller than n_components for a tiny corpus
    - sigma: a 1-D array of the r singular values (the diagonal of the sigma matrix), in decreasing order
    - vt: an n x r term-topic table (the transpose of TruncatedSVD.components_, so one column per topic)
    '''

    lsa = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=random_state)
    u = lsa.fit_transform(tfidfVec)
    sigma = lsa.singular_values_
    # components_ is topics x terms; transpose so rows line up with the vocabulary.
    vt = lsa.components_.T

    return u, sigma, vt

In [26]:
corpus = [
    'In this paper, we propose two generic text summarization methods that create text summaries by ranking and extracting sentences from the original documents.',
    'The first method uses standard IR methods to rank sentence relevances, while the second method uses the latent semantic analysis technique to identify semantically important sentences, for summary creations.',
    'Both methods strive to select sentences that are highly ranked and different from each other.'
]

# Visual divider printed between the sections of this cell's output.
SEPARATOR = '----------------------------------------------------------'

# Step 1: vectorise the corpus, then show the weighted matrix and the vocabulary size.
tfidfVec, vocab = tfidf(corpus)
print(tfidfVec)
print(len(vocab))
print(SEPARATOR)

# Step 2: decompose the matrix into topics.
svdVec, sigma, vt = svd(tfidfVec)
# numTopics is the exclusive upper bound of the 1-based topic labels (topic count + 1).
numTopics = svdVec.shape[1] + 1
topicCols = [f'topic{i}' for i in range(1, numTopics)]

# Document-topic table: one row per document, one column per topic.
docCol = pd.DataFrame({'Documents': corpus})
df = pd.concat([docCol, pd.DataFrame(svdVec, columns=topicCols)], axis=1)

display(df)
print(SEPARATOR)
print(sigma)

print(SEPARATOR)

# Term-topic table: one row per vocabulary term, one column per topic.
vocabCol = pd.DataFrame({'Terms': vocab})
dfVt = pd.concat([vocabCol, pd.DataFrame(vt, columns=topicCols)], axis=1)

display(dfVt)

# For every topic, list the terms from the highest to the lowest weight.
for topicCol in topicCols:
    dfVtSort = dfVt.sort_values(by=topicCol, ascending=False)
    display(dfVtSort[['Terms', topicCol]])
print(SEPARATOR)

  (0, 4)	0.26084107346995766
  (0, 14)	0.26084107346995766
  (0, 26)	0.1540569399192421
  (0, 5)	0.26084107346995766
  (0, 19)	0.26084107346995766
  (0, 29)	0.26084107346995766
  (0, 1)	0.26084107346995766
  (0, 13)	0.1540569399192421
  (0, 30)	0.26084107346995766
  (0, 33)	0.5216821469399153
  (0, 6)	0.26084107346995766
  (0, 16)	0.26084107346995766
  (0, 15)	0.26084107346995766
  (1, 2)	0.20542217148685335
  (1, 31)	0.20542217148685335
  (1, 9)	0.20542217148685335
  (1, 24)	0.20542217148685335
  (1, 8)	0.20542217148685335
  (1, 32)	0.20542217148685335
  (1, 0)	0.20542217148685335
  (1, 23)	0.20542217148685335
  (1, 11)	0.20542217148685335
  (1, 21)	0.20542217148685335
  (1, 20)	0.20542217148685335
  (1, 25)	0.20542217148685335
  (1, 17)	0.20542217148685335
  (1, 10)	0.20542217148685335
  (1, 27)	0.20542217148685335
  (1, 34)	0.4108443429737067
  (1, 12)	0.4108443429737067
  (1, 26)	0.1213256436566357
  (1, 13)	0.1213256436566357
  (2, 3)	0.4189401020758947
  (2, 18)	0.418940102075894

Unnamed: 0,Documents,topic1,topic2,topic3
0,"In this paper, we propose two generic text sum...",0.610244,-0.566882,-0.553396
1,The first method uses standard IR methods to r...,0.539674,0.792453,-0.284201
2,Both methods strive to select sentences that a...,0.673459,-0.121359,0.729194


----------------------------------------------------------
[1.05697402 0.9818676  0.95851027]
----------------------------------------------------------


Unnamed: 0,Terms,topic1,topic2,topic3
0,analysis,0.099232,0.168855,-0.063545
1,create,0.142479,-0.153378,-0.157115
2,creations,0.099232,0.168855,-0.063545
3,different,0.252543,-0.052737,0.332508
4,documents,0.142479,-0.153378,-0.157115
5,extracting,0.142479,-0.153378,-0.157115
6,generic,0.142479,-0.153378,-0.157115
7,highly,0.252543,-0.052737,0.332508
8,identify,0.099232,0.168855,-0.063545
9,important,0.099232,0.168855,-0.063545


Unnamed: 0,Terms,topic1
26,sentences,0.291914
13,methods,0.291914
33,text,0.284958
3,different,0.252543
22,select,0.252543
28,strive,0.252543
18,ranked,0.252543
7,highly,0.252543
34,uses,0.198463
12,method,0.198463


Unnamed: 0,Terms,topic2
34,uses,0.337711
12,method,0.337711
11,latent,0.168855
32,technique,0.168855
31,summary,0.168855
27,standard,0.168855
25,sentence,0.168855
24,semantically,0.168855
23,semantic,0.168855
21,second,0.168855


Unnamed: 0,Terms,topic3
18,ranked,0.332508
3,different,0.332508
28,strive,0.332508
7,highly,0.332508
22,select,0.332508
26,sentences,0.066059
13,methods,0.066059
0,analysis,-0.063545
32,technique,-0.063545
31,summary,-0.063545


----------------------------------------------------------
