In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd

def tfidf(corpus):
    '''
    Builds the TF-IDF (term frequency - inverse document frequency) matrix of a corpus.
    English stop words are removed by the vectorizer before the terms are weighted.

    Args
    - corpus: a list of documents

    Returns
    - the fitted-and-transformed m x n matrix of the corpus.
      m = number of documents, n = number of different terms kept from the documents
    '''
    # Fit the vocabulary and transform the corpus in one chained call.
    return TfidfVectorizer(stop_words='english').fit_transform(corpus)

def svd(tfidfVec, n_components=10, n_iter=50, random_state=None):
    '''
    Runs a truncated singular value decomposition (LSA) on an m x n matrix.
    A ~= U * sigma * V^t, keeping only the r strongest topics.

    Args
    - tfidfVec: an m x n matrix. m = number of documents or sentences, n = number of terms
    - n_components: number of topics requested from TruncatedSVD
      (default 10, the value that used to be hard-coded)
    - n_iter: number of iterations handed to TruncatedSVD
      (default 50, the value that used to be hard-coded)
    - random_state: seed forwarded to TruncatedSVD; pass an int to get the same
      decomposition on every run (default None, i.e. unseeded as before)

    Returns
    - u: an m x r matrix (document-topic table). r = number of topics actually produced.
      This is the result of fit_transform, i.e. the documents projected onto the
      topics (U * sigma), not the bare left singular vectors U.
    - sigma: a 1-D array holding the r singular values (the diagonal of sigma),
      as reported by singular_values_
    - vt: an n x r matrix (term-topic table). It is components_ transposed, so it
      holds the right singular vectors as columns; use vt.T to obtain V^t (r x n).
    '''

    lsa = TruncatedSVD(
        n_components=n_components,
        n_iter=n_iter,
        random_state=random_state,
    )

    # fit_transform both fits the model and returns the reduced documents.
    u = lsa.fit_transform(tfidfVec)
    sigma = lsa.singular_values_
    # components_ is laid out topics x terms; transpose to get terms x topics.
    vt = lsa.components_.T

    return u, sigma, vt

In [17]:
# Three-sentence toy corpus used to exercise tfidf() and svd().
corpus = [
    'In this paper, we propose two generic text summarization methods that create text summaries by ranking and extracting sentences from the original documents.',
    'The first method uses standard IR methods to rank sentence relevances, while the second method uses the latent semantic analysis technique to identify semantically important sentences, for summary creations.',
    'Both methods strive to select sentences that are highly ranked and different from each other.'
]

# Visual separator printed between the individual results.
divider = '----------------------------------------------------------'

# Step 1: TF-IDF matrix of the corpus.
tfidfVec = tfidf(corpus)
print(tfidfVec)
print(divider)

# Step 2: decomposition; label the columns topic1..topicK for display.
svdVec, sigma, vt = svd(tfidfVec)
topic_labels = ['topic' + str(k + 1) for k in range(svdVec.shape[1])]
df = pd.DataFrame(svdVec, columns=topic_labels)
display(df)

# Step 3: singular values, then the transposed term-topic table, each framed by a divider.
print(divider, sigma, divider, vt.T, divider, sep='\n')

  (0, 4)	0.26084107346995766
  (0, 14)	0.26084107346995766
  (0, 26)	0.1540569399192421
  (0, 5)	0.26084107346995766
  (0, 19)	0.26084107346995766
  (0, 29)	0.26084107346995766
  (0, 1)	0.26084107346995766
  (0, 13)	0.1540569399192421
  (0, 30)	0.26084107346995766
  (0, 33)	0.5216821469399153
  (0, 6)	0.26084107346995766
  (0, 16)	0.26084107346995766
  (0, 15)	0.26084107346995766
  (1, 2)	0.20542217148685335
  (1, 31)	0.20542217148685335
  (1, 9)	0.20542217148685335
  (1, 24)	0.20542217148685335
  (1, 8)	0.20542217148685335
  (1, 32)	0.20542217148685335
  (1, 0)	0.20542217148685335
  (1, 23)	0.20542217148685335
  (1, 11)	0.20542217148685335
  (1, 21)	0.20542217148685335
  (1, 20)	0.20542217148685335
  (1, 25)	0.20542217148685335
  (1, 17)	0.20542217148685335
  (1, 10)	0.20542217148685335
  (1, 27)	0.20542217148685335
  (1, 34)	0.4108443429737067
  (1, 12)	0.4108443429737067
  (1, 26)	0.1213256436566357
  (1, 13)	0.1213256436566357
  (2, 3)	0.4189401020758947
  (2, 18)	0.418940102075894

Unnamed: 0,topic1,topic2,topic3
0,0.610244,-0.566882,-0.553396
1,0.539674,0.792453,-0.284201
2,0.673459,-0.121359,0.729194


----------------------------------------------------------
[1.05697402 0.9818676  0.95851027]
----------------------------------------------------------
[[ 0.09923171  0.16885535 -0.06354471]
 [ 0.14247906 -0.15337777 -0.15711534]
 [ 0.09923171  0.16885535 -0.06354471]
 [ 0.25254266 -0.05273725  0.33250769]
 [ 0.14247906 -0.15337777 -0.15711534]
 [ 0.14247906 -0.15337777 -0.15711534]
 [ 0.14247906 -0.15337777 -0.15711534]
 [ 0.25254266 -0.05273725  0.33250769]
 [ 0.09923171  0.16885535 -0.06354471]
 [ 0.09923171  0.16885535 -0.06354471]
 [ 0.09923171  0.16885535 -0.06354471]
 [ 0.09923171  0.16885535 -0.06354471]
 [ 0.19846343  0.33771071 -0.12708942]
 [ 0.29191404 -0.02200616  0.06605902]
 [ 0.14247906 -0.15337777 -0.15711534]
 [ 0.14247906 -0.15337777 -0.15711534]
 [ 0.14247906 -0.15337777 -0.15711534]
 [ 0.09923171  0.16885535 -0.06354471]
 [ 0.25254266 -0.05273725  0.33250769]
 [ 0.14247906 -0.15337777 -0.15711534]
 [ 0.09923171  0.16885535 -0.06354471]
 [ 0.09923171  0.16885535 -0