## Latent Semantic Analysis / Indexing (LSA/LSI) Scikit-learn

Learning goals
* (A) Understand SVD using linear algebra
* (B) Apply Truncated SVD for LSI



In [1]:

#from google.colab import drive
#drive.mount('/content/gdrive')

Mounted at /content/gdrive


### (A) Understand SVD using linear algebra

In [1]:
import numpy as np
A = np.random.randint(low=1, high=50, size = (15,5))
A

array([[41,  2,  6,  8, 28],
       [22,  5, 45, 27,  4],
       [ 2, 18, 13, 26,  2],
       [23, 40,  5, 47,  2],
       [27,  7, 38,  4,  1],
       [35, 13, 14, 32,  2],
       [18, 30, 22, 49, 48],
       [11, 19,  2, 46, 22],
       [41, 33,  5,  6, 35],
       [11, 12, 23, 13, 45],
       [39, 14, 12, 43,  1],
       [14, 12, 40, 17, 47],
       [10, 18, 42, 41, 41],
       [42,  1, 14, 18, 29],
       [22, 28, 10, 43, 40]])

In [2]:
U, S, Vt = np.linalg.svd(A, full_matrices=True)
U.shape, S.shape, Vt.shape

((15, 15), (5,), (5, 5))

In [3]:
np.round(U,1)

array([[-0.2, -0.1, -0.3,  0.4,  0.2, -0.2, -0.2, -0.1, -0.3, -0.2, -0.3,
        -0.2, -0.1, -0.5, -0.2],
       [-0.2, -0. , -0.3, -0.5,  0. , -0.2,  0.1,  0. ,  0.6,  0.1, -0.2,
        -0.1, -0.2, -0.3,  0.2],
       [-0.1,  0.2,  0.1, -0.2, -0.2,  0.1, -0.4, -0. , -0.1, -0.3,  0.1,
        -0.5, -0.5,  0.2, -0.2],
       [-0.3,  0.5,  0.1,  0. , -0.4, -0.3, -0.1, -0.3, -0.1,  0.2, -0.4,
         0.3,  0.1,  0.1, -0.2],
       [-0.2, -0.1, -0.4, -0.3, -0.3, -0.1,  0.2,  0.4, -0.6,  0.1, -0. ,
        -0.1,  0.1,  0.2,  0.2],
       [-0.2,  0.3, -0.3, -0. ,  0.1,  0.9, -0. , -0. , -0.1,  0.1, -0.2,
         0.1,  0. , -0.1, -0. ],
       [-0.4, -0. ,  0.3,  0. ,  0. ,  0. ,  0.8, -0.1, -0.1, -0.1,  0. ,
        -0.1, -0.1, -0. , -0.2],
       [-0.2,  0.2,  0.3,  0.1,  0.3, -0.1, -0.1,  0.8,  0.1,  0. , -0.1,
         0.1, -0. , -0. , -0.1],
       [-0.3, -0.1, -0.2,  0.5, -0.6,  0.1,  0. ,  0.2,  0.4, -0.1,  0.2,
        -0.1,  0.1, -0. , -0. ],
       [-0.2, -0.4,  0.1,  0.1, -0. ,

In [4]:
np.round(S,1)

array([201.6,  72.6,  63.6,  59.4,  31. ])

In [5]:
np.round(Vt,1)

array([[-0.4, -0.3, -0.4, -0.6, -0.5],
       [ 0.1,  0.3, -0.4,  0.6, -0.7],
       [-0.8,  0.2, -0.3,  0.3,  0.4],
       [ 0.4,  0.2, -0.8, -0.2,  0.4],
       [ 0.1, -0.8, -0.2,  0.5,  0.2]])

In [6]:
U, S, Vt = np.linalg.svd(A, full_matrices=False)
U.shape, S.shape, Vt.shape

((15, 5), (5,), (5, 5))

In [7]:
np.round(U,1)

array([[-0.2, -0.1, -0.3,  0.4,  0.2],
       [-0.2, -0. , -0.3, -0.5,  0. ],
       [-0.1,  0.2,  0.1, -0.2, -0.2],
       [-0.3,  0.5,  0.1,  0. , -0.4],
       [-0.2, -0.1, -0.4, -0.3, -0.3],
       [-0.2,  0.3, -0.3, -0. ,  0.1],
       [-0.4, -0. ,  0.3,  0. ,  0. ],
       [-0.2,  0.2,  0.3,  0.1,  0.3],
       [-0.3, -0.1, -0.2,  0.5, -0.6],
       [-0.2, -0.4,  0.1,  0.1, -0. ],
       [-0.3,  0.4, -0.3, -0. ,  0.2],
       [-0.3, -0.4,  0.1, -0.2, -0.1],
       [-0.3, -0.2,  0.2, -0.3,  0.1],
       [-0.2, -0.1, -0.3,  0.2,  0.4],
       [-0.3,  0.1,  0.2,  0.2,  0.1]])

In [8]:
np.round(S,1)

array([201.6,  72.6,  63.6,  59.4,  31. ])

In [9]:
np.round(Vt,1)

array([[-0.4, -0.3, -0.4, -0.6, -0.5],
       [ 0.1,  0.3, -0.4,  0.6, -0.7],
       [-0.8,  0.2, -0.3,  0.3,  0.4],
       [ 0.4,  0.2, -0.8, -0.2,  0.4],
       [ 0.1, -0.8, -0.2,  0.5,  0.2]])

In [11]:
v_1 = [-1, 1]
v_2 = [1, 1]
np.dot(v_1, v_2)

0

### (B) TruncatedSVD

In [1]:
import numpy as np
A = np.random.randint(low=1, high=50, size=(5,10))
A

array([[33,  2, 11,  9, 40, 21, 12, 23, 26,  1],
       [17, 17,  1, 21, 24, 10, 11, 37, 37, 23],
       [16, 21, 17,  1, 21, 48, 17,  5, 32,  3],
       [12,  7, 33,  6, 22, 48, 47, 12, 49, 17],
       [49, 31,  1,  1, 23, 35, 42, 13,  8,  1]])

In [2]:
from sklearn.decomposition import TruncatedSVD

# Create SVD object
svd = TruncatedSVD(n_components=3, n_iter=10, random_state=2)
svd.fit(A)


In [3]:
np.round(svd.components_,2)

array([[ 0.35,  0.22,  0.2 ,  0.1 ,  0.36,  0.48,  0.39,  0.24,  0.44,
         0.13],
       [ 0.65,  0.3 , -0.4 , -0.02,  0.16, -0.16,  0.03,  0.13, -0.48,
        -0.2 ],
       [-0.02, -0.1 , -0.16,  0.34,  0.26, -0.44, -0.37,  0.55,  0.29,
         0.26]])

In [4]:
# Get Singular values and Components
Sigma = svd.singular_values_
np.round(Sigma,1)

array([157.1,  50.9,  50.8])

In [6]:
np.round(svd.explained_variance_ratio_,2)

array([0.09, 0.37, 0.36])

In [7]:
svd.explained_variance_ratio_.sum()

0.8204015857960876

In [8]:
from sklearn.utils.extmath import randomized_svd
U, Sigma, VT = randomized_svd(A,
                              n_components=3,
                              random_state=2)

In [9]:
A.shape, U.shape, Sigma.shape, VT.shape

((5, 10), (5, 3), (3,), (3, 10))

In [10]:
np.round(U,1)

array([[ 0.4,  0.2,  0.3],
       [ 0.4,  0. ,  0.8],
       [ 0.4, -0.2, -0.3],
       [ 0.6, -0.6, -0.2],
       [ 0.5,  0.7, -0.4]])

In [11]:
np.round(Sigma,1)

array([157.1,  50.9,  50.8])

In [12]:
np.round(VT,1)

array([[ 0.4,  0.2,  0.2,  0.1,  0.4,  0.5,  0.4,  0.2,  0.4,  0.1],
       [ 0.6,  0.3, -0.4, -0. ,  0.2, -0.2,  0. ,  0.1, -0.5, -0.2],
       [-0. , -0.1, -0.2,  0.3,  0.3, -0.4, -0.4,  0.5,  0.3,  0.3]])

### (C) Pre-processing

In [14]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
path = '../Dataset'
train = pd.read_csv(path + "/ag_news_train.csv")

In [16]:
train.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
1,3,Carlyle Looks Toward Commercial Aerospace (Reuters),"Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market."
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
3,3,Iraq Halts Oil Exports from Main Southern Pipeline (Reuters),"Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday."
4,3,"Oil prices soar to all-time record, posing new menace to US economy (AFP)","AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential elections."


In [21]:
docs = train['Description'][0:5]
docs

0                                                                                                                            Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
1    Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.
2                                  Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
3                       Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday.
4                                                          AFP - Tearaway world oil prices, toppling records and straini

In [15]:
# Class Index = 1 is World
train[train['Class Index']==1][0:2]

Unnamed: 0,Class Index,Title,Description
492,1,Venezuelans Vote Early in Referendum on Chavez Rule (Reuters),Reuters - Venezuelans turned out early\and in large numbers on Sunday to vote in a historic referendum\that will either remove left-wing President Hugo Chavez from\office or give him a new mandate to govern for the next two\years.
493,1,S.Koreans Clash with Police on Iraq Troop Dispatch (Reuters),"Reuters - South Korean police used water cannon in\central Seoul Sunday to disperse at least 7,000 protesters\urging the government to reverse a controversial decision to\send more troops to Iraq."


In [17]:
# Class Index = 2 is Sports
train[train['Class Index']==2][0:2]

Unnamed: 0,Class Index,Title,Description
448,2,"Phelps, Thorpe Advance in 200 Freestyle (AP)","AP - Michael Phelps took care of qualifying for the Olympic 200-meter freestyle semifinals Sunday, and then found out he had been added to the American team for the evening's 400 freestyle relay final. Phelps' rivals Ian Thorpe and Pieter van den Hoogenband and teammate Klete Keller were faster than the teenager in the 200 free preliminaries."
449,2,Reds Knock Padres Out of Wild-Card Lead (AP),"AP - Wily Mo Pena homered twice and drove in four runs, helping the Cincinnati Reds beat the San Diego Padres 11-5 on Saturday night. San Diego was knocked out of a share of the NL wild-card lead with the loss and Chicago's victory over Los Angeles earlier in the day."


In [18]:
# Class Index = 3 is Business
train[train['Class Index']==3][0:2]

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
1,3,Carlyle Looks Toward Commercial Aerospace (Reuters),"Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market."


In [19]:
# Class Index = 4 is Sci/Tech.
train[train['Class Index']==4][0:2]

Unnamed: 0,Class Index,Title,Description
78,4,"'Madden,' 'ESPN' Football Score in Different Ways (Reuters)","Reuters - Was absenteeism a little high\on Tuesday among the guys at the office? EA Sports would like\to think it was because ""Madden NFL 2005"" came out that day,\and some fans of the football simulation are rabid enough to\take a sick day to play it."
79,4,Group to Propose New High-Speed Wireless Format (Reuters),"Reuters - A group of technology companies\including Texas Instruments Inc. (TXN.N), STMicroelectronics\(STM.PA) and Broadcom Corp. (BRCM.O), on Thursday said they\will propose a new wireless networking standard up to 10 times\the speed of the current generation."


### Pre-processing

In [22]:
import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [23]:
B = train['Description'][0:5]
B

0                                                                                                                            Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
1    Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.
2                                  Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
3                       Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday.
4                                                          AFP - Tearaway world oil prices, toppling records and straini

In [24]:
# Initialize regex tokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize document using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(lowercase=True,
                       stop_words='english',
                       ngram_range = (1,1),
                       tokenizer = tokenizer.tokenize)


# Fit and Transform the documents
train_data = tfidf.fit_transform(B)



In [25]:
len(tfidf.get_feature_names_out())

77

In [26]:
terms = tfidf.get_feature_names_out()
terms

array(['afp', 'authorities', 'band', 'barely', 'bets', 'carlyle',
       'controversial', 'crude', 'cynics', 'defense', 'depth', 'doldrums',
       'dwindling', 'earnings', 'economic', 'economy', 'elections',
       'expected', 'export', 'firm', 'flows', 'green', 'group', 'halted',
       'hang', 'industry', 'infrastructure', 'intelligence', 'investment',
       'iraq', 'main', 'making', 'market', 'menace', 'militia', 'months',
       'new', 'occasionally', 'official', 'oil', 'outlook', 'pipeline',
       'placed', 'plays', 'plus', 'present', 'presidential', 'prices',
       'private', 'quietly', 'rebel', 'records', 'reputation', 'reuters',
       's', 'said', 'saturday', 'seeing', 'sellers', 'short', 'showed',
       'soaring', 'southern', 'stock', 'straining', 'street', 'strike',
       'summer', 'tearaway', 'timed', 'toppling', 'ultra', 'wall',
       'wallets', 'week', 'world', 'worries'], dtype=object)

In [27]:
# Create SVD object
lsi = TruncatedSVD(n_components=3, n_iter=100, random_state=42)

# Fit SVD model on data
lsi.fit(train_data)


In [30]:
lsi2 = TruncatedSVD(n_components=3)

In [31]:
lsi2.fit_transform(train_data)

array([[ 0.29320447,  0.33087331,  0.85949648],
       [ 0.41874853,  0.56863882, -0.25174065],
       [ 0.54509984,  0.33769411, -0.34738801],
       [ 0.53032953, -0.44971486,  0.21719312],
       [ 0.52500024, -0.53468575, -0.1379332 ]])

In [28]:
# Get Singular values and Components
Sigma = lsi.singular_values_
Sigma

array([1.05609076, 1.01734796, 0.99447695])

In [32]:
Sigma2 = lsi2.singular_values_
Sigma2

array([1.05609076, 1.01734796, 0.99447695])

In [33]:
lsi.explained_variance_ratio_.sum()

0.5229249537209735

In [34]:
V_transpose = lsi.components_
V_transpose

array([[ 0.11658396,  0.10653371,  0.07814384,  0.11658396,  0.0911447 ,
         0.0911447 ,  0.0911447 ,  0.12366413,  0.07814384,  0.0911447 ,
         0.12366413,  0.12366413,  0.07814384,  0.12366413,  0.11658396,
         0.12366413,  0.11658396,  0.12366413,  0.10653371,  0.0911447 ,
         0.10653371,  0.07814384,  0.0911447 ,  0.10653371,  0.12366413,
         0.0911447 ,  0.10653371,  0.10653371,  0.0911447 ,  0.10653371,
         0.10653371,  0.0911447 ,  0.17330641,  0.11658396,  0.10653371,
         0.11658396,  0.11658396,  0.0911447 ,  0.10653371,  0.26596066,
         0.12366413,  0.10653371,  0.0911447 ,  0.0911447 ,  0.12366413,
         0.11658396,  0.11658396,  0.19383064,  0.0911447 ,  0.0911447 ,
         0.10653371,  0.11658396,  0.0911447 ,  0.22506366,  0.07814384,
         0.10653371,  0.10653371,  0.07814384,  0.07814384,  0.07814384,
         0.10653371,  0.12366413,  0.10653371,  0.12366413,  0.11658396,
         0.07814384,  0.10653371,  0.12366413,  0.1

In [35]:
V_transpose.shape

(3, 77)

In [44]:
from sklearn.utils.extmath import randomized_svd
U, Sigma, VT = randomized_svd(train_data,
                              n_components=3,
                              random_state=2)

In [37]:
train_data.shape, U.shape, Sigma.shape, VT.shape

((5, 77), (5, 3), (3,), (3, 77))

In [38]:
U.shape

(5, 3)

In [39]:
Sigma.shape

(3,)

In [40]:
VT.shape

(3, 77)

In [41]:
lsi.components_.shape

(3, 77)

In [42]:
U

array([[ 0.27763189,  0.32523121,  0.86426989],
       [ 0.39650809,  0.55894231, -0.25313875],
       [ 0.51614867,  0.3319357 , -0.3493173 ],
       [ 0.50216283, -0.44204626,  0.21839935],
       [ 0.49711659, -0.52556821, -0.13869924]])

In [43]:
VT

array([[ 0.11658396,  0.10653371,  0.07814384,  0.11658396,  0.0911447 ,
         0.0911447 ,  0.0911447 ,  0.12366413,  0.07814384,  0.0911447 ,
         0.12366413,  0.12366413,  0.07814384,  0.12366413,  0.11658396,
         0.12366413,  0.11658396,  0.12366413,  0.10653371,  0.0911447 ,
         0.10653371,  0.07814384,  0.0911447 ,  0.10653371,  0.12366413,
         0.0911447 ,  0.10653371,  0.10653371,  0.0911447 ,  0.10653371,
         0.10653371,  0.0911447 ,  0.17330641,  0.11658396,  0.10653371,
         0.11658396,  0.11658396,  0.0911447 ,  0.10653371,  0.26596066,
         0.12366413,  0.10653371,  0.0911447 ,  0.0911447 ,  0.12366413,
         0.11658396,  0.11658396,  0.19383064,  0.0911447 ,  0.0911447 ,
         0.10653371,  0.11658396,  0.0911447 ,  0.22506366,  0.07814384,
         0.10653371,  0.10653371,  0.07814384,  0.07814384,  0.07814384,
         0.10653371,  0.12366413,  0.10653371,  0.12366413,  0.11658396,
         0.07814384,  0.10653371,  0.12366413,  0.1

In [45]:
VT_names = pd.DataFrame(VT, columns=terms).round(2)
VT_names

Unnamed: 0,afp,authorities,band,barely,bets,carlyle,controversial,crude,cynics,defense,...,summer,tearaway,timed,toppling,ultra,wall,wallets,week,world,worries
0,0.12,0.11,0.08,0.12,0.09,0.09,0.09,0.12,0.08,0.09,...,0.12,0.12,0.09,0.12,0.08,0.08,0.12,0.12,0.12,0.12
1,-0.13,-0.1,0.1,-0.13,0.13,0.13,0.13,0.08,0.1,0.13,...,0.08,-0.13,0.13,-0.13,0.1,0.1,-0.13,0.08,-0.13,0.08
2,-0.03,0.05,0.26,-0.03,-0.06,-0.06,-0.06,-0.09,0.26,-0.06,...,-0.09,-0.03,-0.06,-0.03,0.26,0.26,-0.03,-0.09,-0.03,-0.09


In [46]:
VT_names[0:1].sort_values(by=0, axis=1, ascending=False)

Unnamed: 0,oil,reuters,prices,market,wallets,presidential,menace,months,new,world,...,cynics,street,seeing,green,s,ultra,wall,dwindling,band,short
0,0.27,0.23,0.19,0.17,0.12,0.12,0.12,0.12,0.12,0.12,...,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08,0.08


In [47]:
VT_names[1:2].sort_values(by=1, axis=1, ascending=False)

Unnamed: 0,market,industry,timed,private,reputation,plays,placed,occasionally,making,investment,...,records,straining,tearaway,presidential,menace,months,new,present,afp,oil
1,0.17,0.13,0.13,0.13,0.13,0.13,0.13,0.13,0.13,0.13,...,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.26


In [48]:
VT_names[2:3].sort_values(by=2, axis=1, ascending=False)

Unnamed: 0,seeing,cynics,s,sellers,short,dwindling,street,ultra,wall,band,...,earnings,stock,soaring,economy,outlook,hang,plus,worries,prices,market
2,0.26,0.26,0.26,0.26,0.26,0.26,0.26,0.26,0.26,0.26,...,-0.09,-0.09,-0.09,-0.09,-0.09,-0.09,-0.09,-0.09,-0.1,-0.12


In [51]:
lsi.components_

array([[ 0.11658396,  0.10653371,  0.07814384,  0.11658396,  0.0911447 ,
         0.0911447 ,  0.0911447 ,  0.12366413,  0.07814384,  0.0911447 ,
         0.12366413,  0.12366413,  0.07814384,  0.12366413,  0.11658396,
         0.12366413,  0.11658396,  0.12366413,  0.10653371,  0.0911447 ,
         0.10653371,  0.07814384,  0.0911447 ,  0.10653371,  0.12366413,
         0.0911447 ,  0.10653371,  0.10653371,  0.0911447 ,  0.10653371,
         0.10653371,  0.0911447 ,  0.17330641,  0.11658396,  0.10653371,
         0.11658396,  0.11658396,  0.0911447 ,  0.10653371,  0.26596066,
         0.12366413,  0.10653371,  0.0911447 ,  0.0911447 ,  0.12366413,
         0.11658396,  0.11658396,  0.19383064,  0.0911447 ,  0.0911447 ,
         0.10653371,  0.11658396,  0.0911447 ,  0.22506366,  0.07814384,
         0.10653371,  0.10653371,  0.07814384,  0.07814384,  0.07814384,
         0.10653371,  0.12366413,  0.10653371,  0.12366413,  0.11658396,
         0.07814384,  0.10653371,  0.12366413,  0.1

In [52]:
lsi.explained_variance_ratio_.sum()

0.5229249537209735

In [53]:
lsi2.explained_variance_ratio_.sum()

0.5229249537209736

In [54]:
# Print the topics with their terms
for index, component in enumerate(lsi.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:5]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['oil', 'reuters', 'prices', 'market', 'crude']
Topic 1:  ['market', 'bets', 'carlyle', 'controversial', 'defense']
Topic 2:  ['cynics', 'dwindling', 'green', 's', 'seeing']
