In [52]:
import numpy as np
import pandas as pd

from nlpia.data.loaders import get_data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

from nltk.tokenize.casual import casual_tokenize

In [2]:
# Toy example: give each of six words a random stand-in TF-IDF weight
# drawn uniformly from [0, 1). No seed is set, so values differ per run.
tfidf = dict(zip(['cat', 'dog', 'apple', 'lion', 'NYC', 'love'],
                 np.random.rand(6)))
tfidf

{'cat': 0.8846339329925238,
 'dog': 0.6482642989845245,
 'apple': 0.8902145816736708,
 'lion': 0.687566454036034,
 'NYC': 0.9859433871722367,
 'love': 0.5587909625950283}

In [3]:
# Hand-picked topic model: each topic score is a weighted sum of the word
# TF-IDF values. 'petness' leans on "cat" and "dog" and ignores "apple"/"lion".
topic = {}
topic['petness'] = (
    0.3 * tfidf['cat']
    + 0.3 * tfidf['dog']
    + 0.0 * tfidf['apple']
    + 0.0 * tfidf['lion']
    + 0.2 * tfidf['NYC']
    + 0.2 * tfidf['love']
)

In [4]:
# 'animalness' gives "lion" the dominant weight (0.5) and every other word 0.1.
topic['animalness'] = (
    0.1 * tfidf['cat']
    + 0.1 * tfidf['dog']
    + 0.1 * tfidf['apple']
    + 0.5 * tfidf['lion']
    + 0.1 * tfidf['NYC']
    + 0.1 * tfidf['love']
)

In [5]:
# 'cityness' is dominated by "NYC" (0.5), with "apple" next (0.2); "cat" contributes nothing.
topic['cityness'] = (
    0.0 * tfidf['cat']
    + 0.1 * tfidf['dog']
    + 0.2 * tfidf['apple']
    + 0.1 * tfidf['lion']
    + 0.5 * tfidf['NYC']
    + 0.1 * tfidf['love']
)

In [6]:
# Display the three topic scores computed in the cells above.
topic

{'petness': 0.7688163395465675,
 'animalness': 0.7405679433598155,
 'cityness': 0.8604767814824112}

In [7]:
# Flipping the relationship: express each word as a weighted sum of the three
# topic scores, reusing the same weights that were used to build the topics.
word_vector = {
    'cat':   0.3 * topic['petness'] + 0.1 * topic['animalness'] + 0.0 * topic['cityness'],
    'dog':   0.3 * topic['petness'] + 0.1 * topic['animalness'] + 0.1 * topic['cityness'],
    'apple': 0.0 * topic['petness'] + 0.1 * topic['animalness'] + 0.2 * topic['cityness'],
    'lion':  0.0 * topic['petness'] + 0.5 * topic['animalness'] + 0.1 * topic['cityness'],
    'NYC':   0.2 * topic['petness'] + 0.1 * topic['animalness'] + 0.5 * topic['cityness'],
    'love':  0.2 * topic['petness'] + 0.1 * topic['animalness'] + 0.1 * topic['cityness'],
}

In [23]:
# Widen the pandas printout to 120 characters so the long SMS text column
# is easier to read when a DataFrame is displayed.
pd.set_option('display.width', 120)

In [24]:
# Load the 'sms-spam' dataset through nlpia's loader and show its dimensions.
sms = get_data('sms-spam')
sms.shape

(4837, 2)

In [25]:
# Preview the first three rows of the freshly loaded frame.
sms.head(3)

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [26]:
# Build a row label for every message, flagging spam by appending an
# exclamation point "!" to the label (the spam value is used as the repeat count).
index = [f'sms{row}{"!" * is_spam}'
         for row, is_spam in enumerate(sms.spam)]
index[:5]

['sms0', 'sms1', 'sms2!', 'sms3', 'sms4']

In [27]:
# Rebuild the frame with the "sms<N>[!]" labels as its index. Going through
# `.values` yields a generic array, so the spam column is cast back to int64.
sms = pd.DataFrame(
    sms.values,
    columns=sms.columns,
    index=index,
).astype({'spam': np.int64})

In [31]:
# Frame dimensions, plus the sum of the spam column (with 0/1 labels this is the number of spam messages).
sms.shape, sms.spam.sum()

((4837, 2), 638)

In [30]:
# Preview the first three rows again, now showing the new row labels.
sms.head(3)

Unnamed: 0,spam,text
sms0,0,"Go until jurong point, crazy.. Available only ..."
sms1,0,Ok lar... Joking wif u oni...
sms2!,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [36]:
# Vectorize every SMS into a TF-IDF row, tokenizing with NLTK's casual tokenizer.
# The sparse result is converted to a dense array (documents x vocabulary terms).
tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = (
    tfidf_model
    .fit_transform(raw_documents=sms.text)
    .toarray()
)
tfidf_docs.shape

(4837, 9232)

In [40]:
# Boolean row selector: True where the message is labelled spam.
mask = sms['spam'].astype(bool).to_numpy()
# Column-wise mean TF-IDF vector of each class.
spam_centroid = np.mean(tfidf_docs[mask], axis=0)
ham_centroid = np.mean(tfidf_docs[np.logical_not(mask)], axis=0)

In [41]:
# Show the spam centroid rounded to two decimal places.
spam_centroid.round(2)

array([0.06, 0.  , 0.  , ..., 0.  , 0.  , 0.  ])

In [44]:
# Show the ham (non-spam) centroid rounded to two decimal places.
ham_centroid.round(2)

array([0.02, 0.01, 0.  , ..., 0.  , 0.  , 0.  ])

In [45]:
# Project every document onto the direction pointing from the ham centroid
# to the spam centroid; larger values mean the message looks more like spam.
spamminess_score = np.dot(tfidf_docs, spam_centroid - ham_centroid)

In [46]:
# Rescale the raw projection to the [0, 1] range. MinMaxScaler expects a 2-D
# input, so the 1-D scores are presented as a single column.
sms['lda_score'] = MinMaxScaler().fit_transform(spamminess_score[:, np.newaxis])

In [49]:
# Predict spam (1) when the scaled score is strictly above 0.5, otherwise ham (0).
sms['lda_predict'] = sms['lda_score'].gt(0.5).astype(np.int64)
# Compare the true label with the prediction and score for the first six messages.
sms[['spam', 'lda_predict', 'lda_score']].head(6).round(2)

Unnamed: 0,spam,lda_predict,lda_score
sms0,0,0,0.23
sms1,0,0,0.18
sms2!,1,1,0.72
sms3,0,0,0.18
sms4,0,0,0.29
sms5!,1,1,0.55


In [51]:
# Accuracy = 1 - (number of misclassified messages / total number of messages).
# With 0/1 labels, |spam - lda_predict| is 1 exactly where the prediction is wrong.
num = np.abs(sms['spam'] - sms['lda_predict']).sum()
den = sms.shape[0]
round(1 - num / den, 3)

0.977

In [53]:
# Confusion matrix of true labels against the LDA predictions.
confusion_matrix(y_true=sms['spam'], y_pred=sms['lda_predict'])

array([[4135,   64],
       [  45,  593]])

In [61]:
from nlpia.book.examples.ch04_catdog_lsa_3x6x16 import word_topic_vectors
from nlpia.book.examples.ch04_catdog_lsa_sorted import lsa_models, prettify_tdm

In [58]:
# Transpose the imported word-topic vectors for display and round to one decimal place.
word_topic_vectors.T.round(1)

Unnamed: 0,cat,dog,apple,lion,nyc,love
top0,-0.6,-0.4,0.5,-0.3,0.4,-0.1
top1,-0.1,-0.3,-0.4,-0.1,0.1,0.8
top2,-0.3,0.8,-0.1,-0.5,0.0,0.1


In [62]:
# lsa_models() returns a pair, unpacked here as the bag-of-words and TF-IDF results.
# bow_svd is expanded into keyword arguments below, so it must be a mapping
# (presumably a dict of SVD artifacts -- verify against nlpia).
bow_svd, tfidf_svd = lsa_models()
prettify_tdm(**bow_svd)

100%|██████████| 263/263 [00:00<00:00, 69246.83it/s]


Unnamed: 0,cat,dog,apple,lion,nyc,love,text
0,,,1.0,,1.0,,NYC is the Big Apple.
1,,,1.0,,1.0,,NYC is known as the Big Apple.
2,,,,,1.0,1.0,I love NYC!
3,,,1.0,,1.0,,I wore a hat to the Big Apple party in NYC.
4,,,1.0,,1.0,,Come to NYC. See the Big Apple!
5,,,1.0,,,,Manhattan is called the Big Apple.
6,1.0,,,,,,New York is a big city for a small cat.
7,1.0,,,1.0,,,"The lion, a big cat, is the king of the jungle."
8,1.0,,,,,1.0,I love my pet cat.
9,,,,,1.0,1.0,I love New York City (NYC).


In [64]:
# Pull the 'tdm' entry out of bow_svd and display it; it is decomposed with SVD below.
tdm = bow_svd['tdm']
tdm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
cat,0,0,0,0,0,0,1,1,1,0,1
dog,0,0,0,0,0,0,0,0,0,0,1
apple,1,1,0,1,1,1,0,0,0,0,0
lion,0,0,0,0,0,0,0,1,0,0,0
nyc,1,1,1,1,1,0,0,0,0,1,0
love,0,0,1,0,0,0,0,0,1,1,0


In [73]:
# Full singular value decomposition of the term-document matrix: tdm = U * diag(s) * Vt.
U, s, Vt = np.linalg.svd(tdm, full_matrices=True)

# Left singular vectors, U, labelled with the row labels of tdm.
pd.DataFrame(data=U, index=tdm.index).round(2)

Unnamed: 0,0,1,2,3,4,5
cat,-0.04,0.83,-0.38,-0.0,0.11,-0.38
dog,-0.0,0.21,-0.18,-0.71,-0.39,0.52
apple,-0.62,-0.21,-0.51,0.0,0.49,0.27
lion,-0.0,0.21,-0.18,0.71,-0.39,0.52
nyc,-0.75,0.0,0.24,-0.0,-0.52,-0.32
love,-0.22,0.42,0.69,0.0,0.41,0.37


In [74]:
# Singular values, s. np.linalg.svd returns them as a 1-D array rather than as
# a diagonal matrix; they are shown here rounded to one decimal place.
s.round(1)

array([3.1, 2.2, 1.8, 1. , 0.8, 0.5])

In [75]:
# Expand the 1-D singular values into a rectangular "diagonal" matrix S with
# one row per row of U and one column per row of Vt, so that U.dot(S).dot(Vt)
# is a valid matrix product.
S = np.zeros((len(U), len(Vt)))
# Use NumPy directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0, where `pd.np.fill_diagonal` raises AttributeError.
np.fill_diagonal(S, s)
pd.DataFrame(S).round(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,3.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0


In [76]:
# Right singular vectors: Vt as returned by np.linalg.svd, shown rounded to two decimal places.
pd.DataFrame(Vt).round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.44,-0.44,-0.31,-0.44,-0.44,-0.2,-0.01,-0.01,-0.08,-0.31,-0.01
1,-0.09,-0.09,0.19,-0.09,-0.09,-0.09,0.37,0.47,0.56,0.19,0.47
2,-0.16,-0.16,0.52,-0.16,-0.16,-0.29,-0.22,-0.32,0.17,0.52,-0.32
3,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.71,-0.0,-0.0,-0.71
4,-0.04,-0.04,-0.14,-0.04,-0.04,0.58,0.13,-0.33,0.62,-0.14,-0.33
5,-0.09,-0.09,0.1,-0.09,-0.09,0.51,-0.73,0.27,-0.01,0.1,0.27
6,-0.57,0.21,0.11,0.33,-0.31,0.34,0.34,0.0,-0.34,0.23,0.0
7,-0.32,0.47,0.25,-0.63,0.41,0.07,0.07,0.0,-0.07,-0.18,0.0
8,-0.5,0.29,-0.2,0.41,0.16,-0.37,-0.37,-0.0,0.37,-0.17,0.0
9,-0.15,-0.15,-0.59,-0.15,0.42,0.04,0.04,-0.0,-0.04,0.63,-0.0


In [77]:
# Term-document matrix reconstruction error.
# Starting from the last singular value and working back to the first, zero one
# more diagonal entry of S per iteration, rebuild the matrix from U, S and Vt,
# and record the root-mean-square error against the original tdm.
# NOTE: S is modified in place, so every diagonal entry is zero once this cell
# finishes; re-run the cell that builds S before running this one again.
err = []

for numdim in range(len(s), 0, -1):
    S[numdim - 1, numdim - 1] = 0
    reconstructed_tdm = U.dot(S).dot(Vt)
    squared_error = ((reconstructed_tdm - tdm).values.flatten() ** 2).sum()
    # np.prod replaces np.product, which was deprecated in NumPy 1.25 and
    # removed in NumPy 2.0.
    err.append(np.sqrt(squared_error / np.prod(tdm.shape)))
np.array(err).round(2)

array([0.06, 0.12, 0.17, 0.28, 0.39, 0.55])