In [91]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt

from itertools import product

from nlpia.data.loaders import get_data

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA, TruncatedSVD, LatentDirichletAllocation as LDiA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

from nltk.tokenize.casual import casual_tokenize

In [3]:
# Toy TF-IDF vector: one random weight in [0, 1) for each of six words.
tfidf = {word: weight
         for word, weight in zip('cat dog apple lion NYC love'.split(),
                                 np.random.rand(6))}
tfidf

{'cat': 0.15760432404614122,
 'dog': 0.37772600170293946,
 'apple': 0.4623306803036108,
 'lion': 0.3473196277605357,
 'NYC': 0.8137353456799057,
 'love': 0.8895478972774691}

In [4]:
# Hand-picked word weights for the "petness" topic, applied to the toy TF-IDF vector.
# The terms are accumulated in the same word order as a written-out sum.
topic = {}
topic['petness'] = sum(
    weight * tfidf[word]
    for word, weight in [('cat', .3), ('dog', .3), ('apple', .0),
                         ('lion', .0), ('NYC', .2), ('love', .2)])

In [5]:
# Hand-picked word weights for the "animalness" topic ("lion" dominates).
topic['animalness'] = sum(
    weight * tfidf[word]
    for word, weight in [('cat', .1), ('dog', .1), ('apple', .1),
                         ('lion', .5), ('NYC', .1), ('love', .1)])

In [6]:
# Hand-picked word weights for the "cityness" topic ("NYC" dominates).
topic['cityness'] = sum(
    weight * tfidf[word]
    for word, weight in [('cat', .0), ('dog', .1), ('apple', .2),
                         ('lion', .1), ('NYC', .5), ('love', .1)])

In [7]:
topic

{'petness': 0.5012557463161992,
 'animalness': 0.44375423878127446,
 'cityness': 0.6607931615747695}

In [8]:
# Flipping the relationship: express every word as a weighted mix of the three topic scores.
# Each weight triple is ordered (petness, animalness, cityness).
word_vector = {
    word: sum(weight * topic[name]
              for name, weight in zip(('petness', 'animalness', 'cityness'), weights))
    for word, weights in [('cat',   (.3, .1, .0)),
                          ('dog',   (.3, .1, .1)),
                          ('apple', (.0, .1, .2)),
                          ('lion',  (.0, .5, .1)),
                          ('NYC',   (.2, .1, .5)),
                          ('love',  (.2, .1, .1))]
}

In [9]:
# Widen the display so the long SMS text column fits better in DataFrame printouts.
pd.set_option('display.width', 120)

In [10]:
# Load the labeled SMS corpus and show its (rows, columns) shape.
sms = get_data('sms-spam')
sms.shape

(4837, 2)

In [11]:
sms.head(3)

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [12]:
# Build the row labels "sms<i>", flagging each spam message by appending
# an exclamation point "!" to its label.
index = [f'sms{i}{"!" * j}' for i, j in enumerate(sms.spam)]
index[:5]

['sms0', 'sms1', 'sms2!', 'sms3', 'sms4']

In [13]:
# Rebuild the frame with the "sms<i>[!]" labels as its index.
# NOTE(review): `sms.values` presumably yields an object-dtype array for these mixed
# columns, hence the explicit cast of `spam` back to int64 — confirm.
sms = pd.DataFrame(sms.values, columns=sms.columns, index=index)
sms['spam'] = sms.spam.astype(np.int64)

In [14]:
sms.shape, sms.spam.sum()

((4837, 2), 638)

In [15]:
sms.head(3)

Unnamed: 0,spam,text
sms0,0,"Go until jurong point, crazy.. Available only ..."
sms1,0,Ok lar... Joking wif u oni...
sms2!,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [16]:
# Fit a TF-IDF vectorizer (tokenizing with casual_tokenize) on the SMS texts and
# convert the result to a dense documents-by-terms array.
tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf_model.fit_transform(raw_documents=sms.text).toarray()
tfidf_docs.shape

(4837, 9232)

In [17]:
# Boolean mask selecting the spam rows, then the mean TF-IDF vector (centroid) of each class.
mask = sms.spam.astype(bool).values
spam_centroid = tfidf_docs[mask].mean(axis=0)
ham_centroid = tfidf_docs[~mask].mean(axis=0)

In [18]:
spam_centroid.round(2)

array([0.06, 0.  , 0.  , ..., 0.  , 0.  , 0.  ])

In [19]:
ham_centroid.round(2)

array([0.02, 0.01, 0.  , ..., 0.  , 0.  , 0.  ])

In [20]:
# Project every TF-IDF vector onto the direction pointing from the ham centroid to the spam centroid.
spamminess_score = tfidf_docs.dot(spam_centroid - ham_centroid)

In [21]:
# Rescale the projections with MinMaxScaler (default range 0 to 1); reshape(-1, 1) supplies
# the single-column 2-D input the scaler expects.
sms['lda_score'] = MinMaxScaler().fit_transform(spamminess_score.reshape(-1, 1))

In [22]:
# Predict spam (1) when the scaled score exceeds 0.5, then compare with the true labels.
sms['lda_predict'] = (sms.lda_score > 0.5).astype(np.int64)
sms[['spam', 'lda_predict', 'lda_score']].round(2).head(6)

Unnamed: 0,spam,lda_predict,lda_score
sms0,0,0,0.23
sms1,0,0,0.18
sms2!,1,1,0.72
sms3,0,0,0.18
sms4,0,0,0.29
sms5!,1,1,0.55


In [23]:
# Accuracy = 1 - (number of misclassified messages / total number of messages).
num = (sms.spam - sms.lda_predict).abs().sum()
den = len(sms)
round(1 - num / den, 3)

0.977

In [24]:
# Confusion matrix: rows are the true labels, columns the predicted labels (0 = ham, 1 = spam).
confusion_matrix(sms['spam'], sms['lda_predict'])

array([[4135,   64],
       [  45,  593]])

In [25]:
from nlpia.book.examples.ch04_catdog_lsa_3x6x16 import word_topic_vectors
from nlpia.book.examples.ch04_catdog_lsa_sorted import lsa_models, prettify_tdm

100%|██████████| 263/263 [00:00<00:00, 89002.90it/s]
100%|██████████| 263/263 [00:00<00:00, 132298.15it/s]


In [26]:
word_topic_vectors.T.round(1)

Unnamed: 0,cat,dog,apple,lion,nyc,love
top0,-0.6,-0.4,0.5,-0.3,0.4,-0.1
top1,-0.1,-0.3,-0.4,-0.1,0.1,0.8
top2,-0.3,0.8,-0.1,-0.5,0.0,0.1


In [27]:
# Build the example LSA models and display the first one as a term table.
# NOTE(review): judging by the names, one model is built on bag-of-words counts and the
# other on TF-IDF; bow_svd is unpacked as keyword arguments, so it is a mapping — confirm
# details against nlpia.
bow_svd, tfidf_svd = lsa_models()
prettify_tdm(**bow_svd)

100%|██████████| 263/263 [00:00<00:00, 91188.06it/s]


Unnamed: 0,cat,dog,apple,lion,nyc,love,text
0,,,1.0,,1.0,,NYC is the Big Apple.
1,,,1.0,,1.0,,NYC is known as the Big Apple.
2,,,,,1.0,1.0,I love NYC!
3,,,1.0,,1.0,,I wore a hat to the Big Apple party in NYC.
4,,,1.0,,1.0,,Come to NYC. See the Big Apple!
5,,,1.0,,,,Manhattan is called the Big Apple.
6,1.0,,,,,,New York is a big city for a small cat.
7,1.0,,,1.0,,,"The lion, a big cat, is the king of the jungle."
8,1.0,,,,,1.0,I love my pet cat.
9,,,,,1.0,1.0,I love New York City (NYC).


In [28]:
# Term-document matrix of the bag-of-words model (displayed with terms as rows, documents as columns).
tdm = bow_svd['tdm']
tdm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
cat,0,0,0,0,0,0,1,1,1,0,1
dog,0,0,0,0,0,0,0,0,0,0,1
apple,1,1,0,1,1,1,0,0,0,0,0
lion,0,0,0,0,0,0,0,1,0,0,0
nyc,1,1,1,1,1,0,0,0,0,1,0
love,0,0,1,0,0,0,0,0,1,1,0


In [29]:
# Singular value decomposition of the term-document matrix: tdm = U @ diag(s) @ Vt.
U, s, Vt = np.linalg.svd(tdm)

# Left singular vectors, U (one row per term).
pd.DataFrame(U, index=tdm.index).round(2)

Unnamed: 0,0,1,2,3,4,5
cat,-0.04,0.83,-0.38,-0.0,0.11,-0.38
dog,-0.0,0.21,-0.18,-0.71,-0.39,0.52
apple,-0.62,-0.21,-0.51,0.0,0.49,0.27
lion,-0.0,0.21,-0.18,0.71,-0.39,0.52
nyc,-0.75,0.0,0.24,-0.0,-0.52,-0.32
love,-0.22,0.42,0.69,0.0,0.41,0.37


In [30]:
# Singular values, s: a 1-D array sorted from largest to smallest.
# (They are the diagonal entries of the matrix S in U @ S @ Vt, not a square matrix themselves.)
s.round(1)

array([3.1, 2.2, 1.8, 1. , 0.8, 0.5])

In [31]:
# Expand the 1-D singular values `s` into the rectangular diagonal matrix S of shape
# (len(U), len(Vt)) so that U @ S @ Vt reproduces the term-document matrix.
S = np.zeros((len(U), len(Vt)))
# Call NumPy directly: the `pd.np` alias was deprecated in pandas 1.0 and removed in 2.0.
np.fill_diagonal(S, s)
pd.DataFrame(S).round(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,3.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0


In [32]:
# Right singular vectors, Vt (each row is one singular vector; columns correspond to documents).
pd.DataFrame(Vt).round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.44,-0.44,-0.31,-0.44,-0.44,-0.2,-0.01,-0.01,-0.08,-0.31,-0.01
1,-0.09,-0.09,0.19,-0.09,-0.09,-0.09,0.37,0.47,0.56,0.19,0.47
2,-0.16,-0.16,0.52,-0.16,-0.16,-0.29,-0.22,-0.32,0.17,0.52,-0.32
3,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.71,-0.0,-0.0,-0.71
4,-0.04,-0.04,-0.14,-0.04,-0.04,0.58,0.13,-0.33,0.62,-0.14,-0.33
5,-0.09,-0.09,0.1,-0.09,-0.09,0.51,-0.73,0.27,-0.01,0.1,0.27
6,-0.57,0.21,0.11,0.33,-0.31,0.34,0.34,0.0,-0.34,0.23,0.0
7,-0.32,0.47,0.25,-0.63,0.41,0.07,0.07,0.0,-0.07,-0.18,0.0
8,-0.5,0.29,-0.2,0.41,0.16,-0.37,-0.37,-0.0,0.37,-0.17,0.0
9,-0.15,-0.15,-0.59,-0.15,0.42,0.04,0.04,-0.0,-0.04,0.63,-0.0


In [33]:
# Term-document matrix reconstruction error.
# Zero out the singular values one at a time, smallest first, and record the
# root-mean-square error of the reduced reconstruction after each step.
S_truncated = S.copy()  # work on a copy: S stays intact and the cell can be re-run safely
err = []

for numdim in range(len(s), 0, -1):
    S_truncated[numdim - 1, numdim - 1] = 0
    reconstructed_tdm = U.dot(S_truncated).dot(Vt)
    squared_error = ((reconstructed_tdm - tdm).values.flatten() ** 2).sum()
    # np.prod replaces np.product, which was deprecated in NumPy 1.25 and removed in 2.0.
    err.append(np.sqrt(squared_error / np.prod(tdm.shape)))

# X[i] is the index (1-based) of the last singular value zeroed when err[i] was measured.
X = list(range(len(s), 0, -1))
y = np.array(err).round(2)

In [34]:
# Plot the reconstruction error (y) against X, the 1-based index of the last singular value zeroed out.
plt.plot(X, y)

[<matplotlib.lines.Line2D at 0x11b8d0be0>]

## Principal Component Analysis

In [35]:
# Limit DataFrame printouts to 6 columns; wider frames are elided with "...".
pd.set_option('display.max_columns', 6)

In [36]:
# Reload the SMS corpus from scratch and re-apply the "sms<i>[!]" row labels
# ("!" marks a spam message).
sms = get_data('sms-spam')
index = [f'sms{i}{"!" * j}'
         for (i, j) in zip(range(len(sms)), sms.spam)]
sms.index = index
sms.head(6)

Unnamed: 0,spam,text
sms0,0,"Go until jurong point, crazy.. Available only in bu..."
sms1,0,Ok lar... Joking wif u oni...
sms2!,1,Free entry in 2 a wkly comp to win FA Cup final tkt...
sms3,0,U dun say so early hor... U c already then say...
sms4,0,"Nah I don't think he goes to usf, he lives around h..."
sms5!,1,FreeMsg Hey there darling it's been 3 week's now an...


In [37]:
# Re-fit the TF-IDF vectorizer and report the vocabulary size.
# Note: `tfidf` now names the vectorizer, replacing the toy dict of the same name defined earlier.
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
len(tfidf.vocabulary_)

9232

In [38]:
# Center the data: subtract each term column's mean so every column averages zero.
tfidf_docs = pd.DataFrame(tfidf_docs)
tfidf_docs = tfidf_docs - tfidf_docs.mean()
tfidf_docs.shape

(4837, 9232)

In [39]:
sms.spam.sum()

638

## Using PCA for SMS message semantic analysis.

In [40]:
# Reduce the centered TF-IDF vectors to 16 principal components ("topics") and
# wrap the result in a DataFrame labeled by message index and topic name.
pca = PCA(n_components=16)
pca = pca.fit(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
columns = ['topic{}'.format(i) for i in range(pca.n_components)]
pca_topic_vectors = pd.DataFrame(pca_topic_vectors, 
                                 columns=columns,
                                 index=index)
pca_topic_vectors.round(3).head(6)

Unnamed: 0,topic0,topic1,topic2,...,topic13,topic14,topic15
sms0,0.201,0.003,0.037,...,-0.037,-0.014,0.042
sms1,0.404,-0.094,-0.078,...,-0.02,0.047,-0.054
sms2!,-0.03,-0.048,0.09,...,-0.023,-0.044,0.048
sms3,0.329,-0.033,-0.035,...,-0.044,0.022,-0.071
sms4,0.002,0.031,0.038,...,0.036,-0.081,-0.016
sms5!,-0.016,0.059,0.014,...,0.074,0.005,0.034


In [41]:
# Recover the vocabulary in TF-IDF column order. Sort on the column index rather than on
# the token text so the labels are guaranteed to line up with the matrix columns.
column_nums, terms = zip(*sorted(
    (column_num, term) for term, column_num in tfidf.vocabulary_.items()))
column_nums[:5], terms[:5]

((0, 1, 2, 3, 4), ('!', '"', '#', '#150', '#5000'))

In [42]:
# Topic-term weights: one row per principal component, one column per vocabulary term.
# The row labels follow the fitted model's component count instead of a hard-coded 16,
# so the cell stays correct if n_components is changed.
weights = pd.DataFrame(pca.components_,
                       columns=terms,
                       index=['topic{}'.format(i) for i in range(pca.n_components)])

In [43]:
# Show up to 8 columns and inspect the term weights of the first four topics.
pd.options.display.max_columns = 8
weights.head(4).round(3)

Unnamed: 0,!,"""",#,#150,...,…,┾,〨ud,鈥
topic0,-0.071,0.008,-0.001,-0.0,...,-0.002,0.001,0.001,0.001
topic1,0.063,0.008,0.0,-0.0,...,0.003,0.001,0.001,0.001
topic2,0.071,0.027,0.0,0.001,...,0.002,-0.001,-0.001,-0.001
topic3,-0.059,-0.032,-0.001,-0.0,...,0.001,0.001,0.001,0.001


In [44]:
# Weights (rounded, then scaled by 100) that each topic assigns to a handful of deal-related tokens.
pd.options.display.max_columns = 12
deals = weights['! ;) :) half off free crazy deal only $ 80 %'.split(' ')].round(3) * 100
deals.head(10)

Unnamed: 0,!,;),:),half,off,free,crazy,deal,only,$,80,%
topic0,-7.1,0.1,-0.5,-0.0,-0.4,-2.0,-0.0,-0.1,-2.2,0.3,-0.0,-0.0
topic1,6.3,0.0,7.4,0.1,0.4,-2.3,-0.2,-0.1,-3.8,-0.1,-0.0,-0.2
topic2,7.1,0.2,-0.1,0.0,0.3,4.4,0.1,-0.1,0.7,0.0,0.0,0.1
topic3,-5.9,-0.3,-7.1,0.2,0.3,-0.2,0.0,0.1,-2.3,0.1,-0.1,-0.3
topic4,38.1,-0.1,-12.4,-0.1,-0.2,9.9,0.1,-0.2,3.0,0.3,0.1,-0.1
topic5,-26.5,0.1,-1.5,-0.3,-0.7,-1.4,-0.6,-0.2,-1.8,-0.9,0.0,0.0
topic6,-10.9,-0.5,19.9,-0.4,-0.9,-0.6,-0.2,-0.1,-1.4,-0.0,-0.0,-0.1
topic7,16.1,0.1,-18.0,0.8,0.8,-2.9,0.0,0.0,-1.8,-0.3,0.0,-0.1
topic8,34.3,0.1,5.6,-0.5,-0.5,0.3,-0.4,-0.4,3.2,-0.6,-0.0,-0.2
topic9,7.8,-0.3,16.6,1.5,-0.9,6.7,-0.5,-0.4,3.0,-0.5,-0.0,-0.0


In [45]:
deals.T.sum()

topic0    -11.9
topic1      7.5
topic2     12.7
topic3    -15.5
topic4     38.4
topic5    -33.8
topic6      4.8
topic7     -5.3
topic8     40.9
topic9     33.0
topic10   -29.9
topic11    50.4
topic12    13.3
topic13    46.1
topic14    25.7
topic15     5.8
dtype: float64

In [46]:
# Same 16-topic reduction using truncated SVD on the (already mean-centered) TF-IDF matrix.
svd = TruncatedSVD(n_components=16, n_iter=100)
svd_topic_vectors = svd.fit_transform(tfidf_docs.values)
svd_topic_vectors = pd.DataFrame(svd_topic_vectors, 
                                 columns=columns, 
                                 index=index)
svd_topic_vectors.round(3).head(6)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,...,topic10,topic11,topic12,topic13,topic14,topic15
sms0,0.201,0.003,0.037,0.011,-0.019,-0.053,...,0.007,-0.007,0.002,-0.036,-0.014,0.037
sms1,0.404,-0.094,-0.078,0.051,0.1,0.047,...,-0.004,0.036,0.043,-0.021,0.051,-0.042
sms2!,-0.03,-0.048,0.09,-0.067,0.091,-0.043,...,0.125,0.023,0.026,-0.02,-0.042,0.052
sms3,0.329,-0.033,-0.035,-0.016,0.052,0.056,...,0.022,0.023,0.073,-0.046,0.022,-0.07
sms4,0.002,0.031,0.038,0.034,-0.075,-0.093,...,0.028,-0.009,0.027,0.034,-0.083,-0.021
sms5!,-0.016,0.059,0.014,-0.006,0.122,-0.04,...,0.041,0.055,-0.037,0.075,-0.001,0.02


## LSA for spam classification

- Check whether the cosine similarity between topic vectors correlates with membership in the same class.
- Here we compute the dot products between the normalized 16-D topic vectors of the first ten SMS messages.
- Observation: spam messages tend to have a larger positive cosine similarity with one another than with non-spam messages.

In [47]:
# Scale every topic vector to unit length so that dot products equal cosine similarities,
# then show the pairwise similarities among the first ten messages.
row_norms = np.linalg.norm(svd_topic_vectors, axis=1)
svd_topic_vectors = svd_topic_vectors.div(row_norms, axis=0)
first_ten = svd_topic_vectors.iloc[:10]
first_ten.dot(first_ten.T).round(1)

Unnamed: 0,sms0,sms1,sms2!,sms3,sms4,sms5!,sms6,sms7,sms8!,sms9!
sms0,1.0,0.6,-0.1,0.6,-0.0,-0.3,-0.3,-0.1,-0.3,-0.3
sms1,0.6,1.0,-0.2,0.8,-0.2,0.0,-0.2,-0.2,-0.1,-0.1
sms2!,-0.1,-0.2,1.0,-0.2,0.1,0.4,0.0,0.3,0.5,0.4
sms3,0.6,0.8,-0.2,1.0,-0.2,-0.3,-0.1,-0.3,-0.2,-0.1
sms4,-0.0,-0.2,0.1,-0.2,1.0,0.2,0.0,0.1,-0.4,-0.2
sms5!,-0.3,0.0,0.4,-0.3,0.2,1.0,-0.1,0.1,0.3,0.4
sms6,-0.3,-0.2,0.0,-0.1,0.0,-0.1,1.0,0.1,-0.2,-0.2
sms7,-0.1,-0.2,0.3,-0.3,0.1,0.1,0.1,1.0,0.1,0.4
sms8!,-0.3,-0.1,0.5,-0.2,-0.4,0.3,-0.2,0.1,1.0,0.3
sms9!,-0.3,-0.1,0.4,-0.1,-0.2,0.4,-0.2,0.4,0.3,1.0


## LDiA

Latent Dirichlet Allocation (LDiA). 

In [48]:
# Compute the mean number of tokens per document across the corpus,
# counting tokens with the same tokenizer used for the vectorizers.
total_corpus_len = 0
for document_text in sms.text:
    total_corpus_len += len(casual_tokenize(document_text))
mean_document_len = total_corpus_len / len(sms)
round(mean_document_len, 2)

21.35

In [49]:
# The same mean document length as a one-liner (unrounded).
sum([len(casual_tokenize(t)) for t in sms.text]) / len(sms.text)

21.34794293983874

In [52]:
# Seed NumPy's global random number generator.
# NOTE(review): presumably so the LDiA fits below, which set no random_state, are repeatable — confirm.
np.random.seed(42)

# Bag-of-words counts per message; the columns are then labeled with the vocabulary
# terms ordered by their column index.
counter = CountVectorizer(tokenizer=casual_tokenize)
bow_docs = pd.DataFrame(counter.fit_transform(raw_documents=sms.text).toarray(), index=index)
column_nums, terms = zip(*sorted(zip(counter.vocabulary_.values(), 
                                     counter.vocabulary_.keys())))
bow_docs.columns = terms

In [54]:
sms.loc['sms0'].text

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [55]:
bow_docs.loc['sms0'][bow_docs.loc['sms0'] > 0].head()

,            1
..           1
...          2
amore        1
available    1
Name: sms0, dtype: int64

In [59]:
# Create topic vectors for your SMS Corpus using LDiA.
# Fit a 16-topic model on the bag-of-words counts; components_ holds one row per topic
# and one column per vocabulary term.
ldia = LDiA(n_components=16, 
            learning_method='batch')
ldia = ldia.fit(bow_docs)
ldia.components_.shape

(16, 9232)

In [60]:
# Narrow the display, then arrange the topic-term weights with terms as rows and topics as columns.
pd.set_option('display.width', 75)
components = pd.DataFrame(ldia.components_.T, 
                          index=terms,
                          columns=columns)

In [61]:
components.round(2).head(3)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,...,topic10,topic11,topic12,topic13,topic14,topic15
!,105.81,109.5,0.07,91.84,48.99,153.85,...,0.06,120.43,380.43,4.73,11.49,0.06
"""",0.06,1.81,0.06,0.06,10.07,1.92,...,111.35,10.18,29.48,0.06,0.06,0.06
#,0.06,0.06,0.06,0.06,0.06,2.7,...,0.06,1.71,0.06,0.06,0.06,0.06


In [62]:
components.topic3.sort_values(ascending=False)[:10]

!       91.835649
.       86.173222
you     42.546065
£       30.304129
or      30.170413
to      27.497113
,       26.955386
and     22.574612
have    20.047433
in      19.448555
Name: topic3, dtype: float64

In [63]:
# Infer each message's 16-topic vector from its bag-of-words counts and wrap it in a labeled DataFrame.
ldia16_topic_vectors = ldia.transform(bow_docs)
ldia16_topic_vectors = pd.DataFrame(ldia16_topic_vectors, 
                                    index=index,
                                    columns=columns)
ldia16_topic_vectors.round(2).head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,...,topic10,topic11,topic12,topic13,topic14,topic15
sms0,0.0,0.0,0.32,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.64,0.0
sms1,0.01,0.01,0.9,0.01,0.01,0.01,...,0.01,0.01,0.01,0.01,0.01,0.01
sms2!,0.37,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
sms3,0.0,0.18,0.76,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
sms4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.3,0.0,0.64,0.0,0.0,0.0


## LDiA + LDA = spam classifier

In [67]:
# Hold out half of the messages for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(ldia16_topic_vectors, 
                                                    sms.spam, 
                                                    test_size=0.5,
                                                    random_state=271828)

In [68]:
# Train an LDA classifier on the LDiA topic vectors of the training half, store its
# predictions for all messages (training rows included), and report held-out accuracy.
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
sms['ldia16_spam'] = lda.predict(ldia16_topic_vectors)
round(float(lda.score(X_test, y_test)), 2)



0.94

In [73]:
# Every ordered pair of two different words drawn from the list.
word_list = list('abc')
all_pairs = [(first, second)
             for first in word_list
             for second in word_list
             if first != second]
all_pairs

[('a', 'b'), ('a', 'c'), ('b', 'a'), ('b', 'c'), ('c', 'a'), ('c', 'b')]

In [75]:
# Baseline: LDA trained directly on the full, mean-centered TF-IDF vectors
# (one feature per vocabulary term) using the same 50/50 split settings.
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
tfidf_docs = tfidf_docs - tfidf_docs.mean(axis=0)

X_train, X_test, y_train, y_test = train_test_split(tfidf_docs,
                                                    sms.spam.values,
                                                    test_size=0.5,
                                                    random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
# Accuracy on the training half; compare with the held-out accuracy computed in the next cell.
round(float(lda.score(X_train, y_train)), 3)



1.0

In [76]:
round(float(lda.score(X_test, y_test)), 3)

0.748

## 32 LDiA topics

In [77]:
# Repeat the LDiA fit with 32 topics instead of 16.
ldia32 = LDiA(n_components=32, 
              learning_method='batch')
ldia32 = ldia32.fit(bow_docs)
ldia32.components_.shape

(32, 9232)

In [78]:
# Infer the 32-topic vector of every message and label the columns topic0..topic31.
ldia32_topic_vectors = ldia32.transform(bow_docs)
columns32 = ['topic{}'.format(i)
             for i in range(ldia32.n_components)]
ldia32_topic_vectors = pd.DataFrame(ldia32_topic_vectors, 
                                    index=index,
                                    columns=columns32)
ldia32_topic_vectors.round(2).head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,...,topic26,topic27,topic28,topic29,topic30,topic31
sms0,0.0,0.72,0.14,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.05,0.0
sms1,0.0,0.76,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
sms2!,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
sms3,0.0,0.81,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
sms4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.69,0.0,0.0,0.0,0.0,0.0


In [79]:
# Same repeatable 50/50 split, now on the 32-topic vectors.
X_train, X_test, y_train, y_test = train_test_split(ldia32_topic_vectors,
                                                    sms.spam,
                                                    test_size=0.5,
                                                    random_state=271828)

In [80]:
# Train LDA on the 32-topic training half and store its predictions for all messages;
# the displayed shape is that of the training set.
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
sms['ldia32_spam'] = lda.predict(ldia32_topic_vectors)
X_train.shape



(2418, 32)

In [81]:
round(float(lda.score(X_train, y_train)), 3)

0.907

In [82]:
round(float(lda.score(X_test, y_test)), 3)

0.914

## Distance and Similarity

Typical conversions between distance and similarity:

```
similarity = 1 / (1 + distance)
distance = 1 / similarity - 1
```

If the distance and similarity scores range between 0 and 1, like probabilities, it is more common to use formulas like these:

```
similarity = 1 - distance
distance = 1 - similarity
```

Angular distance:
```python
import math
angular_distance = math.acos(cosine_similarity) / math.pi
distance = 1 / similarity - 1
similarity = 1 - distance
```

In [88]:
# Fit LDA on all TF-IDF vectors and predict on that same data (no held-out set),
# then measure the Euclidean distance between the true and predicted label vectors.
lda = LDA(n_components=1)
lda = lda.fit(tfidf_docs, sms.spam)
sms['lda_spamminess'] = lda.predict(tfidf_docs)

((sms.spam - sms.lda_spamminess) ** 2).sum() ** 0.5



0.0

In [89]:
(sms.spam == sms.lda_spamminess).sum()

4837

In [None]:
# 5-fold cross-validated accuracy of LDA on the TF-IDF vectors,
# reported as the mean plus/minus two standard deviations across folds.
lda = LDA(n_components=1)
scores = cross_val_score(lda, tfidf_docs, sms.spam, cv=5)
f'Accuracy: {scores.mean():.2f} (+/-{scores.std() * 2:.2f})'



In [None]:
# Repeatable split of the TF-IDF vectors with 33% of the messages held out for testing.
X_train, X_test, y_train, y_test = train_test_split(tfidf_docs, 
                                                    sms.spam,
                                                    test_size=0.33,
                                                    random_state=271828)

In [None]:
# Train LDA on the training split and report held-out accuracy.
lda = LDA(n_components=1)
lda.fit(X_train, y_train)
# score() may return a plain Python float, which has no .round() method; use the
# built-in round() on an explicit float, as the other scoring cells in this notebook do.
round(float(lda.score(X_test, y_test)), 3)

In [None]:
# 10-fold cross-validated accuracy of LDA on the 16-D PCA topic vectors,
# reported as the mean plus/minus two standard deviations across folds.
lda = LDA(n_components=1)
# The PCA frame is named `pca_topic_vectors`; the misspelled `pca_topicvectors` raised a NameError.
scores = cross_val_score(lda, pca_topic_vectors, sms.spam, cv=10)
f'Accuracy: {scores.mean():.3f} (+/-{scores.std() * 2:.3f})'