In [3]:
import spacy
import numpy as np
import pandas as pd

In [4]:
# Load the large English model: per the original note, the large model is
# the one needed to get the word vectors that the rest of the notebook
# reads through `.vector`.
nlp = spacy.load('en_core_web_lg')

In [10]:
text = "These vectors can be used as features for machine learning models."

# Only the tokenizer and the model's word vectors are needed to read
# `token.vector`, so every pipeline component is switched off for speed.
# `disable_pipes()` called without names disables nothing, which is why the
# component names are passed explicitly.
with nlp.disable_pipes(*nlp.pipe_names):
    # One row per token, one column per vector dimension.
    vectors = np.array([token.vector for token in nlp(text)])

vectors.shape


(12, 300)

In [11]:
# Read the dataset from a path relative to the working directory; later
# cells use its `text` and `label` columns.
spam = pd.read_csv('spam.csv')
# spam.head()


In [12]:

# Build one document vector per message. Only tokenization and the model's
# word vectors are needed for `Doc.vector`, so all pipeline components are
# disabled while the messages are embedded. `disable_pipes()` with no names
# would disable nothing, so the component names are passed explicitly.
with nlp.disable_pipes(*nlp.pipe_names):
    doc_vectors = np.array([nlp(message).vector for message in spam.text])

# Rows are messages, columns are vector dimensions.
doc_vectors.shape

(5572, 300)

In [14]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 10% of the document vectors (with their labels) as a test set.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(doc_vectors, spam.label,
                                                    test_size=0.1, random_state=1)

In [20]:
from sklearn.svm import LinearSVC
# Train a linear support-vector classifier on the training vectors. `fit`
# returns the estimator itself, so construction and training are chained.
svc = LinearSVC(dual=False, max_iter=10000, random_state=1).fit(X_train, y_train)

# Report accuracy on the held-out split as a percentage.
print(f"Accuracy: {svc.score(X_test, y_test) * 100:.3f}%")

Accuracy: 97.312%


# Document similarity

Documents with similar content generally have similar vectors, so you can find similar documents by measuring the similarity between their vectors. A common metric for this is the cosine similarity, which measures the cosine of the angle between two vectors, $\mathbf{a}$ and $\mathbf{b}$:

$$\cos\theta = \frac{\mathbf{a}\cdot\mathbf{b}}{\|\mathbf{a}\|\,\|\mathbf{b}\|}$$
 
This is the dot product of $\mathbf{a}$ and $\mathbf{b}$, divided by the product of the magnitudes of the two vectors. The cosine similarity ranges from -1 to 1, corresponding to complete opposites and perfect similarity, respectively. To calculate it, you can use the metric from scikit-learn or write your own function:

In [21]:
# Two texts that both mention tea: a short spam-style message and a sentence
# about the legend of tea's discovery. Each is reduced to its document vector.
a = nlp("REPLY NOW FOR FREE TEA").vector
b = nlp("According to legend, Emperor Shen Nung discovered tea when leaves from a wild tree blew into his pot of boiling water.").vector


def cosine_similarity(a, b):
    """Return the cosine of the angle between two 1-D vectors.

    Computed as ``a . b / sqrt((a . a) * (b . b))``.

    Parameters
    ----------
    a, b : numpy.ndarray
        1-D arrays of equal length.

    Returns
    -------
    numpy floating scalar
        A value in [-1, 1]. If either vector has zero magnitude the angle
        is undefined; 0 is returned in that case instead of the ``nan``
        (plus RuntimeWarning) that a 0/0 division would produce.
    """
    dot_product = a.dot(b)
    norm_product = np.sqrt(a.dot(a) * b.dot(b))
    if norm_product == 0:
        # At least one input is all zeros (e.g. a text with no word vectors):
        # return a zero of the same scalar type rather than dividing by zero.
        return type(norm_product)(0)
    return dot_product / norm_product

# Similarity between the two document vectors `a` and `b`.
cosine_similarity(a, b)

0.7030031