In [2]:
import numpy as np
import spacy

nlp = spacy.load('en_core_web_lg')

In [3]:
text = "These vectors can be used as features for machine learning models."

with nlp.disable_pipes():
    vectors = np.array([token.vector for token in nlp(text)])

In [5]:
vectors.shape

(12, 300)

In [9]:
import pandas as pd

spam = pd.read_csv('./data/spam.csv')

with nlp.disable_pipes():
    doc_vectors = np.array([nlp(text).vector for text in spam.text])
    
doc_vectors.shape

(5572, 300)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(doc_vectors, spam.label, test_size = 0.1, random_state = 1)

In [13]:
from sklearn.svm import LinearSVC

svc = LinearSVC(random_state = 1, dual = False, max_iter = 10000)
svc.fit(X_train, y_train)
print(svc.score(X_valid, y_valid))

0.9767025089605734


In [14]:
def cosine_similarity(a, b):
    return a.dot(b) / np.sqrt(a.dot(a) * b.dot(b))

In [15]:
a = nlp("REPLY NOW FOR FREE TEA").vector
b = nlp("According to legend, Emperor Shen Nung discovered tea when leaves from a wild tree blew"
        "into his pot of boiling water.").vector

cosine_similarity(a, b)

0.7104742

In [16]:
import matplotlib.pyplot as plt

In [17]:
review_data = pd.read_csv('./data/yelp_ratings.csv')
review_data.head()

Unnamed: 0,text,stars,sentiment
0,Total bill for this horrible service? Over $8G...,1.0,0
1,I *adore* Travis at the Hard Rock's new Kelly ...,5.0,1
2,I have to say that this office really has it t...,5.0,1
3,Went in for a lunch. Steak sandwich was delici...,5.0,1
4,Today was my second out of three sessions I ha...,1.0,0


In [19]:
reviews = review_data[:100]

with nlp.disable_pipes():
    vectors = np.array([
        nlp(review.text).vector for idx, review in reviews.iterrows()
    ])
    
vectors.shape

(100, 300)

In [20]:
# a bit too large to store on git
vectors = np.load('./data/review_vectors.npy')

In [21]:
X_train, X_valid, y_train, y_valid = train_test_split(vectors, review_data.sentiment, test_size = 0.1, random_state = 1)

model = LinearSVC(random_state = 1, dual = False)
model.fit(X_train, y_train)

LinearSVC(dual=False, random_state=1)

In [22]:
model.score(X_valid, y_valid)

0.9384684482371435

In [23]:
review = """I absolutely love this place. The 360 degree glass windows with the 
Yerba buena garden view, tea pots all around and the smell of fresh tea everywhere 
transports you to what feels like a different zen zone within the city. I know 
the price is slightly more compared to the normal American size, however the food 
is very wholesome, the tea selection is incredible and I know service can be hit 
or miss often but it was on point during our most recent visit. Definitely recommend!

I would especially recommend the butternut squash gyoza."""


review_vec = nlp(review).vector

In [27]:
### centering

reviews_mean = vectors.mean(axis = 0)

centered = vectors - reviews_mean
review_centered = review_vec - reviews_mean

similarities = np.array(
    [cosine_similarity(review_centered, vec) for vec in centered]
)

most_similar = similarities.argmax()

In [28]:
review_data.iloc[most_similar].text

"After purchasing my final christmas gifts at the Urban Tea Merchant in Vancouver, I was surprised to hear about Teopia at the new outdoor mall at Don Mills and Lawrence when I went back home to Toronto for Christmas.\nAcross from the outdoor skating rink and perfect to sit by the ledge to people watch, the location was prime for tea connesieurs... or people who are just freezing cold in need of a drinK!\nLike any gourmet tea shop, there were large tins of tea leaves on the walls, and although the tea menu seemed interesting enough, you can get any specialty tea as your drink. We didn't know what to get... so the lady suggested the Goji Berries... it smelled so succulent and juicy... instantly SOLD! I got it into a tea latte and watched the tea steep while the milk was steamed, and surprisingly, with the click of a button, all the water from the tea can be instantly drained into the cup (see photo).. very fascinating!\n\nThe tea was aromatic and tasty, not over powering. The price was 