In [157]:
import numpy as np
import pandas as pd
import spacy

# Need to load the large model to get the vectors
nlp = spacy.load('en_core_web_lg')

## Pre-processing - create word vectors using the spacy large vector model

In [158]:
#Word level vectors
text = "These vectors are great"
doc = nlp(text)

with nlp.disable_pipes():
    vectors = np.array([token.vector for token in doc])

print(vectors.shape)
for token in doc:
    print(f"{token.text} \t\t\t {token.lemma_} \t\t\t {token.is_stop}")

(4, 300)
These 			 these 			 True
vectors 			 vector 			 False
are 			 be 			 True
great 			 great 			 False


In [159]:
#doc level vectors
review_comments_file = ["first sentence", "second sentence"]
review_sentiment = [0,1]

with nlp.disable_pipes():
    v2 = np.array([nlp(row).vector for row in review_comments_file])
    #print(np.array([nlp(row).vector for row in file]))
#print(v2)

## Load the yelp ratings 

In [160]:
# Load in the data from JSON file
data = pd.read_csv('yelp_ratings.csv')
data.head()

Unnamed: 0,text,stars,sentiment
0,Total bill for this horrible service? Over $8G...,1.0,0
1,I *adore* Travis at the Hard Rock's new Kelly ...,5.0,1
2,I have to say that this office really has it t...,5.0,1
3,Went in for a lunch. Steak sandwich was delici...,5.0,1
4,Today was my second out of three sessions I ha...,1.0,0


In [161]:
#Test the vector creation from actual data

review_verbatim = data.text[:5]
review_sentiment = data.sentiment[:5]

#Get the word vectors for the doc into an array
with nlp.disable_pipes():
    review_vectors = np.array([nlp(row).vector for row in review_verbatim])

print(review_vectors.shape)



(5, 300)


In [162]:
from sklearn.model_selection import train_test_split
import time

#Since creating vectors on a dataset of 44k rows will take time, this task was done offline and stored in a file
# Loading all document vectors from file into an array
review_vectors = np.load('review_vectors.npy')

#Use the word vectors to create train/test data
X_train, X_test, y_train, y_test = train_test_split(review_vectors, data.sentiment, test_size=0.1, random_state=1)

print("training / test data created")

training / test data created


## Train a few different models

In [163]:
from sklearn.svm import LinearSVC

start_time = time.time()
# Create the LinearSVC model
mSVC = LinearSVC(random_state=1, dual=False)
# Fit the model on training data
mSVC.fit(X_train, y_train)

# Uncomment and run to see model accuracy
print(f'Model test accuracy: {mSVC.score(X_test, y_test)*100:.3f}%')

print("--- %s seconds ---" % (time.time() - start_time))


Model test accuracy: 93.847%
--- 2.8593268394470215 seconds ---


In [164]:
from sklearn.neighbors import KNeighborsClassifier

start_time = time.time()

mKNN = KNeighborsClassifier(n_neighbors=5)
mKNN.fit(X_train, y_train)
print(f'Model test accuracy: {mKNN.score(X_test, y_test)*100:.3f}%')

print("--- %s seconds ---" % (time.time() - start_time))


Model test accuracy: 86.998%
--- 78.88813304901123 seconds ---


In [165]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

start_time = time.time()

mDTC = DecisionTreeClassifier()
mDTC.fit(X_train, y_train)
y_pred = mDTC.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(metrics.accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

print("--- %s seconds ---" % (time.time() - start_time))


0.8120368291039749
[[ 718  408]
 [ 429 2898]]
              precision    recall  f1-score   support

           0       0.63      0.64      0.63      1126
           1       0.88      0.87      0.87      3327

    accuracy                           0.81      4453
   macro avg       0.75      0.75      0.75      4453
weighted avg       0.81      0.81      0.81      4453

--- 22.56112003326416 seconds ---


In [166]:
good_review = """I like the food, and i Like the people.."""

bad_review = """this is not a good restaurant. No good food, location was just ok."""

#Create word vectors for the reviews and reshape since there is only 1 sample
my_good_review = (nlp(good_review).vector).reshape(1,-1)
my_good_label = [1]

my_bad_review = (nlp(bad_review).vector).reshape(1,-1)
my_bad_label = [0]

print(f'SVC model accuracy for good: {mSVC.score(my_good_review, my_good_label)*100}%')
print(f'SVC model accuracy for bad: {mSVC.score(my_bad_review, my_bad_label)*100}%')

print(f'KNN Model test accuracy for good: {mKNN.score(my_good_review, my_good_label)*100}%')
print(f'KNN Model test accuracy for bad: {mKNN.score(my_bad_review, my_bad_label)*100:.1f}%')

print(f"DTC model accuracy for good: {metrics.accuracy_score(my_good_label, mDTC.predict(my_good_review))*100:.1f}%")
print(f"DTC model accuracy for bad: {metrics.accuracy_score(my_bad_label, mDTC.predict(my_bad_review))*100:.1f}%")



SVC model accuracy for good: 100.0%
SVC model accuracy for bad: 100.0%
KNN Model test accuracy for good: 0.0%
KNN Model test accuracy for bad: 0.0%
DTC model accuracy for good: 0.0%
DTC model accuracy for bad: 0.0%


In [167]:
def cosine_similarity(a, b):
    return np.dot(a, b)/np.sqrt(a.dot(a)*b.dot(b))

#Lets test similarity first
a = nlp("I absolutely love this place. The 360 degree glass windows with the Yerba buena garden view, tea pots all around and").vector
b = nlp("The space is a dark hole").vector
print(f"similarity between a & b is {cosine_similarity(a, b)*100:.1f}%")


similarity between a & b is 77.1%


## Similarity exercise (not model related)

In [168]:
#Now lets find similarity of 1 review to other reviews in file. 
# This exercise does not rely on the trained models above, just on the vectors defined above.
target_review = """I absolutely love this place. The 360 degree glass windows with the 
Yerba buena garden view, tea pots all around and the smell of fresh tea everywhere 
transports you to what feels like a different zen zone within the city. I know 
the price is slightly more compared to the normal American size, however the food 
is very wholesome, the tea selection is incredible and I know service can be hit 
or miss often but it was on point during our most recent visit. Definitely recommend!
I would especially recommend the butternut squash gyoza."""

#get the word vector for the target review
review_vec = nlp(target_review).vector

## Center the document vectors
#To calculate similarity of this review ot other reviews, we need to make sure we are comparing to reviews in this file
#In order to make sure we are comparing reviews to other reviews within the file, 
# we "center" the reviews, by finding the mean of all reviews in the file
# Calculate the mean for the document vectors, should have shape (300,)
vec_mean = review_vectors.mean(axis=0)
# Subtract the mean from the vectors
centered = review_vectors - vec_mean
print(centered.shape)


# Calculate similarities for each document in the dataset
# Make sure to subtract the mean from the review vector
sims = np.array([cosine_similarity(review_vec-vec_mean, v) for v in centered])

# Get the index for the most similar document
most_similar = sims.argmax()

#Lets see the text for most similar review and check for ourselves
print(f"Most similar review == {sims[sims.argmax()]*100:.3f}% -- {data.iloc[most_similar].text}")

#print(sims)



(44530, 300)
Most similar review == 59.152% -- After purchasing my final christmas gifts at the Urban Tea Merchant in Vancouver, I was surprised to hear about Teopia at the new outdoor mall at Don Mills and Lawrence when I went back home to Toronto for Christmas.
Across from the outdoor skating rink and perfect to sit by the ledge to people watch, the location was prime for tea connesieurs... or people who are just freezing cold in need of a drinK!
Like any gourmet tea shop, there were large tins of tea leaves on the walls, and although the tea menu seemed interesting enough, you can get any specialty tea as your drink. We didn't know what to get... so the lady suggested the Goji Berries... it smelled so succulent and juicy... instantly SOLD! I got it into a tea latte and watched the tea steep while the milk was steamed, and surprisingly, with the click of a button, all the water from the tea can be instantly drained into the cup (see photo).. very fascinating!

The tea was aromatic an

In [169]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import re
import nltk

from gensim.models import word2vec

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
    
tsne_plot(model)

AttributeError: 'LinearSVC' object has no attribute 'wv'