In [11]:
import numpy as np
import pandas as pd
import spacy

# Load spaCy's large English model. The large model is used because the rest
# of this notebook relies on its word vectors (token.vector / doc.vector).
nlp = spacy.load('en_core_web_lg')

In [32]:
# Word-level vectors: one embedding per token of a short example sentence.
text = "These vectors are great"
doc = nlp(text)

# The doc is already fully processed above (its lemmas are printed below), so
# the token vectors are simply read off it; there is no pipeline work left to
# switch off at this point.
vectors = np.array([token.vector for token in doc])

# Expect (number of tokens, embedding width).
print(vectors.shape)
for token in doc:
    print(f"{token.text} \t\t\t {token.lemma_} \t\t\t {token.is_stop}")

(4, 300)
These 			 these 			 True
vectors 			 vector 			 False
are 			 be 			 True
great 			 great 			 False


In [40]:
# Document-level vectors: one vector per text, taken from Doc.vector.
# Toy inputs and matching toy labels (the labels are not used in this cell).
review_comments_file = ["first sentence", "second sentence"]
review_sentiment = [0, 1]

# Doc.vector only needs tokenisation plus the model's word vectors, so every
# pipeline component is switched off while vectorising. The component names
# have to be passed explicitly: disable_pipes() called with no names disables
# nothing. nlp.pipe() processes the texts as a stream instead of one call each.
with nlp.disable_pipes(*nlp.pipe_names):
    v2 = np.array([doc.vector for doc in nlp.pipe(review_comments_file)])

In [13]:
# Load the Yelp reviews from the CSV file and preview the first rows
data = pd.read_csv('yelp_ratings.csv')
data.head()

Unnamed: 0,text,stars,sentiment
0,Total bill for this horrible service? Over $8G...,1.0,0
1,I *adore* Travis at the Hard Rock's new Kelly ...,5.0,1
2,I have to say that this office really has it t...,5.0,1
3,Went in for a lunch. Steak sandwich was delici...,5.0,1
4,Today was my second out of three sessions I ha...,1.0,0


In [50]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

# Number of reviews to vectorise; kept small so the cell runs quickly.
N_REVIEWS = 500

review_verbatim = data.text.iloc[:N_REVIEWS]
review_sentiment = data.sentiment.iloc[:N_REVIEWS]

# Doc.vector only needs tokenisation plus the model's word vectors, so every
# pipeline component is switched off while vectorising. The component names
# have to be passed explicitly: disable_pipes() called with no names disables
# nothing. nlp.pipe() processes the reviews as a stream instead of one call each.
with nlp.disable_pipes(*nlp.pipe_names):
    review_vectors = np.array([doc.vector for doc in nlp.pipe(review_verbatim)])

# Expect (number of reviews, embedding width).
print(review_vectors.shape)

# Hold out 10% of the reviews for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    review_vectors, review_sentiment, test_size=0.1, random_state=1
)

# Create and fit the LinearSVC model
model = LinearSVC(random_state=1, dual=False)
model.fit(X_train, y_train)

# Report accuracy on the held-out reviews
print(f'Model test accuracy: {model.score(X_test, y_test)*100:.3f}%')


(500, 300)
Model test accuracy: 86.000%


In [51]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier fitted on the X_train / y_train split
second_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Accuracy on the held-out split, reported as a percentage
knn_accuracy = second_model.score(X_test, y_test)
print(f'Model test accuracy: {knn_accuracy*100:.3f}%')

Model test accuracy: 82.000%


In [52]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so re-running the cell reproduces the same metrics; the
# LinearSVC model and the train/test split in this notebook also use
# random_state=1.
mDTC = DecisionTreeClassifier(random_state=1)
mDTC.fit(X_train, y_train)
y_pred = mDTC.predict(X_test)

# Confusion matrix (rows: true label, columns: predicted label, per the
# scikit-learn convention), followed by per-class precision/recall/F1.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 8  7]
 [ 7 28]]
              precision    recall  f1-score   support

           0       0.53      0.53      0.53        15
           1       0.80      0.80      0.80        35

    accuracy                           0.72        50
   macro avg       0.67      0.67      0.67        50
weighted avg       0.72      0.72      0.72        50



## Similarity