In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import numpy as np

In [2]:
# read in, then preview the data

df = pd.read_csv("universal_studios_reviews.csv")
df.head()

Unnamed: 0,reviewer,rating,written_date,title,review_text,branch
0,Kelly B,2.0,"May 30, 2021",Universal is a complete Disaster - stick with ...,We went to Universal over Memorial Day weekend...,Universal Studios Florida
1,Jon,1.0,"May 30, 2021",Food is hard to get.,The food service is horrible. I’m not reviewin...,Universal Studios Florida
2,Nerdy P,2.0,"May 30, 2021",Disappointed,I booked this vacation mainly to ride Hagrid m...,Universal Studios Florida
3,ran101278,4.0,"May 29, 2021",My opinion,When a person tries the test seat for the ride...,Universal Studios Florida
4,tammies20132015,5.0,"May 28, 2021",The Bourne Stuntacular...MUST SEE,"Ok, I can't stress enough to anyone and everyo...",Universal Studios Florida


In [3]:
# copy the df for manipulation

revs = df.copy()

# assign reviews with rating > 3 positive sentiment (1), else negative sentiment (-1)

revs['sentiment'] = revs['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [4]:
# remove punctuation from review text

def remove_punct(text):
    txt = "".join(char for char in text if char not in ("?", ".", ";", ":",  "!",'"'))
    return text

revs['review_text'] = revs['review_text'].apply(remove_punct)

In [5]:
# split dataset into training and test sets

feature = revs[["review_text"]]
label = revs[["sentiment"]]

Xtrain, Xtest, ytrain, ytest = train_test_split(feature, label, test_size=0.2, random_state=3)

In [7]:
# vectorize review text

vect = CountVectorizer(token_pattern=r'\b\w+\b', ngram_range = (1,2))

train_set = vect.fit_transform(Xtrain['review_text'])
test_set = vect.transform(Xtest['review_text'])

In [10]:
# create svm classifier

svm = SGDClassifier(random_state=3, max_iter=200, class_weight = "balanced").fit(train_set, ytrain)

# make predictions on test data, then generate classification report

ypred_test = svm.predict(test_set)
svm_cr = classification_report(ypred_test, ytest)
print(svm_cr)

  return f(*args, **kwargs)


              precision    recall  f1-score   support

          -1       0.67      0.73      0.70      1638
           1       0.95      0.93      0.94      8543

    accuracy                           0.90     10181
   macro avg       0.81      0.83      0.82     10181
weighted avg       0.90      0.90      0.90     10181



In [11]:
# create logistic regression classifier

lr = LogisticRegression(random_state = 3, max_iter=200, solver='liblinear', class_weight = "balanced").fit(train_set, ytrain)

# make predictions on test data, then generate classification report

ypred_test = lr.predict(test_set)
lr_cr = classification_report(ypred_test, ytest)
print(lr_cr)

  return f(*args, **kwargs)


              precision    recall  f1-score   support

          -1       0.72      0.72      0.72      1805
           1       0.94      0.94      0.94      8376

    accuracy                           0.90     10181
   macro avg       0.83      0.83      0.83     10181
weighted avg       0.90      0.90      0.90     10181

