In [6]:
# ICS 438 Final Project:
# Jerome Gallego, Taylor Wong, Ujjwal Gautam
# December 13, 2022
#
# For this project we have decided to use a dataset made from yelp reviews. 
# Using the operation known as Sentiment Analysis, we can provide a detailed investigation on whether or not the star ratings
# -can reflect how positive, negative, or neutral each review is.

# In the yelp dataset, we will only be analyzing the star ratings and the reviews itself. Everything else can be considered irrelevant

#For this notebook please install these packages to ensure that the file is running correctly
#%pip install -U gensim
%pip install nltk



# Import whatever libraries you would want to use
# Clean up cells to put all imports to the top

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m770.5/770.5 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.7 regex-2022.10.31 tqdm-4.64.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
# import matplotlib.pyplot as plt
import numpy as np
# import gensim
# from gensim.parsing.preprocessing import remove_stopwords
# import re

In [8]:
## Before we can do any kind of analysis we will need to load the dataset.
# As instructed, we understand that loading a file into ram can seem inefficient,
# to overcome this obstacle we have decided to process the data using the batching method that we have learned from Mahdi.

## Process yelp.csv with chunk size of 50 and append it to the dataframe


df = pd.DataFrame()
with open('./data/yelp.csv', "r+") as csv_file:
    tp = pd.read_csv(csv_file, iterator=True, chunksize=50)
    df = pd.concat(tp, ignore_index=True) 
# df.shape
# df.head()
# df['type'].describe
# df.info


In [9]:
columns = df.columns.values.tolist()
print(columns)
df['text'][0]

['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id', 'cool', 'useful', 'funny']


'My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\n\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\n\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.\n\nAnyway, I can\'t wait to go back!'

In [10]:
## To ensure we have removed all the irrelevant columns, we used the function drop() which will tell
## the dataframe to only include the stars and the text review. 
df.drop(labels=["business_id", "date", "type", "review_id", "user_id", "cool", "useful", "funny"], axis=1, inplace=True)
df

Unnamed: 0,stars,text
0,5,My wife took me here on my birthday for breakf...
1,5,I have no idea why some people give bad review...
2,4,love the gyro plate. Rice is so good and I als...
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
4,5,General Manager Scott Petello is a good egg!!!...
...,...,...
9995,3,First visit...Had lunch here today - used my G...
9996,4,Should be called house of deliciousness!\n\nI ...
9997,4,I recently visited Olive and Ivy for business ...
9998,2,My nephew just moved to Scottsdale recently so...


In [11]:
## Cleaning the data
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
import string
def clean_data(data):
    x = data.lower()
    x = x.replace('\n','')
    table = str.maketrans(dict.fromkeys(string.punctuation)) 
    x = x.translate(table)
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    return x

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
##apply clean_data function to the df["text"] column to remove punctuations, new lines, and stop words
df["text"] = df["text"].apply(clean_data)
df

Unnamed: 0,stars,text
0,5,wife took birthday breakfast excellent weathe...
1,5,idea people give bad reviews place goes show p...
2,4,love gyro plate rice good also dig candy selec...
3,5,rosie dakota love chaparral dog park convenien...
4,5,general manager scott petello good egg go deta...
...,...,...
9995,3,first visithad lunch today used groupon orde...
9996,4,called house deliciousnessi could go item item...
9997,4,recently visited olive ivy business last week ...
9998,2,nephew moved scottsdale recently bunch friends...


In [13]:
# As you can see there are 10000 entries in the file. By using Pandas Dataframe, it is good to know that we will not be able to
# -completely store all the data in RAM. With that being said to make it seem more realistic, we will cut down the dataframe
# -to have only 1000 randomly selected reviews. 
# Again, in some situations that can be too much for a machine to handle. To prevent any crashes or errors, we will be
# -batching the reviews in a set of 50 at a time.
# Thanks to the help of Assignment 2, we understand how to batch a set.

In [14]:
# Using sample() will randomly select 1000 reviews for analysis
df = df.sample(n=1000)
df

Unnamed: 0,stars,text
5497,4,new favorite mexican place tempe love tacos me...
7768,5,saw small door sign intrigued went inside tha...
8733,3,zoes gets solid 3 stars bc raddest sinfully de...
1026,5,passed velvet rope flanked shiny exotic cars k...
4099,4,decided come back give place another try lot...
...,...,...
8384,1,addition sucking also expensive
3835,5,ive brought dogs met dr holmes dr edwards extr...
2778,3,went morning breakfast bf use living social de...
6411,3,meh met friends sat patio saturday night arou...


In [15]:
# After we have cleaned the data to remove any sort of stop words and characters, we can start to implement the Sentiment Analysis.
# The main goal for this is produce a score from 0 to 1 whether it is categorized as Positive, Negative, or Neutral.

### Vader lexicon
Vader lexicon is a rule-based sentiment analysis tool specifically made for social media sentiment. For a more in-depth documentation please refer to their GitHub: https://github.com/cjhutto/vaderSentiment

### Why we chose vader lexicon
We decided to go with vader lexicon since the way vader lexicon was designed is meant for review analysis. On their GitHub page, examples of the training data used include 'The service here is extremely good', and 'The service here is good'. 

In [16]:
nltk.download('vader_lexicon') #WE NEED TO EXPLAIN WHAT THIS IS
from nltk.sentiment.vader import SentimentIntensityAnalyzer #WE NEED TO EXPLAIN WHAT THIS IS ALSO

for index, row in df['text'].iteritems():
    result = SentimentIntensityAnalyzer().polarity_scores(row)
    # if index%50 == 0:
    if result['neg'] > result['pos']:
        df.loc[index, "Sentiment"] = "negative"
    elif result['pos'] > result['neg']:
        df.loc[index, "Sentiment"] = "positive"
    else:
        df.loc[index, "Sentiment"] = "neutral"
        
    df.loc[index, 'neg'] = result['neg']
    df.loc[index, 'neu'] = result['neu']
    df.loc[index, 'pos'] = result['pos']
    df.loc[index, 'compound'] = result['compound']
df

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jovyan/nltk_data...
  for index, row in df['text'].iteritems():


Unnamed: 0,stars,text,Sentiment,neg,neu,pos,compound
5497,4,new favorite mexican place tempe love tacos me...,positive,0.000,0.567,0.433,0.9842
7768,5,saw small door sign intrigued went inside tha...,positive,0.049,0.711,0.240,0.9723
8733,3,zoes gets solid 3 stars bc raddest sinfully de...,positive,0.054,0.633,0.313,0.9702
1026,5,passed velvet rope flanked shiny exotic cars k...,positive,0.028,0.728,0.244,0.9958
4099,4,decided come back give place another try lot...,positive,0.000,0.614,0.386,0.9592
...,...,...,...,...,...,...,...
8384,1,addition sucking also expensive,neutral,0.000,1.000,0.000,0.0000
3835,5,ive brought dogs met dr holmes dr edwards extr...,positive,0.090,0.750,0.160,0.7620
2778,3,went morning breakfast bf use living social de...,positive,0.055,0.635,0.310,0.9935
6411,3,meh met friends sat patio saturday night arou...,positive,0.014,0.714,0.272,0.9661


In [28]:
low_rate = df[(df['stars'] == 1) | (df['stars'] == 2)]
low_rate

Unnamed: 0,stars,text,Sentiment,neg,neu,pos,compound
4165,1,omg rave place disgusting dirty food worth cam...,negative,0.265,0.550,0.185,-0.6551
6033,2,pretty excited burger joint opening since pres...,positive,0.000,0.491,0.509,0.9747
2594,1,rating based fact vegetarian charge 8 small me...,positive,0.021,0.795,0.184,0.8977
9435,1,get oil changed walmart walmart got oil change...,negative,0.102,0.800,0.098,-0.0516
7489,2,clearly musicians make night night hot half...,positive,0.083,0.665,0.252,0.9807
...,...,...,...,...,...,...,...
9428,2,mehmaybe decided dine 10 minutes prior closing...,positive,0.184,0.628,0.188,0.5430
3082,1,currently liquidation sale really worth trip ...,positive,0.108,0.691,0.201,0.4742
6322,2,ever lean cuisine chicken enchiladas ever supe...,negative,0.150,0.749,0.102,-0.2716
4766,1,bad food,negative,0.778,0.222,0.000,-0.5423


In [29]:
high_rate = df[(df['stars'] == 4) | (df['stars'] == 5)]
high_rate

Unnamed: 0,stars,text,Sentiment,neg,neu,pos,compound
5497,4,new favorite mexican place tempe love tacos me...,positive,0.000,0.567,0.433,0.9842
7768,5,saw small door sign intrigued went inside tha...,positive,0.049,0.711,0.240,0.9723
1026,5,passed velvet rope flanked shiny exotic cars k...,positive,0.028,0.728,0.244,0.9958
4099,4,decided come back give place another try lot...,positive,0.000,0.614,0.386,0.9592
7253,5,progressive inclusive welcoming diverse intell...,positive,0.051,0.676,0.273,0.9260
...,...,...,...,...,...,...,...
2464,4,cannot speak food though menu looked unique ta...,positive,0.009,0.720,0.271,0.9961
6220,4,burgers fries pretty good custard sundaes conc...,positive,0.000,0.427,0.573,0.9442
7923,4,delicious im frosting sprinkles delivers big ...,positive,0.000,0.671,0.329,0.9798
3835,5,ive brought dogs met dr holmes dr edwards extr...,positive,0.090,0.750,0.160,0.7620


In [31]:
neutral = df[(df['stars'] == 3)]
neutral

Unnamed: 0,stars,text,Sentiment,neg,neu,pos,compound
8733,3,zoes gets solid 3 stars bc raddest sinfully de...,positive,0.054,0.633,0.313,0.9702
7671,3,tried place reading reviews rice noodle soup ...,positive,0.058,0.833,0.109,0.5586
7972,3,noticed sign shakey jakes even opened waited p...,positive,0.115,0.671,0.214,0.9913
2474,3,fast drive thru,neutral,0.000,1.000,0.000,0.0000
3205,3,really come fair concert want see fun walk aro...,positive,0.000,0.645,0.355,0.9576
...,...,...,...,...,...,...,...
7867,3,eyebrows touchy subject im literally convinced...,positive,0.076,0.605,0.319,0.9710
3466,3,woke late saturday craving indian food relief ...,positive,0.030,0.727,0.243,0.9579
4855,3,pricey fave place shop vintage wares decent peak,positive,0.000,0.707,0.293,0.4404
2778,3,went morning breakfast bf use living social de...,positive,0.055,0.635,0.310,0.9935
