In [1]:
# ICS 438 Final Project:
# Jerome Gallego, Taylor Wong, Ujjwal Gautam
# December 13, 2022
#
# For this project we have decided to use a dataset made from yelp reviews. 
# Using the operation known as Sentiment Analysis, we can provide a detailed investigation on whether or not the star ratings
# -can reflect how positive, negative, or neutral each review is.

# In the yelp dataset, we will only be analyzing the star ratings and the reviews itself. Everything else can be considered irrelevant

#For this notebook please install these packages to ensure that the file is running correctly
%pip install -U gensim
%pip install nltk



# Import whatever libraries you would want to use
# Clean up cells to put all imports to the top

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
# import matplotlib.pyplot as plt
import numpy as np
# import gensim
# from gensim.parsing.preprocessing import remove_stopwords
# import re

In [3]:
## Before we can do any kind of analysis we will need to load the dataset.
# As instructed, we understand that loading a file into ram can seem inefficient,
# to overcome this obstacle we have decided to process the data using the batching method that we have learned from Mahdi.

## Process yelp.csv with chunk size of 50 and append it to the dataframe


df = pd.DataFrame()
with open('./data/yelp.csv', "r+") as csv_file:
    tp = pd.read_csv(csv_file, iterator=True, chunksize=50)
    df = pd.concat(tp, ignore_index=True) 
# df.shape
# df.head()
# df['type'].describe
# df.info


In [4]:
columns = df.columns.values.tolist()
print(columns)
df['text'][0]

['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id', 'cool', 'useful', 'funny']


'My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\n\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\n\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.\n\nAnyway, I can\'t wait to go back!'

In [5]:
## To ensure we have removed all the irrelevant columns, we used the function drop() which will tell
## the dataframe to only include the stars and the text review. 
df.drop(labels=["business_id", "date", "type", "review_id", "user_id", "cool", "useful", "funny"], axis=1, inplace=True)
df

Unnamed: 0,stars,text
0,5,My wife took me here on my birthday for breakf...
1,5,I have no idea why some people give bad review...
2,4,love the gyro plate. Rice is so good and I als...
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
4,5,General Manager Scott Petello is a good egg!!!...
...,...,...
9995,3,First visit...Had lunch here today - used my G...
9996,4,Should be called house of deliciousness!\n\nI ...
9997,4,I recently visited Olive and Ivy for business ...
9998,2,My nephew just moved to Scottsdale recently so...


In [6]:
## Cleaning the data
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
import string
def clean_data(data):
    x = data.lower()
    x = x.replace('\n','')
    table = str.maketrans(dict.fromkeys(string.punctuation)) 
    x = x.translate(table)
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    return x

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
##apply clean_data function to the df["text"] column to remove punctuations, new lines, and stop words
df["text"] = df["text"].apply(clean_data)
df

Unnamed: 0,stars,text
0,5,wife took birthday breakfast excellent weathe...
1,5,idea people give bad reviews place goes show p...
2,4,love gyro plate rice good also dig candy selec...
3,5,rosie dakota love chaparral dog park convenien...
4,5,general manager scott petello good egg go deta...
...,...,...
9995,3,first visithad lunch today used groupon orde...
9996,4,called house deliciousnessi could go item item...
9997,4,recently visited olive ivy business last week ...
9998,2,nephew moved scottsdale recently bunch friends...


In [8]:
# As you can see there are 10000 entries in the file. By using Pandas Dataframe, it is good to know that we will not be able to
# -completely store all the data in RAM. With that being said to make it seem more realistic, we will cut down the dataframe
# -to have only 1000 randomly selected reviews. 
# Again, in some situations that can be too much for a machine to handle. To prevent any crashes or errors, we will be
# -batching the reviews in a set of 50 at a time.
# Thanks to the help of Assignment 2, we understand how to batch a set.

In [9]:
# Using sample() will randomly select 1000 reviews for analysis
df = df.sample(n=1000)
df

Unnamed: 0,stars,text
6967,4,simply put grind provided one top five burgers...
3152,5,disgruntled reviews read sapporo things say1 ...
834,2,uhhhthese supposed carne fries asu boys think ...
9217,5,eat 3 times week antipasto salad bread addicti...
6094,3,love eat however hit miss sometimes get good s...
...,...,...
4809,3,great see total wine closer live large select...
5260,3,want really filling delicious pleasemakemyhead...
2439,5,going nails 101 long time love manicure done m...
6673,4,hubby go almost ever friday happy hour love ch...


In [10]:
# After we have cleaned the data to remove any sort of stop words and characters, we can start to implement the Sentiment Analysis.
# The main goal for this is produce a score from 0 to 1 whether it is categorized as Positive, Negative, or Neutral.

### Vader lexicon
Vader lexicon is a rule-based sentiment analysis tool specifically made for social media sentiment. 

### Why we chose vader lexicon
We decided to go with vader lexicon since the way vader lexicon was designed is meant for review analysis. On their GitHub page, examples of the training data used include 'The service here is extremely good', and 'The service here is go

In [13]:
nltk.download('vader_lexicon') #WE NEED TO EXPLAIN WHAT THIS IS
from nltk.sentiment.vader import SentimentIntensityAnalyzer #WE NEED TO EXPLAIN WHAT THIS IS ALSO

for index, row in df['text'].iteritems():
    result = SentimentIntensityAnalyzer().polarity_scores(row)
    # if index%50 == 0:
    if result['neg'] > result['pos']:
        df.loc[index, "Sentiment"] = "negative"
    elif result['pos'] > result['neg']:
        df.loc[index, "Sentiment"] = "positive"
    else:
        df.loc[index, "Sentiment"] = "neutral"
        
    df.loc[index, 'neg'] = result['neg']
    df.loc[index, 'neu'] = result['neu']
    df.loc[index, 'pos'] = result['pos']
    df.loc[index, 'compound'] = result['compound']
df

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  for index, row in df['text'].iteritems():


Unnamed: 0,stars,text,Sentiment,neg,neu,pos,compound
6967,4,simply put grind provided one top five burgers...,positive,0.071,0.747,0.183,0.8854
3152,5,disgruntled reviews read sapporo things say1 ...,positive,0.157,0.605,0.237,0.8625
834,2,uhhhthese supposed carne fries asu boys think ...,negative,0.176,0.649,0.175,-0.2732
9217,5,eat 3 times week antipasto salad bread addicti...,neutral,0.000,1.000,0.000,0.0000
6094,3,love eat however hit miss sometimes get good s...,positive,0.098,0.556,0.346,0.9517
...,...,...,...,...,...,...,...
4809,3,great see total wine closer live large select...,positive,0.000,0.711,0.289,0.7845
5260,3,want really filling delicious pleasemakemyhead...,positive,0.102,0.603,0.295,0.8230
2439,5,going nails 101 long time love manicure done m...,positive,0.000,0.641,0.359,0.8555
6673,4,hubby go almost ever friday happy hour love ch...,positive,0.038,0.549,0.414,0.9777


In [12]:
# one_stars = df[df['stars'] == 1]
# one_stars.head()