## Notebook is aiming to create a list of restaurants that have 5 or more reviews
## Notebook also labels individual reviews that are very low compared to the restaurant's average

In [1]:
#import a bunch of libraries at the very start of the project, will modify as things go on
import os
import pandas as pd
from dotenv import load_dotenv
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re
from collections import Counter
from nltk import ngrams
from wordcloud import WordCloud
import matplotlib as mpl
import matplotlib.pyplot as plt
import spacy
from spacy import displacy
import json
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

%matplotlib inline
# Create a flag to use to run read_json or to load the DF from file.
loader = True

#Load analyzer if needed
#analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\trevo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Go through Json file and create output file if flag is False.
if loader == False:
    path_business = r"D:\yelp_data\yelp_academic_dataset_business.json"
    # business_df = pd.read_json(path_business)
    # business_df.head()
    #Load in the Yelp overview of the businesses in the dataset
    data_business = open(path_business)
    read_data_business = data_business.read()
    business_df = pd.read_json(read_data_business, lines = True)
    business_df.head()
    #Create a column to indicate whether or not a business is a restaurant
    #business_df['restaurant_marker'] = business_df.apply(lambda x: 1 if 'Restaurants' in x['categories']  else 0,axis = 1)
    business_df.dropna(inplace=True)
    business_df['restaurant_marker'] = business_df.apply(lambda x: 1 if ("Restaurants" in x['categories'])  else 0,axis = 1)
    business_df = business_df[business_df['restaurant_marker'] == 1]
    business_df.head()
    #Create a set of those businesses that are restaurants
    business_list = set(business_df.business_id.unique())
    print(business_df.shape)
    path_review = r"D:\yelp_data\yelp_academic_dataset_review.json"
    column_names = {"user_id":'',"stars":int(),"text":'',"business_id":''}
    review_df = pd.DataFrame(column_names, index = [])
    review_df["stars"] = review_df["stars"].astype("int8")
    #Load in the reviews dataset, identifying whether or not a review is for a restaurant
    review_holder = pd.read_json(path_review,lines = True, orient ="records",chunksize=100000) #, nrows = 4000000
    for i in review_holder:
        i = i[["user_id","stars","text","business_id"]]
        i.dropna(inplace=True)
        i['restaurant_marker'] = business_df.apply(lambda x: 1 if (x['business_id'] in business_list)  else 0,axis = 1)
        i = i[i['restaurant_marker'] == 1]
        i["stars"] = i["stars"].astype("int8")
        review_df = review_df.append(i, ignore_index = True)
    review_df.shape
    review_df.head()
    #Count the number of reviews left for each restaurant, limit to those with more than 5 reviews
    counts = review_df["business_id"].value_counts()
    counts = counts[counts > 5]
    counts
    review_df['five_plus'] = review_df.apply(lambda x: 1 if (x['business_id'] in counts.index)  else 0,axis = 1)
    #Keep only those restaurants with 5 or more reviews
    review_df = review_df[review_df['five_plus'] == 1]
    review_df.drop(['restaurant_marker', 'five_plus'],axis = 1, inplace = True)
    #Merge the reviews with the business DF
    review_df = pd.merge(review_df,business_df, on = 'business_id')
    #review_df.head()
    #business_df.head()
    #Drop un-needed columns
    review_df.drop(['address', 'city','state','postal_code','latitude','longitude','is_open','attributes','categories','hours','restaurant_marker'],axis = 1, inplace = True)
    review_df.head()
    #Save this review DF
    review_df.to_feather('formatted_reviews.ftr')


In [3]:
# Load previously created file if flag is true
if loader == True:
    second_df = pd.read_feather('formatted_reviews.ftr')

In [4]:
#Label those reviews that are 3 or more stars less than the restaurant's average
second_df['label'] = second_df.apply(lambda x: 1 if x['stars_y'] - x['stars_x'] >= 3 else 0,  axis = 1)
second_df.head()
second_df.shape

(23826, 8)

In [5]:
second_df.tail()

Unnamed: 0,user_id,stars_x,text,business_id,name,stars_y,review_count,label
23821,i0jnz-ODkqqqmcxU-166nw,5,I went here today at 6pm not expecting many pi...,06T7cI-8vpENSHmPfm3H8w,The Pie Hole,4.0,109,0
23822,4AFrxm9c1j20ybQ9-stx6w,1,"I really, really, really wanted to like this p...",06T7cI-8vpENSHmPfm3H8w,The Pie Hole,4.0,109,1
23823,0YCQvW-XjgHiV_nNymlC7A,3,Bought a 9 inch raspberry cream pie.\n\nIt was...,06T7cI-8vpENSHmPfm3H8w,The Pie Hole,4.0,109,0
23824,V2OZsbSi9zgTAKpjY5bXSQ,2,"Being a Burnaby resident, I was so excited to ...",06T7cI-8vpENSHmPfm3H8w,The Pie Hole,4.0,109,0
23825,jIZNiaRnLYCCqn-PA2b7lA,5,I have been eating these pies for about 2 year...,06T7cI-8vpENSHmPfm3H8w,The Pie Hole,4.0,109,0


In [6]:
second_df['label'].value_counts()

0    23134
1      692
Name: label, dtype: int64

# Now that the dataframe is created, it can be prepared for machine learning

In [7]:
working_df = second_df[['text','label']]
working_df.head()

Unnamed: 0,text,label
0,Apparently Prides Osteria had a rough summer a...,0
1,This is as close to dining in Italy as you'll ...,0
2,A gem tucked away in Beverly. The atmosphere i...,0
3,"The food is excellent, although the wait for a...",0
4,Meh. Just wasn't comfortable with the minimali...,0


In [8]:
#Remove stop words from the working_df
stop_list = set(stopwords.words("english"))
def modify_text(review, stop_list):
    review = re.sub(r'\n+', ' ', review)
    review = re.sub(r'[^A-Za-z0-9 ]+', '', review)
    return_words = review.split(" ")
    return_list = []
    for i in return_words:

        i = i.lower()
        if not i in stop_list:
            return_list.append(i)
    return " ".join(return_list)
#print(modify_text("This is as close to dining in Italy as you'll",stop_list))
working_df['text'] = working_df['text'].apply(lambda x: modify_text(x,stop_list))
working_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  working_df['text'] = working_df['text'].apply(lambda x: modify_text(x,stop_list))


Unnamed: 0,text,label
0,apparently prides osteria rough summer evidenc...,0
1,close dining italy youll find new englandchef ...,0
2,gem tucked away beverly atmosphere charming se...,0
3,food excellent although wait table took long e...,0
4,meh wasnt comfortable minimalist read cold vib...,0


In [9]:
#
counter = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor= None, stop_words = None, max_features = 5000)

