## This notebook builds a list of restaurants that have 5 or more reviews.
## It also labels individual reviews rated 3 or more stars below the restaurant's average.

In [None]:
#import a bunch of libraries at the very start of the project, will modify as things go on
import os
import pandas as pd
from dotenv import load_dotenv
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re
from collections import Counter
from nltk import ngrams
from wordcloud import WordCloud
import matplotlib as mpl
import matplotlib.pyplot as plt
import spacy
from spacy import displacy
import json

%matplotlib inline
# Flag selecting the data source for this notebook:
#   True  -> load the previously saved DataFrame from file.
#   False -> run the read_json pipeline over the raw JSON and write the file.
loader = True

# NLTK VADER sentiment analyzer (relies on the 'vader_lexicon' download above).
# Not used by the cells visible in this part of the notebook.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Go through the raw Yelp JSON files and create the output file when the
# `loader` flag is False.
#
# Steps:
#   1. Load the business file and keep only businesses whose categories
#      mention "Restaurants".
#   2. Stream the review file in chunks, keeping only reviews of those
#      restaurants.
#   3. Keep only restaurants with 5 or more reviews.
#   4. Merge reviews with business info, drop unneeded columns and save the
#      result to 'formatted_reviews.ftr'.
if not loader:
    path_business = r"D:\yelp_data\yelp_academic_dataset_business.json"
    # Load the Yelp overview of the businesses (one JSON object per line).
    # Handing pandas the path lets it open and close the file itself instead
    # of leaving a manually opened handle dangling.
    business_df = pd.read_json(path_business, lines=True)

    # Rows with any missing field are dropped; this also guarantees that
    # 'categories' is present for the membership test below.
    business_df.dropna(inplace=True)

    # Keep only the businesses that are restaurants.
    is_restaurant = business_df['categories'].map(lambda cats: "Restaurants" in cats)
    business_df = business_df[is_restaurant].copy()
    # Marker column kept so business_df has the same columns as before.
    business_df['restaurant_marker'] = 1

    # Set of restaurant ids for fast membership tests on the review chunks.
    business_list = set(business_df['business_id'].unique())
    print(business_df.shape)

    path_review = r"D:\yelp_data\yelp_academic_dataset_review.json"
    review_columns = ["user_id", "stars", "text", "business_id"]

    # Load the reviews dataset chunk by chunk, keeping restaurant reviews only.
    review_holder = pd.read_json(path_review, lines=True, orient="records", chunksize=100000)
    restaurant_chunks = []
    for chunk in review_holder:
        chunk = chunk[review_columns].dropna()
        # The membership test has to run on the review chunk itself. The
        # earlier version applied it to business_df rows, so the resulting
        # marker was index-aligned against an unrelated frame.
        chunk = chunk[chunk['business_id'].isin(business_list)].copy()
        chunk["stars"] = chunk["stars"].astype("int8")
        restaurant_chunks.append(chunk)

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copied the accumulated frame on every iteration.
    if restaurant_chunks:
        review_df = pd.concat(restaurant_chunks, ignore_index=True)
    else:
        # No matching reviews: fall back to an empty frame with the same schema.
        review_df = pd.DataFrame({
            "user_id": pd.Series(dtype="object"),
            "stars": pd.Series(dtype="int8"),
            "text": pd.Series(dtype="object"),
            "business_id": pd.Series(dtype="object"),
        })

    # Count the reviews left for each restaurant and keep those with 5 or
    # more, matching the notebook's stated goal (the previous `> 5` silently
    # excluded restaurants with exactly 5 reviews).
    counts = review_df["business_id"].value_counts()
    counts = counts[counts >= 5]
    review_df = review_df[review_df['business_id'].isin(counts.index)]

    # Merge the reviews with the business DF. Both frames have a 'stars'
    # column, so pandas suffixes them: 'stars_x' comes from the reviews and
    # 'stars_y' from the businesses.
    review_df = pd.merge(review_df, business_df, on='business_id')

    # Drop un-needed columns
    review_df.drop(['address', 'city','state','postal_code','latitude','longitude','is_open','attributes','categories','hours','restaurant_marker'],axis = 1, inplace = True)

    # Save this review DF
    review_df.to_feather('formatted_reviews.ftr')

    # Expose the result under the name the later cells use, so the notebook
    # also runs end to end when the data is rebuilt instead of loaded.
    second_df = review_df


In [None]:
# Load the previously created feather file when the `loader` flag is set,
# instead of rebuilding the review table from the raw JSON files.
if loader:
    second_df = pd.read_feather('formatted_reviews.ftr')

In [None]:
# Label those reviews that are 3 or more stars less than the restaurant's
# average: label is 1 when stars_y - stars_x >= 3, otherwise 0.
# Computed column-wise rather than with a row-by-row apply, which is far
# slower on a large review table.
second_df['label'] = ((second_df['stars_y'] - second_df['stars_x']) >= 3).astype('int64')
second_df.shape

In [None]:
# Show the last rows of the labelled review table.
second_df.tail()

In [None]:
# Count how many reviews received each label value (1 = flagged, 0 = not flagged).
second_df['label'].value_counts()