# Yelp Rating Prediction Using Tensorflow

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

# imports
import tensorflow as tf
import numpy as np
import sys
import sklearn as sk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer





### Get reviews

In [2]:
df = pd.read_json('data/yelp_academic_dataset_review.json', lines=True, chunksize=1000) # smaller chunksize helps with memory issues
df = pd.concat(df)
df.head()


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


### Clean reviews

In [3]:
business_review_count = df.groupby('business_id').size().reset_index(name='review_count')         # count reviews for each business
businesses_with_20 = business_review_count[business_review_count['review_count'] >= 20]           # filter businesses with 20 or more reviews
df = pd.merge(df, businesses_with_20, on='business_id', how='inner')                              # merge with reviews to get only reviews for businesses with 20 or more reviews
df = df.drop(['review_id', 'user_id', 'funny', 'cool', 'useful', 'date', 'review_count'], axis=1) # drop unnecessary columns                                                                        # reorder columns
df.head()

Unnamed: 0,business_id,stars,text
0,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is..."
1,XQfwVwDr-v0ZS3_CbbE5Xw,2,This is the second time we tried turning point...
2,XQfwVwDr-v0ZS3_CbbE5Xw,4,The place is cute and the staff was very frien...
3,XQfwVwDr-v0ZS3_CbbE5Xw,3,We came on a Saturday morning after waiting a ...
4,XQfwVwDr-v0ZS3_CbbE5Xw,2,"Mediocre at best. The decor is very nice, and ..."


### Preprocess reviews and then vectorize

In [4]:
# preprocess text function
def preprocess_text(text):
    text = text.lower()                 # lowercase
    text = re.sub(r'\d+', '', text)     # remove numbers
    text = re.sub(r'[^\w\s]', '', text) # remove punctuation
    text = re.sub(r'\s+', ' ', text)    # remove extra whitespace
    text = text.strip()                 # remove leading/trailing whitespace
    tokens = word_tokenize(text)        # tokenize
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words] # remove stopwords
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens] # lemmatize
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

df['preprocessed_text'] = df['text'].apply(preprocess_text)                                        # apply preprocess_text function to text column
df.head()

Unnamed: 0,business_id,stars,text,preprocessed_text
0,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is...",decide eat aware going take hour beginning end...
1,XQfwVwDr-v0ZS3_CbbE5Xw,2,This is the second time we tried turning point...,second time tried turning point location first...
2,XQfwVwDr-v0ZS3_CbbE5Xw,4,The place is cute and the staff was very frien...,place cute staff friendly nice menu good brunc...
3,XQfwVwDr-v0ZS3_CbbE5Xw,3,We came on a Saturday morning after waiting a ...,came saturday morning waiting month opening ho...
4,XQfwVwDr-v0ZS3_CbbE5Xw,2,"Mediocre at best. The decor is very nice, and ...",mediocre best decor nice like restaurant tryin...


In [5]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1, 3))   # initialize tfidf vectorizer
tfidf_vectors = tfidf_vectorizer.fit_transform(df['preprocessed_text'])                            # fit vectorizer to preprocessed text
tfidf_df = pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names_out()) # convert vectors to dataframe
tfidf_df.head()

MemoryError: 

: 

In [None]:
import os

path = "./data/"
filename_write = os.path.join(path, "tfidf_df.csv")
tfidf_df = tfidf_df.reindex(np.random.permutation(tfidf_df.index))
tfidf_df.to_csv(filename_write, index=False)
print("Wrote file to {}".format(filename_write))