# <u>Yelp Rating Prediction Using Tensorflow</u>

## **Data Cleaning**

### *Libraries*

In [1]:
import numpy as np
import pandas as pd
import os
import sklearn.feature_extraction.text as sk_text
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


### *Get reviews*

In [2]:
# Stream the line-delimited JSON in small chunks (a smaller chunksize helps with memory issues)
# and stitch the chunks into a single DataFrame.
# read_json(..., chunksize=...) returns a JsonReader; using it as a context manager closes the
# underlying file handle even if parsing fails part-way through, and avoids binding `df` to a
# reader object before it becomes the actual DataFrame.
with pd.read_json('data/yelp_academic_dataset_review.json', lines=True, chunksize=1000) as review_chunks:
    df = pd.concat(review_chunks)
print(df.head())

                review_id                 user_id             business_id  \
0  KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1  BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2  saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3  AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4  Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   

   stars  useful  funny  cool  \
0      3       0      0     0   
1      5       1      0     1   
2      3       0      0     0   
3      5       1      0     1   
4      4       1      0     1   

                                                text                date  
0  If you decide to eat here, just be aware it is... 2018-07-07 22:09:11  
1  I've taken a lot of spin classes over the year... 2012-01-03 15:28:18  
2  Family diner. Had the buffet. Eclectic assortm... 2014-02-05 20:30:30  
3  Wow!  Yummy, different,  delicious.

### *Clean reviews*

In [3]:
# Convert all missing values and zeroes in specified column to median -> from labs with slight modifications
def missing_median(df, name):
    """Replace zeroes and missing values in column `name` with the column median.

    The median is computed over the entries that are not equal to 0 (missing
    entries are skipped by `median()`). The column is overwritten on the
    DataFrame that was passed in; nothing is returned.

    Args:
        df: DataFrame to modify in place.
        name: Label of the column to clean.
    """
    column = df[name]
    median_value = column[column != 0].median()
    # Zeroes first, then whatever is still missing.
    df[name] = column.replace(0, median_value).fillna(median_value)

# Count the reviews each business received.
business_review_count = (
    df.groupby('business_id')
      .size()
      .reset_index(name='review_count')
)

# Keep only the businesses that have 20 or more reviews.
businesses_with_20 = business_review_count.loc[business_review_count['review_count'].ge(20)]

# Inner merge: only reviews whose business survived the filter above remain.
df = df.merge(businesses_with_20, how='inner', on='business_id')

# From here on only the rating and the review text are needed.
df = df.drop(columns=['business_id', 'review_id', 'user_id', 'funny', 'cool', 'useful', 'date', 'review_count'])

# In case there are any reviews with 0 or missing stars, replace them with the median.
missing_median(df, 'stars')
print(df.head())

   stars                                               text
0      3  If you decide to eat here, just be aware it is...
1      2  This is the second time we tried turning point...
2      4  The place is cute and the staff was very frien...
3      3  We came on a Saturday morning after waiting a ...
4      2  Mediocre at best. The decor is very nice, and ...


### *Preprocess reviews*

In [4]:
# preprocess text function -> if you get nltk error:  open Anaconda prompt -> ipython -> import nltk -> nltk.download('popular') OR command line -> python -m nltk.downloader popular
lemmatizer = WordNetLemmatizer()

# Build the stop-word lookup once. preprocess_text is applied to every review, so rebuilding
# this set from the corpus inside the function repeated the same work on each call.
STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, drop digits, drop punctuation, collapse
    whitespace, tokenize, remove English stop words, lemmatize, re-join.

    Args:
        text: Raw review text (a str).

    Returns:
        The cleaned text as a single string; an empty string when no token
        survives the filtering.
    """
    text = text.lower()                 # lowercase
    text = re.sub(r'\d+', '', text)     # remove numbers
    text = re.sub(r'[^\w\s]', '', text) # remove punctuation
    text = re.sub(r'\s+', ' ', text)    # remove extra whitespace
    text = text.strip()                 # remove leading/trailing whitespace
    tokens = word_tokenize(text)
    # NOTE(review): punctuation is stripped before this filter, so a contraction such as
    # "don't" reaches it as "dont" and will not match apostrophe-containing stop-word entries.
    # Left unchanged on purpose; confirm whether those tokens should be kept as features.
    lemmatized_tokens = [
        lemmatizer.lemmatize(token)     # lemmatize
        for token in tokens
        if token not in STOP_WORDS      # remove stopwords
    ]
    return ' '.join(lemmatized_tokens)  # join back into text

# Run preprocess_text over every review, store the result in a new column,
# and drop the original text column in the same step.
df = (
    df.assign(preprocessed_text=df['text'].apply(preprocess_text))
      .drop(columns='text')
)
print(df.head())

   stars                                  preprocessed_text
0      3  decide eat aware going take hour beginning end...
1      2  second time tried turning point location first...
2      4  place cute staff friendly nice menu good brunc...
3      3  came saturday morning waiting month opening ho...
4      2  mediocre best decor nice like restaurant tryin...


### *Save preprocessed dataframe*

In [5]:
# Persist the preprocessed reviews so the slow preprocessing step does not have to be repeated.
path = "./data/"
filename_write = os.path.join(path, "df_preprocessed.csv")
# utf-8 is written explicitly; the default encoding also worked.
df.to_csv(filename_write, encoding='utf-8', index=False)
message = "Wrote file to {}".format(filename_write)
print(message)

Wrote file to ./data/df_preprocessed.csv


### *Optional start point*

In [None]:
# Optional start point: preprocessing takes a while, so with a fresh kernel you can run the
# imports cell and then start from this cell instead of re-running the cells above.
# When `df` already exists (normal top-to-bottom run) nothing is reloaded.
#
# NOTE(review): the earlier attempt to reload the CSV most likely failed in the vectorizer cell
# because read_csv parses empty fields as NaN by default, so reviews whose text is empty after
# preprocessing come back as float NaN instead of ''. keep_default_na=False keeps every field
# of the text column as a string. Confirm against the original error message.
path = "./data/"
filename_read = os.path.join(path, "df_preprocessed.csv")
if 'df' not in globals():
    df = pd.read_csv(filename_read, keep_default_na=False)

### *Vectorize Reviews*

In [7]:
# TF-IDF features for the preprocessed reviews.
vectorizer = sk_text.TfidfVectorizer(
    max_features=400,   # can adjust max_features if encounter memory issues
    dtype=np.float32,   # reduces memory usage -> defaults to float64
)
corpus = df['preprocessed_text']            # put preprocessed text into corpus
matrix = vectorizer.fit_transform(corpus)   # fit and transform the corpus
tfidf_data = matrix.toarray()               # convert matrix to array
print('shape:', tfidf_data.shape)
print(tfidf_data)


shape: (6146631, 400)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### *Feature names*

In [8]:
# Terms selected by the fitted vectorizer; they are used further down as the column labels
# for the TF-IDF values.
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['able' 'absolutely' 'actually' 'almost' 'also' 'always' 'amazing'
 'amount' 'another' 'anyone' 'anything' 'appetizer' 'area' 'around'
 'arrived' 'ask' 'asked' 'atmosphere' 'attentive' 'away' 'awesome' 'back'
 'bacon' 'bad' 'bar' 'bartender' 'bbq' 'bean' 'beautiful' 'beef' 'beer'
 'best' 'better' 'big' 'bit' 'bite' 'bowl' 'bread' 'breakfast' 'bring'
 'brought' 'brunch' 'burger' 'business' 'busy' 'cake' 'call' 'called'
 'came' 'cant' 'car' 'care' 'check' 'cheese' 'chicken' 'chip' 'chocolate'
 'choice' 'city' 'clean' 'close' 'cocktail' 'coffee' 'cold' 'come'
 'coming' 'cooked' 'cool' 'could' 'couldnt' 'couple' 'course' 'crab'
 'cream' 'customer' 'cut' 'day' 'deal' 'decent' 'decided' 'definitely'
 'delicious' 'dessert' 'didnt' 'different' 'dining' 'dinner'
 'disappointed' 'dish' 'doesnt' 'dog' 'done' 'dont' 'door' 'drink' 'eat'
 'eating' 'egg' 'either' 'else' 'employee' 'end' 'enjoy' 'enjoyed'
 'enough' 'entree' 'especially' 'even' 'ever' 'every' 'everyone'
 'everything' 'excellent' 'expe

### *Concatenate stars and matrix into new dataframe*

In [9]:
# Check number of rows in dataframe equals number of rows in tfidf matrix.
assert len(df) == tfidf_data.shape[0], "Number of rows in dataframe does not match number of rows in matrix."
# pd.concat(axis=1) aligns rows by index label, not by position. Giving the TF-IDF frame the
# same index as `df` guarantees that each rating stays next to its own feature row even if
# `df` does not carry a default 0..n-1 index; with a default index the result is unchanged.
df_data = pd.concat([df[['stars']], pd.DataFrame(tfidf_data, index=df.index)], axis=1)
print(df_data.head())

   stars    0    1    2    3    4    5    6    7         8  ...  390  391  \
0      3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.168147  ...  0.0  0.0   
1      2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  0.0  0.0   
2      4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.207039  ...  0.0  0.0   
3      3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.128253  ...  0.0  0.0   
4      2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...  0.0  0.0   

   392       393       394  395       396  397  398  399  
0  0.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  0.0  
1  0.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  0.0  
2  0.0  0.000000  0.000000  0.0  0.000000  0.0  0.0  0.0  
3  0.0  0.000000  0.090665  0.0  0.000000  0.0  0.0  0.0  
4  0.0  0.117374  0.000000  0.0  0.139791  0.0  0.0  0.0  

[5 rows x 401 columns]


### *Add feature names into dataframe*

In [10]:
# Label the columns: the rating first, followed by one column per vectorizer term.
column_labels = ['stars']
column_labels.extend(feature_names.tolist())
df_data.columns = column_labels
print(df_data.head())

   stars  able  absolutely  actually  almost  also  always  amazing  amount  \
0      3   0.0         0.0       0.0     0.0   0.0     0.0      0.0     0.0   
1      2   0.0         0.0       0.0     0.0   0.0     0.0      0.0     0.0   
2      4   0.0         0.0       0.0     0.0   0.0     0.0      0.0     0.0   
3      3   0.0         0.0       0.0     0.0   0.0     0.0      0.0     0.0   
4      2   0.0         0.0       0.0     0.0   0.0     0.0      0.0     0.0   

    another  ...  wont  work  working     worth     would  wouldnt     wrong  \
0  0.168147  ...   0.0   0.0      0.0  0.000000  0.000000      0.0  0.000000   
1  0.000000  ...   0.0   0.0      0.0  0.000000  0.000000      0.0  0.000000   
2  0.207039  ...   0.0   0.0      0.0  0.000000  0.000000      0.0  0.000000   
3  0.128253  ...   0.0   0.0      0.0  0.000000  0.090665      0.0  0.000000   
4  0.000000  ...   0.0   0.0      0.0  0.117374  0.000000      0.0  0.139791   

   year  yet  youre  
0   0.0  0.0    0.0  


### *Save dataframe*

In [11]:
# Write the final ratings + TF-IDF table next to the other data files (`path` is set in an earlier cell).
filename_write = os.path.join(path, "df_data.csv")
df_data.to_csv(filename_write, index=False)
message = "Wrote file to {}".format(filename_write)
print(message)

Wrote file to ./data/df_data.csv
