## Sentimemtal Analysis using Restaurant Reviews

#### Python Imports

In [21]:
import nltk
import pandas as pd
from collections import defaultdict
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
%reload_ext watermark

In [3]:
%watermark -n -v -iv

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.7.0

pandas: 1.5.2
nltk  : 3.8.1



In [4]:
nltk.data.path.append("./nltk_data")

#### Read the tab-separated Restaurant Reviews file

File can be downloaded from: https://www.kaggle.com/datasets/d4rklucif3r/restaurant-reviews

In [5]:
reviews_df = pd.read_csv('./txt_data/Restaurant_Reviews.tsv', sep='\t')
reviews_df.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


#### Display information about the reviews dataframe

In [6]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


#### Extract the reviews text as a list

In [7]:
reviews_txt = reviews_df.Review.values.tolist()
reviews_txt

['Wow... Loved this place.',
 'Crust is not good.',
 'Not tasty and the texture was just nasty.',
 'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
 'The selection on the menu was great and so were the prices.',
 'Now I am getting angry and I want my damn pho.',
 "Honeslty it didn't taste THAT fresh.)",
 'The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.',
 'The fries were great too.',
 'A great touch.',
 'Service was very prompt.',
 'Would not go back.',
 'The cashier had no care what so ever on what I had to say it still ended up being wayyy overpriced.',
 'I tried the Cape Cod ravoli, chicken, with cranberry...mmmm!',
 'I was disgusted because I was pretty sure that was human hair.',
 'I was shocked because no signs indicate cash only.',
 'Highly recommended.',
 'Waitress was a little slow in service.',
 'This place is not worth your time, let alone Vegas.',
 'did not like at all.

#### Create an instance of a word tokenizer

In [8]:
word_tokenizer = WordPunctTokenizer()

#### Create an instance of a word Lemmatizer

In [9]:
word_lemmatizer = nltk.WordNetLemmatizer()

#### Clean the reviews by removing punctuations, two letter words, and converting to root words

In [10]:
vocabulary_dict = defaultdict(int)
cleansed_review_txt = []
for review in reviews_txt:
    tokens = word_tokenizer.tokenize(review)
    alpha_words = [word.lower() for word in tokens if word.isalpha() and len(word) > 2]
    final_words = [word_lemmatizer.lemmatize(word) for word in alpha_words]
    for word in final_words:
        vocabulary_dict[word] += 1
    cleansed_review = ' '.join(final_words)
    cleansed_review_txt.append(cleansed_review)
cleansed_review_txt

['wow loved this place',
 'crust not good',
 'not tasty and the texture wa just nasty',
 'stopped during the late may bank holiday off rick steve recommendation and loved',
 'the selection the menu wa great and were the price',
 'now getting angry and want damn pho',
 'honeslty didn taste that fresh',
 'the potato were like rubber and you could tell they had been made ahead time being kept under warmer',
 'the fry were great too',
 'great touch',
 'service wa very prompt',
 'would not back',
 'the cashier had care what ever what had say still ended being wayyy overpriced',
 'tried the cape cod ravoli chicken with cranberry mmmm',
 'wa disgusted because wa pretty sure that wa human hair',
 'wa shocked because sign indicate cash only',
 'highly recommended',
 'waitress wa little slow service',
 'this place not worth your time let alone vega',
 'did not like all',
 'the burrittos blah',
 'the food amazing',
 'service also cute',
 'could care le the interior just beautiful',
 'they perform

In [11]:
reviews_df['Review'] = cleansed_review_txt
reviews_df.head(10)

Unnamed: 0,Review,Liked
0,wow loved this place,1
1,crust not good,0
2,not tasty and the texture wa just nasty,0
3,stopped during the late may bank holiday off r...,1
4,the selection the menu wa great and were the p...,1
5,now getting angry and want damn pho,0
6,honeslty didn taste that fresh,0
7,the potato were like rubber and you could tell...,0
8,the fry were great too,1
9,great touch,1


#### Number of unique words in reviews corpus (before filtering)

In [12]:
len(vocabulary_dict)

1851

#### Display the Restaurant Reviews vocabulary that have at least 3 occurrences

In [13]:
vocabulary = []
sorted_vocabulary = sorted(vocabulary_dict.items(), key=lambda kv: kv[1], reverse=True)
for word, count in sorted_vocabulary:
    if count > 2:
        vocabulary.append(word)
vocabulary

['the',
 'and',
 'wa',
 'this',
 'food',
 'not',
 'place',
 'for',
 'good',
 'service',
 'very',
 'with',
 'great',
 'had',
 'that',
 'you',
 'were',
 'are',
 'but',
 'have',
 'back',
 'they',
 'here',
 'time',
 'like',
 'all',
 'our',
 'will',
 'there',
 'really',
 'just',
 'their',
 'best',
 'would',
 'ever',
 'restaurant',
 'also',
 'friendly',
 'one',
 'only',
 'never',
 'don',
 'your',
 'out',
 'nice',
 'been',
 'what',
 'amazing',
 'again',
 'can',
 'from',
 'delicious',
 'vega',
 'did',
 'which',
 'pretty',
 'get',
 'some',
 'came',
 'when',
 'love',
 'experience',
 'staff',
 'eat',
 'minute',
 'definitely',
 'even',
 'disappointed',
 'won',
 'chicken',
 'server',
 'bad',
 'much',
 'star',
 'got',
 'going',
 'steak',
 'pizza',
 'price',
 'made',
 'say',
 'salad',
 'first',
 'more',
 'dish',
 'about',
 'menu',
 'could',
 'burger',
 'think',
 'way',
 'better',
 'too',
 'ha',
 'worst',
 'always',
 'than',
 'well',
 'how',
 'want',
 'fresh',
 'being',
 'because',
 'table',
 'quality

#### Display the final count of vocabulary in the reviews corpus (post filtering)

In [14]:
len(vocabulary)

541

#### Create an instance of the TF-IDF word vectorizer

In [15]:
word_vectorizer = TfidfVectorizer(vocabulary=vocabulary)

#### Create a TF-IDF word vector for Restaurant Reviews vocabulary

In [22]:
matrix = word_vectorizer.fit_transform(reviews_df['Review']).toarray()
reviews_vector_df = pd.DataFrame(data=matrix, columns=vocabulary)
reviews_vector_df['_pos_neg'] = reviews_df['Liked']
reviews_vector_df.head(10)

Unnamed: 0,the,and,wa,this,food,not,place,for,good,service,...,him,edible,seating,work,crowd,style,salsa,awful,lacked,_pos_neg
0,0.0,0.0,0.0,0.312551,0.0,0.0,0.335772,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.684638,0.0,0.0,0.728883,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.143707,0.167243,0.187134,0.0,0.0,0.254375,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.131507,0.153045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.443533,0.172058,0.192522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5,0.0,0.161452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,0.09667,0.112503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,0.191076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


#### Create the training and test data

In [17]:
X_train, X_test, y_train, y_test = train_test_split(reviews_vector_df, reviews_vector_df['_pos_neg'], test_size=0.25, random_state=101)
X_train = X_train.drop('_pos_neg', axis=1)
X_test = X_test.drop('_pos_neg', axis=1)

#### Initialize various Machine Learning models

In [18]:
model_names = ['Logistic Regression', 'Multinomial Naive Bayes', 'Random Forest']
model_instances = [LogisticRegression(), MultinomialNB(), RandomForestClassifier()]
ml_models = zip(model_names, model_instances)

#### Train, test, and score the Machine Learning Models

In [19]:
for name, model in ml_models:
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    score = accuracy_score(y_test, y_predict)
    print(f'Model: {name}, Accuracy: {score}')

Model: Logistic Regression, Accuracy: 0.832
Model: Multinomial Naive Bayes, Accuracy: 0.8
Model: Random Forest, Accuracy: 0.74
