# Workflow
- Cleaning data
    - Lowercasing
    - Removing punctuation
    - Removing stopwords
    - Removing other common, uninformative words
- Lemmatizing
- Sentiment Analysis

# Cleaning Data

In [8]:
import pandas as pd

In [9]:
# Read the review dataset from the project's data folder, then show the frame
# so the available columns can be inspected.
df = pd.read_csv(filepath_or_buffer="data/reviews_data.csv")
df

Unnamed: 0,review,word count,char count,stopword count
0,Starbelly is always a solid stop for a meal in...,79,421,30
1,Stopped by for a late brunch in the Castro and...,70,441,24
2,"Cutest place, sat outside in the back and the ...",83,447,36
3,My family and I had a great time last Friday n...,47,258,16
4,Starbelly is one of those places that feels li...,280,1529,121
...,...,...,...,...
511,I used to go to Starbelly often before COVID A...,73,409,29
512,Came here for a lunch in their beautiful back ...,160,898,70
513,Don't even know where to start. I was excited...,187,1004,76
514,"Intimate vibes, great music, and fantastic ser...",141,787,56


## Lowercasing all words

In [10]:
# Lowercase every review word by word. Splitting on whitespace and re-joining
# with single spaces also collapses any repeated whitespace in the raw text.
df["review lower"] = df["review"].map(
    lambda text: " ".join([token.lower() for token in text.split()])
)
df

Unnamed: 0,review,word count,char count,stopword count,review lower
0,Starbelly is always a solid stop for a meal in...,79,421,30,starbelly is always a solid stop for a meal in...
1,Stopped by for a late brunch in the Castro and...,70,441,24,stopped by for a late brunch in the castro and...
2,"Cutest place, sat outside in the back and the ...",83,447,36,"cutest place, sat outside in the back and the ..."
3,My family and I had a great time last Friday n...,47,258,16,my family and i had a great time last friday n...
4,Starbelly is one of those places that feels li...,280,1529,121,starbelly is one of those places that feels li...
...,...,...,...,...,...
511,I used to go to Starbelly often before COVID A...,73,409,29,i used to go to starbelly often before covid a...
512,Came here for a lunch in their beautiful back ...,160,898,70,came here for a lunch in their beautiful back ...
513,Don't even know where to start. I was excited...,187,1004,76,don't even know where to start. i was excited ...
514,"Intimate vibes, great music, and fantastic ser...",141,787,56,"intimate vibes, great music, and fantastic ser..."


## Removing Punctuation

In [11]:
# Strip punctuation: delete every character that is neither a word character
# (\w) nor whitespace (\s).
# - The pattern is a raw string so the backslashes reach the regex engine
#   untouched (a plain "\w" is an invalid string escape).
# - regex=True is passed explicitly: Series.str.replace treats the pattern as a
#   literal string by default in pandas >= 2.0, which would silently leave all
#   punctuation in place.
df["review nopunc"] = df["review lower"].str.replace(r"[^\w\s]", "", regex=True)
df

  df["review nopunc"] = df['review lower'].str.replace("[^\w\s]", "")


Unnamed: 0,review,word count,char count,stopword count,review lower,review nopunc
0,Starbelly is always a solid stop for a meal in...,79,421,30,starbelly is always a solid stop for a meal in...,starbelly is always a solid stop for a meal in...
1,Stopped by for a late brunch in the Castro and...,70,441,24,stopped by for a late brunch in the castro and...,stopped by for a late brunch in the castro and...
2,"Cutest place, sat outside in the back and the ...",83,447,36,"cutest place, sat outside in the back and the ...",cutest place sat outside in the back and the h...
3,My family and I had a great time last Friday n...,47,258,16,my family and i had a great time last friday n...,my family and i had a great time last friday n...
4,Starbelly is one of those places that feels li...,280,1529,121,starbelly is one of those places that feels li...,starbelly is one of those places that feels li...
...,...,...,...,...,...,...
511,I used to go to Starbelly often before COVID A...,73,409,29,i used to go to starbelly often before covid a...,i used to go to starbelly often before covid a...
512,Came here for a lunch in their beautiful back ...,160,898,70,came here for a lunch in their beautiful back ...,came here for a lunch in their beautiful back ...
513,Don't even know where to start. I was excited...,187,1004,76,don't even know where to start. i was excited ...,dont even know where to start i was excited to...
514,"Intimate vibes, great music, and fantastic ser...",141,787,56,"intimate vibes, great music, and fantastic ser...",intimate vibes great music and fantastic servi...


## Removing Stopwords

In [12]:
from nltk.corpus import stopwords
# English stopwords provided by NLTK; used below to filter the reviews.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be present
# locally (e.g. via nltk.download("stopwords")) - confirm on a fresh machine.
stop_words = stopwords.words("english")

In [13]:
# Remove stopwords from each punctuation-free review.
# Membership is checked against a set: the original list would be scanned from
# the start for every single word of every review, while a set lookup is
# constant time and yields exactly the same result.
stop_word_set = set(stop_words)
df["review nostop"] = df["review nopunc"].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_word_set)
)
df

Unnamed: 0,review,word count,char count,stopword count,review lower,review nopunc,review nostop
0,Starbelly is always a solid stop for a meal in...,79,421,30,starbelly is always a solid stop for a meal in...,starbelly is always a solid stop for a meal in...,starbelly always solid stop meal castro came d...
1,Stopped by for a late brunch in the Castro and...,70,441,24,stopped by for a late brunch in the castro and...,stopped by for a late brunch in the castro and...,stopped late brunch castro overall enjoyed exp...
2,"Cutest place, sat outside in the back and the ...",83,447,36,"cutest place, sat outside in the back and the ...",cutest place sat outside in the back and the h...,cutest place sat outside back heaters abundant...
3,My family and I had a great time last Friday n...,47,258,16,my family and i had a great time last friday n...,my family and i had a great time last friday n...,family great time last friday night dinner foo...
4,Starbelly is one of those places that feels li...,280,1529,121,starbelly is one of those places that feels li...,starbelly is one of those places that feels li...,starbelly one places feels like home away home...
...,...,...,...,...,...,...,...
511,I used to go to Starbelly often before COVID A...,73,409,29,i used to go to starbelly often before covid a...,i used to go to starbelly often before covid a...,used go starbelly often covid liked menu howev...
512,Came here for a lunch in their beautiful back ...,160,898,70,came here for a lunch in their beautiful back ...,came here for a lunch in their beautiful back ...,came lunch beautiful back patio seated promptl...
513,Don't even know where to start. I was excited...,187,1004,76,don't even know where to start. i was excited ...,dont even know where to start i was excited to...,dont even know start excited come back loving ...
514,"Intimate vibes, great music, and fantastic ser...",141,787,56,"intimate vibes, great music, and fantastic ser...",intimate vibes great music and fantastic servi...,intimate vibes great music fantastic service s...


## Removing some common words

In [24]:
# Build a word-frequency table over the whole corpus: concatenate all
# stopword-free reviews, split into tokens, and count each distinct token.
all_tokens = " ".join(df["review nostop"]).split()
token_counts = pd.Series(all_tokens).value_counts()
freq = token_counts.reset_index()
# Give the two resulting columns explicit, stable names.
freq.columns = ["words", "frequencies"]

In [29]:
# Show the 20 most frequent words (value_counts already sorted them descending).
freq.iloc[:20]

Unnamed: 0,words,frequencies
0,good,408
1,food,384
2,great,290
3,pizza,264
4,chicken,245
5,service,235
6,back,233
7,starbelly,209
8,place,193
9,really,187


In [42]:
# Extra words that are frequent in this corpus but carry no sentiment;
# they are filtered out in addition to the standard stopword list.
other_stopwords = [
    "one", "came", "would", "us", "got", "get", "go", "im", "try",
]
df["review noother"] = df["review nostop"].map(
    lambda text: " ".join(
        [token for token in text.split() if token not in other_stopwords]
    )
)
df

Unnamed: 0,review,word count,char count,stopword count,review lower,review nopunc,review nostop,review noother
0,Starbelly is always a solid stop for a meal in...,79,421,30,starbelly is always a solid stop for a meal in...,starbelly is always a solid stop for a meal in...,starbelly always solid stop meal castro came d...,starbelly always solid stop meal castro dinner...
1,Stopped by for a late brunch in the Castro and...,70,441,24,stopped by for a late brunch in the castro and...,stopped by for a late brunch in the castro and...,stopped late brunch castro overall enjoyed exp...,stopped late brunch castro overall enjoyed exp...
2,"Cutest place, sat outside in the back and the ...",83,447,36,"cutest place, sat outside in the back and the ...",cutest place sat outside in the back and the h...,cutest place sat outside back heaters abundant...,cutest place sat outside back heaters abundant...
3,My family and I had a great time last Friday n...,47,258,16,my family and i had a great time last friday n...,my family and i had a great time last friday n...,family great time last friday night dinner foo...,family great time last friday night dinner foo...
4,Starbelly is one of those places that feels li...,280,1529,121,starbelly is one of those places that feels li...,starbelly is one of those places that feels li...,starbelly one places feels like home away home...,starbelly places feels like home away home int...
...,...,...,...,...,...,...,...,...
511,I used to go to Starbelly often before COVID A...,73,409,29,i used to go to starbelly often before covid a...,i used to go to starbelly often before covid a...,used go starbelly often covid liked menu howev...,used starbelly often covid liked menu however ...
512,Came here for a lunch in their beautiful back ...,160,898,70,came here for a lunch in their beautiful back ...,came here for a lunch in their beautiful back ...,came lunch beautiful back patio seated promptl...,lunch beautiful back patio seated promptly res...
513,Don't even know where to start. I was excited...,187,1004,76,don't even know where to start. i was excited ...,dont even know where to start i was excited to...,dont even know start excited come back loving ...,dont even know start excited come back loving ...
514,"Intimate vibes, great music, and fantastic ser...",141,787,56,"intimate vibes, great music, and fantastic ser...",intimate vibes great music and fantastic servi...,intimate vibes great music fantastic service s...,intimate vibes great music fantastic service s...


## Lemmatizing

In [7]:
# `nltk` itself must be imported here: the earlier `from nltk.corpus import
# stopwords` does not bind the name `nltk`, so the download calls below would
# raise NameError on a fresh kernel (Restart & Run All).
import nltk
from textblob import Word

# Fetch the WordNet corpora used for lemmatizing below. nltk.download returns
# True on success, which is why the cell displays `True`.
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amruthaa\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Amruthaa\AppData\Roaming\nltk_data...


True

In [43]:
# Lemmatize every remaining token with TextBlob's Word and re-join the tokens
# into the final cleaned text used for sentiment analysis.
# NOTE(review): the output shows plural nouns reduced ("heaters" -> "heater")
# while verb forms such as "stopped"/"coming" stay unchanged - lemmatize()
# appears to treat tokens as nouns by default; confirm if verbs should be
# handled too.
df["cleaned review"] = df["review noother"].map(
    lambda text: " ".join([Word(token).lemmatize() for token in text.split()])
)
df

Unnamed: 0,review,word count,char count,stopword count,review lower,review nopunc,review nostop,review noother,cleaned review
0,Starbelly is always a solid stop for a meal in...,79,421,30,starbelly is always a solid stop for a meal in...,starbelly is always a solid stop for a meal in...,starbelly always solid stop meal castro came d...,starbelly always solid stop meal castro dinner...,starbelly always solid stop meal castro dinner...
1,Stopped by for a late brunch in the Castro and...,70,441,24,stopped by for a late brunch in the castro and...,stopped by for a late brunch in the castro and...,stopped late brunch castro overall enjoyed exp...,stopped late brunch castro overall enjoyed exp...,stopped late brunch castro overall enjoyed exp...
2,"Cutest place, sat outside in the back and the ...",83,447,36,"cutest place, sat outside in the back and the ...",cutest place sat outside in the back and the h...,cutest place sat outside back heaters abundant...,cutest place sat outside back heaters abundant...,cutest place sat outside back heater abundant ...
3,My family and I had a great time last Friday n...,47,258,16,my family and i had a great time last friday n...,my family and i had a great time last friday n...,family great time last friday night dinner foo...,family great time last friday night dinner foo...,family great time last friday night dinner foo...
4,Starbelly is one of those places that feels li...,280,1529,121,starbelly is one of those places that feels li...,starbelly is one of those places that feels li...,starbelly one places feels like home away home...,starbelly places feels like home away home int...,starbelly place feel like home away home inter...
...,...,...,...,...,...,...,...,...,...
511,I used to go to Starbelly often before COVID A...,73,409,29,i used to go to starbelly often before covid a...,i used to go to starbelly often before covid a...,used go starbelly often covid liked menu howev...,used starbelly often covid liked menu however ...,used starbelly often covid liked menu however ...
512,Came here for a lunch in their beautiful back ...,160,898,70,came here for a lunch in their beautiful back ...,came here for a lunch in their beautiful back ...,came lunch beautiful back patio seated promptl...,lunch beautiful back patio seated promptly res...,lunch beautiful back patio seated promptly res...
513,Don't even know where to start. I was excited...,187,1004,76,don't even know where to start. i was excited ...,dont even know where to start i was excited to...,dont even know start excited come back loving ...,dont even know start excited come back loving ...,dont even know start excited come back loving ...
514,"Intimate vibes, great music, and fantastic ser...",141,787,56,"intimate vibes, great music, and fantastic ser...",intimate vibes great music and fantastic servi...,intimate vibes great music fantastic service s...,intimate vibes great music fantastic service s...,intimate vibe great music fantastic service se...


In [44]:
# Compare the first review before and after cleaning, separated by a blank line.
original_text = df["review"].iloc[0]
cleaned_text = df["cleaned review"].iloc[0]
print(f"{original_text}\n\n{cleaned_text}")

Starbelly is always a solid stop for a meal in the Castro. I came here for dinner during Pride weekend and it was busy but still not a long wait to get a table. The cocktails were strong, the food was flavorful and fresh. We got the thai spiced pork sausage (tasty bites), the white bean puré (creamy!), and the spahgetti (solid) along with the steak skewer (a great bite) Will definitely be coming back on my next visit.

starbelly always solid stop meal castro dinner pride weekend busy still long wait table cocktail strong food flavorful fresh thai spiced pork sausage tasty bite white bean puré creamy spahgetti solid along steak skewer great bite definitely coming back next visit


# Sentiment Analysis
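TextBlob lets us measure the **polarity** (from -1.0, most negative, to 1.0, most positive) and the **subjectivity** (from 0.0, objective, to 1.0, subjective) of each review.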

In [45]:
from textblob import TextBlob

In [47]:
# Score each cleaned review once and reuse the result for both columns; the
# original ran the full TextBlob analysis twice per review.
# Each sentiment is indexed as (polarity, subjectivity): [0] and [1].
sentiments = [TextBlob(text).sentiment for text in df["cleaned review"]]
df["polarity"] = [sentiment[0] for sentiment in sentiments]
df["subjectivity"] = [sentiment[1] for sentiment in sentiments]

In [48]:
# Display the frame, now including the polarity and subjectivity columns.
df

Unnamed: 0,review,word count,char count,stopword count,review lower,review nopunc,review nostop,review noother,cleaned review,polarity,subjectivity
0,Starbelly is always a solid stop for a meal in...,79,421,30,starbelly is always a solid stop for a meal in...,starbelly is always a solid stop for a meal in...,starbelly always solid stop meal castro came d...,starbelly always solid stop meal castro dinner...,starbelly always solid stop meal castro dinner...,0.143939,0.307576
1,Stopped by for a late brunch in the Castro and...,70,441,24,stopped by for a late brunch in the castro and...,stopped by for a late brunch in the castro and...,stopped late brunch castro overall enjoyed exp...,stopped late brunch castro overall enjoyed exp...,stopped late brunch castro overall enjoyed exp...,0.237500,0.591667
2,"Cutest place, sat outside in the back and the ...",83,447,36,"cutest place, sat outside in the back and the ...",cutest place sat outside in the back and the h...,cutest place sat outside back heaters abundant...,cutest place sat outside back heaters abundant...,cutest place sat outside back heater abundant ...,0.102198,0.531868
3,My family and I had a great time last Friday n...,47,258,16,my family and i had a great time last friday n...,my family and i had a great time last friday n...,family great time last friday night dinner foo...,family great time last friday night dinner foo...,family great time last friday night dinner foo...,0.350000,0.511667
4,Starbelly is one of those places that feels li...,280,1529,121,starbelly is one of those places that feels li...,starbelly is one of those places that feels li...,starbelly one places feels like home away home...,starbelly places feels like home away home int...,starbelly place feel like home away home inter...,0.365206,0.711651
...,...,...,...,...,...,...,...,...,...,...,...
511,I used to go to Starbelly often before COVID A...,73,409,29,i used to go to starbelly often before covid a...,i used to go to starbelly often before covid a...,used go starbelly often covid liked menu howev...,used starbelly often covid liked menu however ...,used starbelly often covid liked menu however ...,-0.165909,0.738636
512,Came here for a lunch in their beautiful back ...,160,898,70,came here for a lunch in their beautiful back ...,came here for a lunch in their beautiful back ...,came lunch beautiful back patio seated promptl...,lunch beautiful back patio seated promptly res...,lunch beautiful back patio seated promptly res...,0.483073,0.640625
513,Don't even know where to start. I was excited...,187,1004,76,don't even know where to start. i was excited ...,dont even know where to start i was excited to...,dont even know start excited come back loving ...,dont even know start excited come back loving ...,dont even know start excited come back loving ...,0.092130,0.441204
514,"Intimate vibes, great music, and fantastic ser...",141,787,56,"intimate vibes, great music, and fantastic ser...",intimate vibes great music and fantastic servi...,intimate vibes great music fantastic service s...,intimate vibes great music fantastic service s...,intimate vibe great music fantastic service se...,0.359014,0.632937


In [51]:
# Narrow the view to the raw review text and its two sentiment scores.
df.loc[:, ["review", "polarity", "subjectivity"]]

Unnamed: 0,review,polarity,subjectivity
0,Starbelly is always a solid stop for a meal in...,0.143939,0.307576
1,Stopped by for a late brunch in the Castro and...,0.237500,0.591667
2,"Cutest place, sat outside in the back and the ...",0.102198,0.531868
3,My family and I had a great time last Friday n...,0.350000,0.511667
4,Starbelly is one of those places that feels li...,0.365206,0.711651
...,...,...,...
511,I used to go to Starbelly often before COVID A...,-0.165909,0.738636
512,Came here for a lunch in their beautiful back ...,0.483073,0.640625
513,Don't even know where to start. I was excited...,0.092130,0.441204
514,"Intimate vibes, great music, and fantastic ser...",0.359014,0.632937


In [62]:
# A review with the lowest polarity.
# idxmin() finds the first row holding the actual minimum, so this works even
# when no review scores exactly -1.0 (the original float-equality filter would
# produce an empty frame and .iloc[0] would raise IndexError).
df.loc[df["polarity"].idxmin(), "review"]

'i saw 2 bugs in my fries and a piece of hair.. very disgusting'

In [64]:
# A review with the highest polarity.
# idxmax() finds the first row holding the actual maximum, so this works even
# when no review scores exactly 1.0 (the original float-equality filter would
# produce an empty frame and .iloc[0] would raise IndexError).
df.loc[df["polarity"].idxmax(), "review"]

'Excellent service, delicious food and one of the best non-alcoholic beverages I have ever had out. The waitress was so helpful and delightful.'