In [1]:
from collections import Counter
import re
import string

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from tqdm.notebook import tqdm

In [2]:
# Show full cell contents in rendered frames so long review texts are not truncated.
pd.options.display.max_colwidth = None

## Loading Data

In [3]:
# Read the intermediate food-reviews CSV; the path is relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer="../data/intermediate/food_reviews.csv")

## Casing
Trade-offs of lowercasing:
* Casing can carry sentiment (e.g. "GREAT" vs. "great"), and that signal is lost
* A smaller vocabulary means less time & compute

In [4]:
# Lowercase every summary via the vectorised string accessor.
df["Summary"] = (
    df["Summary"]
    .str.lower()
)

In [5]:
# Lowercase every review body via the vectorised string accessor.
df["Text"] = (
    df["Text"]
    .str.lower()
)

In [6]:
# Preview the two text columns after lowercasing.
df.loc[:, ["Text", "Summary"]].head()

Unnamed: 0,Text,Summary
0,i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than most.,good quality dog food
1,"product arrived labeled as jumbo salted peanuts...the peanuts were actually small sized unsalted. not sure if this was an error or if the vendor intended to represent the product as ""jumbo"".",not as advertised
2,"this is a confection that has been around a few centuries. it is a light, pillowy citrus gelatin with nuts - in this case filberts. and it is cut into tiny squares and then liberally coated with powdered sugar. and it is a tiny mouthful of heaven. not too chewy, and very flavorful. i highly recommend this yummy treat. if you are familiar with the story of c.s. lewis' ""the lion, the witch, and the wardrobe"" - this is the treat that seduces edmund into selling out his brother and sisters to the witch.","""delight"" says it all"
3,if you are looking for the secret ingredient in robitussin i believe i have found it. i got this in addition to the root beer extract i ordered (which was good) and made some cherry soda. the flavor is very medicinal.,cough medicine
4,"great taffy at a great price. there was a wide assortment of yummy taffy. delivery was very quick. if your a taffy lover, this is a deal.",great taffy


## Punctuation Removal

In [7]:
# Characters deleted by remove_punctuation. The list deliberately or accidentally
# omits '!', '(', ')', ':' and ';', so those survive in the cleaned text.
# NOTE(review): presumably kept for emphasis/emoticon cues — confirm this is intended.
PUNCT_TO_REMOVE = "\"#$%&'*+,-./<=>?@[\\]^_`{|}~"

In [8]:
def remove_punctuation(text):
    """Return `text` with every character listed in PUNCT_TO_REMOVE deleted.

    Characters are dropped rather than replaced by a space, so two words that
    were separated only by punctuation end up joined (e.g. "a...b" -> "ab").
    """
    # Three-argument maketrans: the third string lists characters to delete.
    deletion_table = str.maketrans('', '', PUNCT_TO_REMOVE)
    stripped = text.translate(deletion_table)
    return stripped

In [9]:
# Strip the configured punctuation from every summary.
df["Summary"] = df["Summary"].apply(remove_punctuation)

In [10]:
# Strip the configured punctuation from every review body.
df["Text"] = df["Text"].apply(remove_punctuation)

In [11]:
# Preview the two text columns after punctuation removal.
df.loc[:, ["Text", "Summary"]].head()

Unnamed: 0,Text,Summary
0,i have bought several of the vitality canned dog food products and have found them all to be of good quality the product looks more like a stew than a processed meat and it smells better my labrador is finicky and she appreciates this product better than most,good quality dog food
1,product arrived labeled as jumbo salted peanutsthe peanuts were actually small sized unsalted not sure if this was an error or if the vendor intended to represent the product as jumbo,not as advertised
2,this is a confection that has been around a few centuries it is a light pillowy citrus gelatin with nuts in this case filberts and it is cut into tiny squares and then liberally coated with powdered sugar and it is a tiny mouthful of heaven not too chewy and very flavorful i highly recommend this yummy treat if you are familiar with the story of cs lewis the lion the witch and the wardrobe this is the treat that seduces edmund into selling out his brother and sisters to the witch,delight says it all
3,if you are looking for the secret ingredient in robitussin i believe i have found it i got this in addition to the root beer extract i ordered (which was good) and made some cherry soda the flavor is very medicinal,cough medicine
4,great taffy at a great price there was a wide assortment of yummy taffy delivery was very quick if your a taffy lover this is a deal,great taffy


## Removal of Stopwords

In [12]:
# NLTK's English stopword list as a set, for O(1) membership checks per token.
STOPWORDS = {*stopwords.words('english')}

In [13]:
def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in STOPWORDS dropped.

    The input is coerced with str() first; surviving tokens are re-joined with
    single spaces, so runs of whitespace in the input are collapsed.
    """
    kept = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

In [14]:
# Drop stopwords from every summary.
df["Summary"] = df["Summary"].apply(remove_stopwords)

In [15]:
# Drop stopwords from every review body.
df["Text"] = df["Text"].apply(remove_stopwords)

In [16]:
# Preview the two text columns after stopword removal.
df.loc[:, ["Text", "Summary"]].head()

Unnamed: 0,Text,Summary
0,bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better,good quality dog food
1,product arrived labeled jumbo salted peanutsthe peanuts actually small sized unsalted sure error vendor intended represent product jumbo,advertised
2,confection around centuries light pillowy citrus gelatin nuts case filberts cut tiny squares liberally coated powdered sugar tiny mouthful heaven chewy flavorful highly recommend yummy treat familiar story cs lewis lion witch wardrobe treat seduces edmund selling brother sisters witch,delight says
3,looking secret ingredient robitussin believe found got addition root beer extract ordered (which good) made cherry soda flavor medicinal,cough medicine
4,great taffy great price wide assortment yummy taffy delivery quick taffy lover deal,great taffy


## Remove Frequent Words
Remove the most frequent words, which carry little information for us. We perform this operation only on the Text field and not on Summary; see the unigram analysis of Summary for the reason.

In [17]:
# Corpus-wide token frequencies over the (already cleaned) review bodies.
# Tokens are counted in reading order, one review at a time.
cnt = Counter()
for review in tqdm(df["Text"].values):
    cnt.update(review.split())

  0%|          | 0/395003 [00:00<?, ?it/s]

In [18]:
# The ten most frequent tokens in Text; remove_freqwords filters these out.
FREQWORDS = {word for word, _count in cnt.most_common(10)}

In [19]:
# Inspect the 20 most frequent tokens with their counts.
cnt.most_common(n=20)

[('br', 180959),
 ('like', 168900),
 ('good', 131203),
 ('taste', 114701),
 ('one', 114093),
 ('great', 107631),
 ('product', 102439),
 ('flavor', 94857),
 ('coffee', 93615),
 ('tea', 88643),
 ('would', 84407),
 ('love', 83301),
 ('get', 73745),
 ('really', 68907),
 ('food', 65267),
 ('dont', 64795),
 ('much', 63132),
 ('use', 61472),
 ('also', 58715),
 ('little', 57437)]

In [20]:
def remove_freqwords(text):
    """Return `text` without any whitespace-separated token that is in FREQWORDS.

    The input is coerced with str(); surviving tokens are joined by single spaces.
    """
    tokens = str(text).split()
    return " ".join(filter(lambda token: token not in FREQWORDS, tokens))

In [21]:
# Drop the most frequent tokens from the review bodies only (Summary is left as is).
df["Text"] = df["Text"].apply(remove_freqwords)

**On second thought, we might try different versions with & without removing frequent words**

## Remove Rare Words
Same as above but for the other end of the spectrum

In [22]:
# Inspect the 15 least frequent tokens, rarest-ranked first (reverse slice of the tail).
cnt.most_common()[:-16:-1]

[('atmy', 1),
 ('contamination!!', 1),
 ('lofted', 1),
 ('daintiesbr', 1),
 ('cornbelt', 1),
 ('hrefhttp:wwwamazoncomgpproductb004by23i8pacific', 1),
 ('grasssaver', 1),
 ('ripoff!!!!!', 1),
 ('1320z', 1),
 ('fowlers!!', 1),
 ('mixbuy', 1),
 ('smoothheavy', 1),
 ('distinctivelydelicious', 1),
 ('howeveronce', 1),
 ('brevard', 1)]

In [23]:
n_rare_words = 200  # tune this
# Take the last `n_rare_words` entries of the frequency ranking, walking backwards
# from the end; remove_rarewords filters these tokens out.
RAREWORDS = {word for word, _count in cnt.most_common()[:-(n_rare_words + 1):-1]}

In [24]:
def remove_rarewords(text):
    """Return `text` without any whitespace-separated token that is in RAREWORDS.

    The input is coerced with str(); surviving tokens are joined by single spaces.
    """
    tokens = str(text).split()
    return " ".join(token for token in tokens if token not in RAREWORDS)

In [25]:
# Drop the selected rare tokens from the review bodies only.
df["Text"] = df["Text"].apply(remove_rarewords)

In [26]:
# Preview the two text columns after frequent- and rare-word removal.
df.loc[:, ["Text", "Summary"]].head()

Unnamed: 0,Text,Summary
0,bought several vitality canned dog food products found quality looks stew processed meat smells better labrador finicky appreciates better,good quality dog food
1,arrived labeled jumbo salted peanutsthe peanuts actually small sized unsalted sure error vendor intended represent jumbo,advertised
2,confection around centuries light pillowy citrus gelatin nuts case filberts cut tiny squares liberally coated powdered sugar tiny mouthful heaven chewy flavorful highly recommend yummy treat familiar story cs lewis lion witch wardrobe treat seduces edmund selling brother sisters witch,delight says
3,looking secret ingredient robitussin believe found got addition root beer extract ordered (which good) made cherry soda medicinal,cough medicine
4,taffy price wide assortment yummy taffy delivery quick taffy lover deal,great taffy


## Lemmatization
Lemmatization is similar to stemming in that it reduces inflected words to a base form, but it differs in that it makes sure the resulting root word (also called the lemma) belongs to the language.

In [27]:
# Single shared WordNetLemmatizer instance, used by lemmatize_words for every token.
lemmatizer = WordNetLemmatizer()

In [29]:
def lemmatize_words(text):
    """Return `text` with each whitespace-separated token replaced by its lemma.

    Only the word itself is passed to `lemmatizer.lemmatize`, so the lemmatizer's
    default part-of-speech setting applies to every token. Tokens are re-joined
    with single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

In [30]:
# Lemmatize every summary token by token.
df["Summary"] = df["Summary"].apply(lemmatize_words)

In [31]:
# Lemmatize every review body token by token.
df["Text"] = df["Text"].apply(lemmatize_words)

In [32]:
# Preview the two text columns after lemmatization.
df.loc[:, ["Text", "Summary"]].head()

Unnamed: 0,Text,Summary
0,bought several vitality canned dog food product found quality look stew processed meat smell better labrador finicky appreciates better,good quality dog food
1,arrived labeled jumbo salted peanutsthe peanut actually small sized unsalted sure error vendor intended represent jumbo,advertised
2,confection around century light pillowy citrus gelatin nut case filbert cut tiny square liberally coated powdered sugar tiny mouthful heaven chewy flavorful highly recommend yummy treat familiar story c lewis lion witch wardrobe treat seduces edmund selling brother sister witch,delight say
3,looking secret ingredient robitussin believe found got addition root beer extract ordered (which good) made cherry soda medicinal,cough medicine
4,taffy price wide assortment yummy taffy delivery quick taffy lover deal,great taffy


**Takes too long to run**

## Removal of URLs
There are a few URLs in the dataset, which by this point look like `hrefhttp:wwwamazoncomgpproductb004by23i8pacific` (punctuation has already been stripped). They don't add any value, so let's remove them.

Summary won't contain any URLs, as summaries are short (as observed in EDA).

In [33]:
# Four alternatives: http(s):// URLs and bare "www." URLs, each with either a
# multi-character hyphen-capable host label or a plain alphanumeric one. Every
# alternative requires a literal '.' followed by at least two non-whitespace
# characters, and the trailing [^\s]{2,} consumes up to the next whitespace.
# NOTE(review): PUNCT_TO_REMOVE contains '.' and '/', and Text has already gone
# through remove_punctuation, so at this stage the pattern presumably cannot
# match anything — consider running URL removal before punctuation removal.
url_pattern = re.compile(r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})')

In [34]:
def remove_urls(text):
    """Return `text` with every substring matched by `url_pattern` deleted."""
    cleaned = re.sub(url_pattern, r'', text)
    return cleaned

In [35]:
# Delete URL matches from the review bodies.
# NOTE(review): remove_punctuation has already stripped '.' and '/' from Text,
# and url_pattern requires a literal '.', so this pass is presumably a no-op
# here; it would need to run before punctuation removal to take effect.
df["Text"] = df["Text"].apply(remove_urls)

**Takes too long to run**

## Converting Scores to Class

Converting scores 1-5 to binary classes - Positive & Negative
* Score 1-3: Negative
* Score 4,5: Positive

In [None]:
# Collapse the 1-5 star score into a binary sentiment label:
#   1, 2, 3 -> "Negative"    4, 5 -> "Positive"
# Rows whose Score is outside 1-5 are not assigned by either statement.
# Fix: the first assignment previously wrote "Positive" as well, so every
# review ended up in the same class.
df.loc[df["Score"].isin([1, 2, 3]), "Class"] = "Negative"
df.loc[df["Score"].isin([4, 5]), "Class"] = "Positive"