In [2]:
!pip install -q kaggle
!mkdir /root/.kaggle/

mkdir: cannot create directory ‘/root/.kaggle/’: File exists


In [3]:
!cp /content/drive/My\ Drive/Datasets/kaggle.json /root/.kaggle/

In [4]:
# Download the snap/amazon-fine-food-reviews dataset archive with the Kaggle CLI.
# The CLI skips the download when a more recent local copy already exists.
!kaggle datasets download -d snap/amazon-fine-food-reviews

amazon-fine-food-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
!unzip amazon-fine-food-reviews.zip

Archive:  amazon-fine-food-reviews.zip
replace Reviews.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Reviews.csv             
replace database.sqlite? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: database.sqlite         
  inflating: hashes.txt              


# Importing Packages

In [6]:
import nltk
import pandas as pd

# Loading data

In [7]:
# Load the extracted reviews table (Reviews.csv comes from the archive unzipped above).
data = pd.read_csv('Reviews.csv')

In [8]:
# Preview the first rows to check the columns that were loaded.
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# Tokenizing string

In [9]:
# Keep only the free-text review column; every later step works on this Series.
text = data.loc[:, 'Text']

# Quick look at the first five reviews.
text.head(5)

0    I have bought several of the Vitality canned d...
1    Product arrived labeled as Jumbo Salted Peanut...
2    This is a confection that has been around a fe...
3    If you are looking for the secret ingredient i...
4    Great taffy at a great price.  There was a wid...
Name: Text, dtype: object

In [10]:
%%time
from nltk.tokenize import word_tokenize

try:
    tokenized_text = text.apply(word_tokenize)
except:
    nltk.download('punkt')
    tokenized_text = text.apply(word_tokenize)

tokenized_text

CPU times: user 6min 34s, sys: 4.86 s, total: 6min 39s
Wall time: 6min 39s


In [11]:
# Inspect the NLTK tokens of the first review.
tokenized_text.iloc[0]

['I',
 'have',
 'bought',
 'several',
 'of',
 'the',
 'Vitality',
 'canned',
 'dog',
 'food',
 'products',
 'and',
 'have',
 'found',
 'them',
 'all',
 'to',
 'be',
 'of',
 'good',
 'quality',
 '.',
 'The',
 'product',
 'looks',
 'more',
 'like',
 'a',
 'stew',
 'than',
 'a',
 'processed',
 'meat',
 'and',
 'it',
 'smells',
 'better',
 '.',
 'My',
 'Labrador',
 'is',
 'finicky',
 'and',
 'she',
 'appreciates',
 'this',
 'product',
 'better',
 'than',
 'most',
 '.']

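We could have just as easily used `text.iloc[i].split(" ")`, although a plain split leaves punctuation attached to the neighbouring word (e.g. `'quality.'`) instead of emitting it as a separate token.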

In [12]:
%%time
from gensim.utils import simple_preprocess

gensim_tokenized = text.apply(simple_preprocess)

gensim_tokenized

CPU times: user 51.6 s, sys: 3.04 s, total: 54.6 s
Wall time: 54.9 s


In [13]:
# Inspect the gensim tokens of the first review for comparison with NLTK's output.
gensim_tokenized.iloc[0]

['have',
 'bought',
 'several',
 'of',
 'the',
 'vitality',
 'canned',
 'dog',
 'food',
 'products',
 'and',
 'have',
 'found',
 'them',
 'all',
 'to',
 'be',
 'of',
 'good',
 'quality',
 'the',
 'product',
 'looks',
 'more',
 'like',
 'stew',
 'than',
 'processed',
 'meat',
 'and',
 'it',
 'smells',
 'better',
 'my',
 'labrador',
 'is',
 'finicky',
 'and',
 'she',
 'appreciates',
 'this',
 'product',
 'better',
 'than',
 'most']

As you can see, gensim's simple_preprocess() not only tokenizes but also lowercases the text, removes punctuation marks, and drops very short tokens (note that 'I' and 'a' are missing from the output above).

# Removing Stop-words

In [14]:
from nltk.corpus import stopwords

In [15]:
# Fetch NLTK's English stop-word list. NLTK raises LookupError when the corpus
# is not installed; only then download it and retry, so unrelated errors are
# not hidden by a bare `except:`.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stopwords = stopwords.words('english')

print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [16]:
# Check whether the negation word 'not' is treated as a stop-word.
'not' in stopwords.words('english')

True

Note that this also removes words like 'not',<br>
which may be crucial for sentiment analysis<br>
or for building context.

Likewise, there can be other stop-words<br>
that would help in building the necessary context for the given task.<br>

So be wary of that.

In [17]:
# Materialize the English stop-word list once so later cells can reuse it.
stopwords_list = stopwords.words('english')

In [18]:
%%time
func = lambda x: [token for token in x if token not in stopwords_list] 

text_wo_sw = gensim_tokenized.apply(func)

CPU times: user 1min 9s, sys: 566 ms, total: 1min 9s
Wall time: 1min 9s


In [19]:
# Preview the reviews after NLTK stop-word removal.
text_wo_sw.head()

0    [bought, several, vitality, canned, dog, food,...
1    [product, arrived, labeled, jumbo, salted, pea...
2    [confection, around, centuries, light, pillowy...
3    [looking, secret, ingredient, robitussin, beli...
4    [great, taffy, great, price, wide, assortment,...
Name: Text, dtype: object

gensim has an alternative to nltk's stop-word removal...

In [20]:
from gensim.parsing.preprocessing import remove_stopwords

In [21]:
%%time
func = lambda x: simple_preprocess( remove_stopwords( " ".join(x)))

text_wo_sw_gen = gensim_tokenized.apply(func)

CPU times: user 36.2 s, sys: 2.08 s, total: 38.2 s
Wall time: 38.3 s


So remove_stopwords() works on an untokenized string...<br>
therefore, when using this function, apply it before simple_preprocess().

In [22]:
# Inspect the first review after gensim's stop-word removal.
text_wo_sw_gen.iloc[0]

['bought',
 'vitality',
 'canned',
 'dog',
 'food',
 'products',
 'good',
 'quality',
 'product',
 'looks',
 'like',
 'stew',
 'processed',
 'meat',
 'smells',
 'better',
 'labrador',
 'finicky',
 'appreciates',
 'product',
 'better']

# Stemming or Lemmatization

In [23]:
from nltk.stem.porter import PorterStemmer

In [24]:
%%time
stemmer = PorterStemmer()

func = lambda x: [stemmer.stem(token) for token in x]

stemmed_text = text_wo_sw.apply(func)

CPU times: user 7min 6s, sys: 3.54 s, total: 7min 9s
Wall time: 7min 10s


In [25]:
# Preview the stemmed reviews.
stemmed_text.head()

0    [bought, sever, vital, can, dog, food, product...
1    [product, arriv, label, jumbo, salt, peanut, p...
2    [confect, around, centuri, light, pillowi, cit...
3    [look, secret, ingredi, robitussin, believ, fo...
4    [great, taffi, great, price, wide, assort, yum...
Name: Text, dtype: object

Although 'vitality' turned into 'vital',<br>
and 'canned' turned into 'can',<br>
some words got messed up:<br>
'taffy' turned into 'taffi', 'arrived' turned into 'arriv'.

So stemming works reasonably well, though not quite well enough.

In [26]:
from gensim.parsing.porter import PorterStemmer

In [27]:
%%time
func = lambda x: [PorterStemmer().stem(token) for token in x]

stemmed_text_gensim = text_wo_sw.apply(func)

CPU times: user 2min 37s, sys: 4.38 s, total: 2min 41s
Wall time: 2min 42s


In [28]:
# Preview the gensim-stemmed reviews for comparison with the NLTK stems.
stemmed_text_gensim.head()

0    [bought, sever, vital, can, dog, food, product...
1    [product, arriv, label, jumbo, salt, peanut, p...
2    [confect, around, centuri, light, pillowi, cit...
3    [look, secret, ingredi, robitussin, believ, fo...
4    [great, taffi, great, price, wide, assort, yum...
Name: Text, dtype: object

In [39]:
from nltk.stem import WordNetLemmatizer

In [40]:
%%time
lemmatizer = WordNetLemmatizer()

func = lambda x: [lemmatizer.lemmatize(token) for token in x]
try:
    lemma_text = text_wo_sw.apply(func)
except:
    nltk.download('wordnet')
    lemma_text = text_wo_sw.apply(func)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
CPU times: user 1min 35s, sys: 1.14 s, total: 1min 36s
Wall time: 1min 37s


In [41]:
# Preview the lemmatized reviews.
lemma_text.head()

0    [bought, several, vitality, canned, dog, food,...
1    [product, arrived, labeled, jumbo, salted, pea...
2    [confection, around, century, light, pillowy, ...
3    [looking, secret, ingredient, robitussin, beli...
4    [great, taffy, great, price, wide, assortment,...
Name: Text, dtype: object

Lemmatizing gives better results than stemming: the output tokens remain real words.