In [1]:
# Import modules
import pandas as pd
import numpy as np
import nltk
import re

## Load our data

In [2]:
dataset = pd.read_csv('../raw_data/Reviews.csv')

In [3]:
dataset.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


Selecting only the columns that contains the text

In [4]:
data = dataset[["Text"]].copy()

In [5]:
data.head(5)

Unnamed: 0,Text
0,I have bought several of the Vitality canned d...
1,Product arrived labeled as Jumbo Salted Peanut...
2,This is a confection that has been around a fe...
3,If you are looking for the secret ingredient i...
4,Great taffy at a great price. There was a wid...


In [6]:
data.isnull().sum()

Text    0
dtype: int64

In [7]:
data.shape

(568454, 1)

In [8]:
data = data.drop_duplicates()

## Creating cleaning function

In [9]:
data.sample(5)

Unnamed: 0,Text
532824,Just received my Lobster Corn Chowder. Was ver...
273637,never had smoked pepper before- it will be use...
172018,The only thing I don't like is having to buy t...
389366,My little Cavalier has just finished eating th...
126409,I haven't done a side by side taste test but I...


In [10]:
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 

In [13]:
def clean_reviews(review):
    cleantext = BeautifulSoup(review, "lxml").text
    # 2. Retaining only alphabets.
    review_text = re.sub("[^a-zA-Z]"," ",cleantext)
    # 3. Converting to lower case and splittingç
    word_tokens= review_text.lower().split()
    # 4. Remove stopwords
    le=WordNetLemmatizer()
    stop_words= set(stopwords.words("english"))     
    word_tokens= [le.lemmatize(w) for w in word_tokens if not w in stop_words]
    cleaned_review=" ".join(word_tokens)
    return cleaned_review

In [14]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences=[]
sum=0
for review in data['Text']:
  sents =tokenizer.tokenize(review.strip())
  sum+=len(sents)
  for sent in sents:
    cleaned_sent=clean_reviews(sent)
    sentences.append(cleaned_sent) # can use word_tokenize also.
print(sum)
print(len(sentences))  # total no of sentences



1955249
1955249


In [18]:
clean_sentence = []
for sentence in sentences:
    clean_sentence.append(sentence.split(" "))
clean_sentence

[['bought',
  'several',
  'vitality',
  'canned',
  'dog',
  'food',
  'product',
  'found',
  'good',
  'quality'],
 ['product', 'look', 'like', 'stew', 'processed', 'meat', 'smell', 'better'],
 ['labrador', 'finicky', 'appreciates', 'product', 'better'],
 ['product',
  'arrived',
  'labeled',
  'jumbo',
  'salted',
  'peanut',
  'peanut',
  'actually',
  'small',
  'sized',
  'unsalted'],
 ['sure', 'error', 'vendor', 'intended', 'represent', 'product', 'jumbo'],
 ['confection', 'around', 'century'],
 ['light', 'pillowy', 'citrus', 'gelatin', 'nut', 'case', 'filbert'],
 ['cut', 'tiny', 'square', 'liberally', 'coated', 'powdered', 'sugar'],
 ['tiny', 'mouthful', 'heaven'],
 ['chewy', 'flavorful'],
 ['highly', 'recommend', 'yummy', 'treat'],
 ['familiar', 'story', 'c'],
 ['lewis',
  'lion',
  'witch',
  'wardrobe',
  'treat',
  'seduces',
  'edmund',
  'selling',
  'brother',
  'sister',
  'witch'],
 ['looking', 'secret', 'ingredient', 'robitussin', 'believe', 'found'],
 ['got',
  'add

In [26]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import pickle

In [27]:
food_bigram_model = Phrases(clean_sentence, min_count=100)
food_bigrams = [food_bigram_model[sent] for sent in clean_sentence]
food_trigram_model = Phrases(food_bigrams, min_count=50)
phrased_food_sentences = [food_trigram_model[sent] for sent in food_bigrams]

In [31]:
with open("b_list_food_preprocessed.txt", "bw") as fp:   #Pickling
    pickle.dump(phrased_food_sentences, fp)