In [1]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#nltk
import nltk

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for part-of-speech tagging
from nltk import pos_tag

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# BeautifulSoup libraray
from bs4 import BeautifulSoup 

import re # regex

#model_selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn.metrics import accuracy_score,roc_auc_score 
from sklearn.metrics import classification_report
from mlxtend.plotting import plot_confusion_matrix

#preprocessing scikit
from sklearn.preprocessing import MinMaxScaler,StandardScaler,Imputer,LabelEncoder

#classifiaction.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
 
#stop-words
stop_words=set(nltk.corpus.stopwords.words('english'))

#keras
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input,CuDNNLSTM,LSTM
from keras.models import Model
from keras.preprocessing.text import text_to_word_sequence
from keras.callbacks import ModelCheckpoint
#gensim w2v
#word2vec
from gensim.models import Word2Vec

Using TensorFlow backend.


In [2]:
df=pd.read_csv('reviews.csv')

In [3]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
df=df[['Text','Score']]

In [5]:
df.head()

Unnamed: 0,Text,Score
0,I have bought several of the Vitality canned d...,5
1,Product arrived labeled as Jumbo Salted Peanut...,1
2,This is a confection that has been around a fe...,4
3,If you are looking for the secret ingredient i...,2
4,Great taffy at a great price. There was a wid...,5


In [6]:
df.rename(columns={'Text':'review','Score':'rating'},inplace=True)

In [7]:
#Let's check if there is null information

In [8]:
df.isnull().sum()

review    0
rating    0
dtype: int64

In [9]:
df.drop_duplicates(subset=['rating','review'],keep='first',inplace=True) 

In [10]:
print(df.shape)

(393675, 2)


In [52]:
pd.set_option('display.width', 1000)

In [54]:
df.head(10)

Unnamed: 0,sentiment,clean
0,1,bought several vitality canned dog food produc...
1,0,product arrived labeled jumbo salted peanut pe...
2,1,confection around century light pillowy citrus...
3,0,looking secret ingredient robitussin believe f...
4,1,great taffy great price wide assortment yummy ...
5,1,got wild hair taffy ordered five pound bag taf...
6,1,saltwater taffy great flavor soft chewy candy ...
7,1,taffy good soft chewy flavor amazing would def...
8,1,right mostly sprouting cat eat grass love rota...
9,1,healthy dog food good digestion also good smal...


In [57]:
for review in df['review'][2:][:3]:
    print(review+'\n'+'\n')

KeyError: 'review'

In [13]:
df['sentiment']=[1 if (x>3) else 0 for x in df['rating']]

In [14]:
df.drop(columns='rating',inplace=True)

In [15]:
df.sample(10)

Unnamed: 0,review,sentiment
259888,My dog LOVES these cookies! She got so bored ...,1
292136,Both my 2 grandsons and I are enjoying this pr...,1
288772,I recieved the product quickly and very good s...,1
551196,After moving to the US from England 22 years a...,1
458796,I've tried various brands of ginger to add to ...,1
422823,Ditto from first review. Love the product and...,0
218694,At first I was concerned that the product was ...,0
89500,These are great juice boxes. First: the Orange...,1
505165,I love this popcorn. It has such a good taste...,1
218275,I used this as an experiment to make potato so...,1


In [16]:
df['sentiment'].value_counts(normalize=True)*100

1    77.937131
0    22.062869
Name: sentiment, dtype: float64

as we can observe the data is skewed towards the positive reviews (with out criteria)

In [17]:
# function to clean and pre-process the text.
def clean_reviews(review):  
    
    # 1. Removing html tags
    review_text = BeautifulSoup(review,"lxml").get_text()
    
    # 2. Retaining only alphabets.
    review_text = re.sub("[^a-zA-Z]"," ",review_text)
    
    # 3. Converting to lower case and splitting
    word_tokens= review_text.lower().split()
    
    # 4. Remove stopwords
    le=WordNetLemmatizer()
    stop_words= set(stopwords.words("english"))     
    word_tokens= [le.lemmatize(w) for w in word_tokens if not w in stop_words]
    
    cleaned_review=" ".join(word_tokens)
    return cleaned_review

In [18]:
df['clean']=df['review'].map(clean_reviews)

In [19]:
df.sample(20)

Unnamed: 0,review,sentiment,clean
206449,These are a must for fans of the perc. They wo...,1,must fan perc work perfectly die hard percolat...
302900,"Buy it! Good with ''rice dumplings and ba wan,...",1,buy good rice dumpling ba wan spicy sweet buy ...
467695,I had this for lunch today and loved it. My o...,1,lunch today loved complaint cooking found litt...
419038,Ordered these little jewels for the first time...,1,ordered little jewel first time pleasantly sur...
564092,"I'm sure everyone hears this a lot, but this i...",1,sure everyone hears lot best coffee ever taste...
104754,Do not buy this product! Absolutely DISGUSTIN...,0,buy product absolutely disgusting coffee yes i...
432845,The product was well packed and received in go...,1,product well packed received good time smooth ...
265992,I like this product which was introduced to me...,1,like product introduced hospital drink often t...
31862,Our dog LOVES this! we buy the pumpkin or the ...,1,dog love buy pumpkin butternut squash regular ...
44860,We like this pasta sauce more than other (like...,1,like pasta sauce like prego market sauce thick...


In [20]:
df.drop(columns='review',inplace=True)

In [21]:
df.sample(10)

Unnamed: 0,sentiment,clean
437721,1,find serious negative drink mix sweet sweet ta...
387158,1,pleasantly surprised stuff taste better natura...
340550,0,like sprout brand much say daughter sensitive ...
407326,0,opened package smelled really bad kid even tas...
451250,1,began using nutiva coconut oil recommendation ...
534586,1,lightning fast delivery nice coffee reasonably...
466661,0,good little pricey bought one found much cheap...
276127,1,like smokey sweet molasses based bbq close hea...
530240,1,cannot stop eating thing first bit underwhelme...
420865,1,love green tea wonderful brown rice flavor mak...


In [22]:
import gensim
# # load Google's pre-trained Word2Vec model.
pre_w2v_model = gensim.models.KeyedVectors.load_word2vec_format(r'GoogleNews-vectors-negative300.bin', binary=True)

In [23]:
vocab=pre_w2v_model.wv.vocab


In [24]:
word_vec_dict={}
for word in vocab:
  word_vec_dict[word]=pre_w2v_model.wv.get_vector(word)
print("The no of key-value pairs : ",len(word_vec_dict))

The no of key-value pairs :  3000000


We need to find the maximum lenght of any document or review in our case. WE will pad all reviews to have this same length.This will be required by Keras embedding layer. Must check this kernel on Kaggle for a wonderful explanation of keras embedding layer

In [25]:
maxi=-1
for i,rev in enumerate(df['clean']):
  tokens=rev.split()
  if(len(tokens)>maxi):
    maxi=len(tokens)
print(maxi)

1956


Now we integer encode the words in the reviews using Keras tokenizer.

Note that there two important variables: which are the vocab_size which is the total no of unique words while the second is 

max_doc_len which is the length of every document after padding. Both of these are required by the Keras embedding layer.

In [26]:
tok = Tokenizer()
tok.fit_on_texts(df['clean'])
vocab_size = len(tok.word_index) + 1
encd_rev = tok.texts_to_sequences(df['clean'])

In [27]:
max_rev_len=1956  # max lenght of a review
vocab_size = len(tok.word_index) + 1  # total no of words
embed_dim=300 # embedding dimension as choosen in word2vec constructor

In [28]:
# now padding to have a amximum length of 1956
pad_rev= pad_sequences(encd_rev, maxlen=max_rev_len, padding='post')
pad_rev.shape   # note that we had 100K reviews and we have padded each review to have  a lenght of 1956 words.

(393675, 1956)

Now we need to pass the w2v word embeddings to the embedding layer in Keras. 

For this we will create the embedding matrix and pass it as 'embedding_initializer' parameter to the layer.

The embedding matrix will be of dimensions (vocab_size,embed_dim) where the word_index of each word from keras tokenizer is its index into the matrix and the corressponding entry is its w2v vector ;)

Note that there may be words which will not be present in embeddings learnt by the w2v model. The embedding matrix entry corressponding to those words will be a vector of all zeros.

*Also note that if u are thinkng why won't a word be present then it is bcoz now we have learnt on out own corpus but if we use pre-trained embedding then it may happen that some words specific to our dataset aren't present then in those cases we may use a fixed vector of zeros to denote all those words that earen;t present in th pre-trained embeddings. 

Also note that it may also happen that some words are not present ifu have filtered some words by setting min_count in w2v constructor. *

In [29]:
# now creating the embedding matrix
embed_matrix=np.zeros(shape=(vocab_size,embed_dim))
for word,i in tok.word_index.items():
  embed_vector=word_vec_dict.get(word)
  if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
    embed_matrix[i]=embed_vector
  # if word is not found then embed_vector corressponding to that vector will stay zero.

In [34]:
# prepare train and val sets first
Y=keras.utils.to_categorical(df['sentiment'])  # one hot target as required by NN.
x_train,x_test,y_train,y_test=train_test_split(pad_rev,Y,test_size=0.20,random_state=42)

# Create the Model

In [35]:
from keras.initializers import Constant
from keras.layers import ReLU
from keras.layers import Dropout
model=Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_rev_len,embeddings_initializer=Constant(embed_matrix)))
# model.add(CuDNNLSTM(64,return_sequences=False)) # loss stucks at about 
model.add(Flatten())
model.add(Dense(16,activation='relu'))
model.add(Dropout(0.50))
# model.add(Dense(16,activation='relu'))
# model.add(Dropout(0.20))
model.add(Dense(2,activation='sigmoid'))  # sigmod for bin. classification.

In [36]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1956, 300)         34912200  
_________________________________________________________________
flatten_1 (Flatten)          (None, 586800)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                9388816   
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
Total params: 44,301,050
Trainable params: 44,301,050
Non-trainable params: 0
_________________________________________________________________


In [37]:
# compile the model
model.compile(optimizer=keras.optimizers.RMSprop(lr=1e-3),loss='binary_crossentropy',metrics=['accuracy'])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [38]:
# guardo los checkpoints
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [39]:
# specify batch size and epocj=hs for training.
epochs=5
batch_size=64

In [40]:
model.fit(x_train,y_train,epochs=epochs,batch_size=batch_size,validation_data=(x_test,y_test),callbacks=callbacks_list)

Train on 314940 samples, validate on 78735 samples
Epoch 1/5

Epoch 00001: loss improved from inf to 0.39371, saving model to weights-improvement-01-0.3937.hdf5


MemoryError: 

IndexError: list index out of range

In [None]:
#guardamos el modelo 
!pip install h5py


In [None]:
model.save("model_lstm.h5")
print("model saved")

In [None]:
#preparar un simil pipeline para probar nuevas oraciones

In [49]:
def make_prediction(txt,max_rev_len=1956):
    txt=pd.Series(txt)
    txt=txt.map(clean_reviews)
    encd_rev = tok.texts_to_sequences(txt)
    pad_rev= pad_sequences(encd_rev, maxlen=max_rev_len, padding='post')
    return model.predict(pad_rev,verbose=1)
    

In [69]:
txt_test_1='Had dinner with girl friends. Menu is perfect, something for everyone. Service was awesome and Jason was very accommodating. Will be back definitely!'
txt_test_2='Spent 3 nights at this resort earlier this month and it was a great stay! We paid for an upgrade to the oceanfront cabana, which was worth the price. The beach views, calm ocean and ability to watch the sunset on our patio were highlights of the trip! The hotel restaurant was exceptional, not only because every meal we enjoyed had a variety of options and was flavorful but also because the prices were so reasonable for an oceanfront resort! The entire island and resort were a lot more laid back and calm than other touristy coastal front towns in the southeast, but that may in part due to it being early March. I had imagine there is a bit more traffic during high seasons, but either way, we will definitely be back! Highly recommended!'
txt_test_3='The food quality is very very bad had order some soup it was so terrible could eat more than a spoonful. They need to change the chef at the earliest. The service and ambiance is okay.'
txt_test_4='Poor quality service.We had to wait a good 30 minutes before someone noticed us and the restaurant was practically empty at that Time . The food was mediocre too.Never recommending this to anybody.'

In [74]:
print(txt_test_1+'\n'+'\n')
print("----------------------------------------------------------------------------------------------------------------------")
print(make_prediction(txt_test_1))
print(txt_test_2+'\n'+'\n')
print("----------------------------------------------------------------------------------------------------------------------")
print(make_prediction(txt_test_2))
print(txt_test_3+'\n'+'\n')
print("----------------------------------------------------------------------------------------------------------------------")
print(make_prediction(txt_test_3))
print(txt_test_4+'\n'+'\n')
print("----------------------------------------------------------------------------------------------------------------------")
print(make_prediction(txt_test_4))

Had dinner with girl friends. Menu is perfect, something for everyone. Service was awesome and Jason was very accommodating. Will be back definitely!


----------------------------------------------------------------------------------------------------------------------
[[0.00962415 0.98971176]]
Spent 3 nights at this resort earlier this month and it was a great stay! We paid for an upgrade to the oceanfront cabana, which was worth the price. The beach views, calm ocean and ability to watch the sunset on our patio were highlights of the trip! The hotel restaurant was exceptional, not only because every meal we enjoyed had a variety of options and was flavorful but also because the prices were so reasonable for an oceanfront resort! The entire island and resort were a lot more laid back and calm than other touristy coastal front towns in the southeast, but that may in part due to it being early March. I had imagine there is a bit more traffic during high seasons, but either way, we will