In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from nltk import download
from nltk import word_tokenize

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from keras import layers


In [3]:
download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
df = pd.read_pickle('/content/drive/My Drive/dataset/yelp_2014_balanced.pkl')

In [5]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,text,stars,years
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1050652,Awful service. I have never been treated so po...,1,2013
1,3059283,My wife and I chose your restaurant for our 22...,1,2013
1,2527572,"Stupid Starbucks, won't sell the food in the d...",1,2013
1,3974818,"I hate giving any restaurant one star, maybe I...",1,2013
1,3464251,I called in an order and was told they were ve...,1,2013
...,...,...,...,...
5,4035649,I gotta admit I am a Southwest Market newbie. ...,5,2013
5,1503769,A perfect restaurant for those who truly appre...,5,2013
5,2069423,A true GEM in St.Albert!!\n\nWe went there for...,5,2013
5,181240,"The food here is great, the subs don't *quite*...",5,2013


In [6]:
data = df['text'].map(word_tokenize).values
total_vocabulary = set(word.lower() for review in data for word in review)  # set created from nested comprehension
print('There are {} unique words in the dataset.'.format(len(total_vocabulary)))
print('There are {} unique tweets in the dataset.'.format(len(data)))


df.head()

There are 33799 unique words in the dataset.
There are 12500 unique tweets in the dataset.


Unnamed: 0_level_0,Unnamed: 1_level_0,text,stars,years
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1050652,Awful service. I have never been treated so po...,1,2013
1,3059283,My wife and I chose your restaurant for our 22...,1,2013
1,2527572,"Stupid Starbucks, won't sell the food in the d...",1,2013
1,3974818,"I hate giving any restaurant one star, maybe I...",1,2013
1,3464251,I called in an order and was told they were ve...,1,2013


In [7]:
from keras.utils import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D, CuDNNLSTM
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers, optimizers
from keras.preprocessing import text, sequence

In [8]:
# set the emotion/sentiment as our target
target = df['stars']

In [9]:
target

stars         
1      1050652    1
       3059283    1
       2527572    1
       3974818    1
       3464251    1
                 ..
5      4035649    5
       1503769    5
       2069423    5
       181240     5
       3992967    5
Name: stars, Length: 12500, dtype: int64

In [10]:
# use one hot encoding since our target is categorical
y = pd.get_dummies(target).values
#y = target


In [11]:
y

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1]], dtype=uint8)

In [12]:
# use keras to create a Tokenizer object
tokenizer = text.Tokenizer(num_words=40000)  # limit to the num_words most important ones
tokenizer.fit_on_texts(list(df['text']))
tokenized_texts = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(tokenized_texts, maxlen=200)

In [13]:
X

array([[    0,     0,     0, ...,   405,    39,  1488],
       [ 1099,  1311,     5, ...,    27,   280,   115],
       [    0,     0,     0, ...,  1432,  6853,   389],
       ...,
       [    0,     0,     0, ...,    16,    17,    29],
       [    0,     0,     0, ...,     1,  2979, 16629],
       [    0,     0,     0, ...,    11,   626,   990]], dtype=int32)

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# create my NN model
model = Sequential()

embedding_size = 128
model.add(Embedding(len(total_vocabulary), embedding_size))
model.add(Bidirectional(CuDNNLSTM(128, return_sequences=True)))
model.add(LSTM(128, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dense(5, activation='softmax'))  # use 5 because we have 5 categories
opt = optimizers.Adam(learning_rate=0.0025)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])
model.summary() # check the shape

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         4326272   
                                                                 
 bidirectional (Bidirectiona  (None, None, 256)        264192    
 l)                                                              
                                                                 
 lstm (LSTM)                 (None, None, 128)         197120    
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 50)                6

In [15]:
model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7b83880e2110>

In [16]:
from sklearn.metrics import classification_report,accuracy_score
y_pred = model.predict(X_test) # get our predictions
y_pred = np.argmax(y_pred, axis=1)
y_test = np.argmax(y_test, axis=1)
acc = accuracy_score(y_test, y_pred)



In [17]:
print(f"accuracy: {acc}")

accuracy: 0.6232


In [18]:
from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(y_test, y_pred, squared=False)
print("rmse:", rmse)

rmse: 0.6784373932405988


In [19]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)
print("mae:", mae)

mae: 0.425
