In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from nltk import download
from nltk import word_tokenize

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from keras import layers


In [3]:
download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
df = pd.read_pickle('/content/drive/My Drive/dataset/yelp_2013_balanced.pkl')

In [5]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,text,stars,years
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,344103,It's a shame that my family has been frequenti...,1,2013
1,3495320,My general opinion of writing Yelp reviews is ...,1,2013
1,3500592,made reservations on thursday night for saturd...,1,2013
1,3976123,"It was a night I hope to forget, but I'll stil...",1,2013
1,796941,Trainwreck is appropriately named based on tod...,1,2013
...,...,...,...,...
5,1597611,Had a wonderful brunch with girlfriends today!...,5,2013
5,35945,"Newer is always better, but not in this case. ...",5,2013
5,716138,I love this place! The prices are reasonable a...,5,2013
5,3190993,"I ordered a whole tomato pie, a whole chicken ...",5,2013


In [6]:
data = df['text'].map(word_tokenize).values
total_vocabulary = set(word.lower() for review in data for word in review)  # set created from nested comprehension
print('There are {} unique words in the dataset.'.format(len(total_vocabulary)))
print('There are {} unique tweets in the dataset.'.format(len(data)))


df.head()

There are 33862 unique words in the dataset.
There are 12500 unique tweets in the dataset.


Unnamed: 0_level_0,Unnamed: 1_level_0,text,stars,years
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,344103,It's a shame that my family has been frequenti...,1,2013
1,3495320,My general opinion of writing Yelp reviews is ...,1,2013
1,3500592,made reservations on thursday night for saturd...,1,2013
1,3976123,"It was a night I hope to forget, but I'll stil...",1,2013
1,796941,Trainwreck is appropriately named based on tod...,1,2013


In [7]:
from keras.utils import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D, CuDNNLSTM
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers, optimizers
from keras.preprocessing import text, sequence

In [8]:
# set the emotion/sentiment as our target
target = df['stars']

In [9]:
target

stars         
1      344103     1
       3495320    1
       3500592    1
       3976123    1
       796941     1
                 ..
5      1597611    5
       35945      5
       716138     5
       3190993    5
       767765     5
Name: stars, Length: 12500, dtype: int64

In [10]:
# use one hot encoding since our target is categorical
y = pd.get_dummies(target).values
#y = target


In [11]:
y

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1]], dtype=uint8)

In [12]:
# use keras to create a Tokenizer object
tokenizer = text.Tokenizer(num_words=40000)  # limit to the num_words most important ones
tokenizer.fit_on_texts(list(df['text']))
tokenized_texts = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(tokenized_texts, maxlen=200)

In [13]:
X

array([[    0,     0,     0, ...,     3,  9139, 16819],
       [   67,   209,   144, ...,  1302,  3053,  3578],
       [   12,   517,     7, ...,    52,  1815,  9140],
       ...,
       [    0,     0,     0, ...,   232,   235,  3131],
       [    0,     0,     0, ...,     5,   152,  1745],
       [    0,     0,     0, ...,   685,     2,   156]], dtype=int32)

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# create my NN model
model = Sequential()

embedding_size = 128
model.add(Embedding(len(total_vocabulary), embedding_size))
model.add(Bidirectional(CuDNNLSTM(128, return_sequences=True)))
model.add(LSTM(128, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dense(5, activation='softmax'))  # use 5 because we have 5 categories
opt = optimizers.Adam(learning_rate=0.0025)
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])
model.summary() # check the shape

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         4334336   
                                                                 
 bidirectional (Bidirectiona  (None, None, 256)        264192    
 l)                                                              
                                                                 
 lstm (LSTM)                 (None, None, 128)         197120    
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 50)                6

In [15]:
model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x785b98147640>

In [16]:
from sklearn.metrics import classification_report,accuracy_score
y_pred = model.predict(X_test) # get our predictions
y_pred = np.argmax(y_pred, axis=1)
y_test = np.argmax(y_test, axis=1)
acc = accuracy_score(y_test, y_pred)



In [17]:
print(f"accuracy: {acc}")

accuracy: 0.636


In [18]:
from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(y_test, y_pred, squared=False)
print("rmse:", rmse)

rmse: 0.6752026804976302


In [19]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)
print("mae:", mae)

mae: 0.4248
