# Assignment 3
Goal: classify [Yelp reviews](https://www.yelp.com/dataset) tagged as funny by at least one user. 

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
import pymongo

Using TensorFlow backend.


## Prepare the data 
The yelp data was imported into a collection called 'docs' in a MongoDB database called 'yelp'. We pulled a random sample of 10000.

In [2]:
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['yelp']

In [3]:
num_docs = 80000

In [4]:
yelp_funny = db.docs.aggregate([
    { '$match': {
            'funny': {
                '$exists': True } } }, 
    { '$sample': { 
            'size': num_docs } }, 
    { '$project': {
            '_id': 0, 
            'funny': 1, 
            'text': 1 } }
])

In [5]:
yelp_x = []
yelp_y = []
for review in yelp_funny:
    yelp_x.append(review['text'])
    yelp_y.append(1 if review['funny'] > 0 else 0)

Data is not very balanced. Only about 20% are funny.

In [6]:
num_funny = np.array(yelp_y).sum()
num_funny

16560

Delete some not funny reviews to balance the categories

In [7]:
not_funny_indexes = [i for i, j in enumerate(yelp_y) if j == 0] 

In [8]:
not_funny_to_delete = not_funny_indexes[num_funny:]
len(not_funny_to_delete)

46880

In [9]:
not_funny_to_delete.sort(reverse=True)

In [10]:
for a in not_funny_to_delete:
    del yelp_x[a]
    del yelp_y[a]

In [11]:
print("{} {}".format(len(yelp_x), len(yelp_y)))

33120 33120


Shuffle the lists so that there aren't too many funny ones in test

In [12]:
import random

c = list(zip(yelp_x, yelp_y))
random.shuffle(c)
yelp_x, yelp_y = zip(*c)
yelp_x = list(yelp_x)
yelp_y = list(yelp_y)

In [13]:
from keras.preprocessing.text import Tokenizer

In [14]:
top_words = 10000

In [15]:
t = Tokenizer(top_words)
foo = t.fit_on_texts(yelp_x)

In [16]:
max_length = max([len(s.split()) for s in yelp_x])
max_length

1016

In [17]:
X = t.texts_to_sequences(yelp_x)

In [18]:
X = sequence.pad_sequences(X, maxlen=max_length, padding='post')

In [19]:
split = round(len(X) *.8)
split

26496

In [20]:
X_train = X[:split - 1]
X_test = X[split:]
Y_train = yelp_y[:split -1]
Y_test = yelp_y[split:]

## Part 1b: CNN for sentence classification
Adapted from https://github.com/Theo-/sentiment-analysis-keras-conv/blob/master/train_keras.py and https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [21]:
# Using embedding from Keras
embedding_vecor_length = 300
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_length))

# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, padding='same'))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Convolution1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(180,activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1016, 300)         3000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1016, 64)          57664     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1016, 32)          6176      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1016, 16)          1552      
_________________________________________________________________
flatten_1 (Flatten)          (None, 16256)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16256)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 180)               2926260   
__________

In [22]:
model.fit(X_train, Y_train, epochs=3, callbacks=[tensorBoardCallback], batch_size=64)

# Evaluation on the test set
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 60.70%


## Part 2: RNN for sentence classification
Adapted from https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

Goal: classify [Yelp reviews](https://www.yelp.com/dataset) tagged as funny by at least one user. 

In [23]:
# LSTM with dropout for sequence classification

# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1016, 32)          320000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 373,301
Trainable params: 373,301
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
model.fit(X_train, Y_train, epochs=3, batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 50.18%
