# Assignment 3
Goal: classify [Yelp reviews](https://www.yelp.com/dataset) tagged as funny by at least one user. 

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
import pymongo

Using TensorFlow backend.


## Prepare the data 
The yelp data was imported into a collection called 'reviews' in a MongoDB database called 'yelp'. We pulled a random sample of 80000.

In [2]:
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['yelp']

In [3]:
num_docs = 80000

In [4]:
yelp_funny = db.reviews.aggregate([
    { '$match': {
            'funny': {
                '$exists': True } } }, 
    { '$sample': { 
            'size': num_docs } }, 
    { '$project': {
            '_id': 0, 
            'funny': 1, 
            'text': 1 } }
])

In [5]:
yelp_x = []
yelp_y = []
for review in yelp_funny:
    yelp_x.append(review['text'])
    yelp_y.append(1 if review['funny'] > 0 else 0)

Data is not very balanced. Only about 20% are funny.

In [6]:
num_funny = np.array(yelp_y).sum()
num_funny

16587

Delete some not funny reviews to balance the categories

In [7]:
not_funny_indexes = [i for i, j in enumerate(yelp_y) if j == 0] 

In [8]:
not_funny_to_delete = not_funny_indexes[num_funny:]
len(not_funny_to_delete)

46826

In [9]:
not_funny_to_delete.sort(reverse=True)

In [10]:
for a in not_funny_to_delete:
    del yelp_x[a]
    del yelp_y[a]

In [11]:
print("{} {}".format(len(yelp_x), len(yelp_y)))

33174 33174


Shuffle the lists so that there aren't too many funny ones in test

In [12]:
import random

c = list(zip(yelp_x, yelp_y))
random.shuffle(c)
yelp_x, yelp_y = zip(*c)
yelp_x = list(yelp_x)
yelp_y = list(yelp_y)

In [13]:
from keras.preprocessing.text import Tokenizer

In [14]:
top_words = 10000

In [15]:
t = Tokenizer(top_words)
foo = t.fit_on_texts(yelp_x)

In [16]:
max_length = max([len(s.split()) for s in yelp_x])
max_length

998

In [17]:
X = t.texts_to_sequences(yelp_x)

In [18]:
X = sequence.pad_sequences(X, maxlen=max_length, padding='post')

In [19]:
split = round(len(X) *.8)
split

26539

In [20]:
X_train = X[:split - 1]
X_test = X[split:]
Y_train = yelp_y[:split -1]
Y_test = yelp_y[split:]

## Part 1b: CNN for sentence classification
Adapted from https://github.com/Theo-/sentiment-analysis-keras-conv/blob/master/train_keras.py

In [21]:
# Using embedding from Keras
embedding_vecor_length = 300
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_length))

# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, padding='same'))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Convolution1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(180,activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 998, 300)          3000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 998, 64)           57664     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 998, 32)           6176      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 998, 16)           1552      
_________________________________________________________________
flatten_1 (Flatten)          (None, 15968)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 15968)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 180)               2874420   
__________

In [22]:
model.fit(X_train, Y_train, epochs=3, callbacks=[tensorBoardCallback], batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x11a3ea390>

In [23]:
# Evaluation on the test set
scores = model.evaluate(X_test, Y_test, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 61.63%


## Part 2b: RNN for sentence classification
Adapted from https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

Goal: classify [Yelp reviews](https://www.yelp.com/dataset) tagged as funny by at least one user. 

In [24]:
# LSTM with dropout for sequence classification

# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 998, 32)           320000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 373,301
Trainable params: 373,301
Non-trainable params: 0
_________________________________________________________________
None


In [25]:
model.fit(X_train, Y_train, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x12b708fd0>

In [26]:
# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 50.41%


## Part 1c: Extending CNN with NLP features

There are many ways to extend the Yelp review data with NLP features. One example that may help our task would be adding sentime scores to each word, forming tokens like the following: 'good_3', 'bad_1', etc., where each token is paried with a score. We could then feed these modified tokens into the CNN and again classify as funny or not funny. Similarly, we could add other charactersitics like dependency structures, named entity tags, or parts of speech tokens. This last one, using parts of speec tags, is what we do below. Basically, we use NLTL to match every word in our data to a parts of speech tag, and then tokenize the sentences and feed them into the CNN model.


Goal: classify [Yelp reviews](https://www.yelp.com/dataset) tagged as funny by at least one user. Apply parts of speech tagging to add additional information, and classidy ussing a convolutional neural net. 

In [27]:
import nltk

In [28]:
def merge_token_pos(tagged):
    if (tagged[1] == '.'):
        return(tagged[0])
    else:
        return('{}_{}'.format(tagged[0], tagged[1]))

def apply_tags(list_of_sentences):
    tagged = []
    for sentence in list_of_sentences:
        tokenized = nltk.word_tokenize(sentence)
        with_pos_tags = [merge_token_pos(tag) for tag in nltk.pos_tag(tokenized)]
        tagged.append(" ".join(with_pos_tags))
    return(tagged)
        
yelp_x_tagged = apply_tags(yelp_x)

In [29]:
# Should be the same size
print('Non-tagged: {}\tTagged: {}'.format(len(yelp_x), len(yelp_x_tagged)))

Non-tagged: 33174	Tagged: 33174


In [30]:
yelp_x_tagged[:5]

["It_PRP would_MD n't_RB be_VB the_DT weekend_NN without_IN Cora_NNP 's_POS . The_DT food_NN is_VBZ always_RB excellent_JJ and_CC the_DT staff_NN are_VBP welcoming_VBG and_CC efficient_JJ . The_DT tough_JJ part_NN is_VBZ deciding_VBG whether_IN to_TO go_VB savory_NN or_CC sweet_NN . The_DT loaded_JJ potatoes_NNS and_CC fresh_JJ fruit_NN are_VBP the_DT perfect_JJ accompaniment_NN .",
 "SW_NNP Eye_NNP Center_NNP is_VBZ terrible_JJ . The_DT people_NNS there_EX are_VBP very_RB rude_JJ and_CC if_IN you_PRP ever_RB have_VBP to_TO have_VB retinal_JJ surgery_NN ,_, avoid_VBP Dr._NNP Adelberg_NNP at_IN all_DT costs_NNS . I_PRP had_VBD retinal_JJ surgery_NN there_RB three_CD years_NNS ago_RB for_IN a_DT macular_JJ hole_NN and_CC I_PRP still_RB do_VBP not_RB have_VB eyesight_VBN in_IN that_DT eye_NN . Other_JJ eye_NN doctors_NNS have_VBP told_VBN me_PRP I_PRP would_MD have_VB had_VBD a_DT much_RB better_RBR result_NN if_IN I_PRP had_VBD used_VBN them_PRP and_CC one_CD described_VBD Adelberg_NNP '

In [31]:
X = t.texts_to_sequences(yelp_x)
X = sequence.pad_sequences(X, maxlen=max_length, padding='post')

# Define split
split = round(len(X) *.8)
split

# Actually split the data
X_train = X[:split - 1]
X_test = X[split:]
Y_train = yelp_y[:split -1]
Y_test = yelp_y[split:]

In [32]:
# Using embedding from Keras
embedding_vecor_length = 300
model_tags = Sequential()
model_tags.add(Embedding(top_words, embedding_vecor_length, input_length=max_length))

# Convolutional model (3x conv, flatten, 2x dense)
model_tags.add(Convolution1D(64, 3, padding='same'))
model_tags.add(Convolution1D(32, 3, padding='same'))
model_tags.add(Convolution1D(16, 3, padding='same'))
model_tags.add(Flatten())
model_tags.add(Dropout(0.2))
model_tags.add(Dense(180,activation='sigmoid'))
model_tags.add(Dropout(0.2))
model_tags.add(Dense(1,activation='sigmoid'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model_tags.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_tags.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 998, 300)          3000000   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 998, 64)           57664     
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 998, 32)           6176      
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 998, 16)           1552      
_________________________________________________________________
flatten_2 (Flatten)          (None, 15968)             0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 15968)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 180)               2874420   
__________

In [33]:
model_tags.fit(X_train, Y_train, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x132490c18>

In [34]:
# Final evaluation of the model
scores = model_tags.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 58.15%


## Part 2c: Extending RNN with NLP features

In [35]:
# LSTM with dropout for sequence classification

# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 998, 32)           320000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total params: 373,301
Trainable params: 373,301
Non-trainable params: 0
_________________________________________________________________
None


In [36]:
model.fit(X_train, Y_train, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x12b6c9470>

In [37]:
# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 49.59%
