# Assignment 3
Goal: classify [Yelp reviews](https://www.yelp.com/dataset) tagged as funny by at least one user. 

In [4]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
import pymongo

## Prepare the data 
The yelp data was imported into a collection called 'docs' in a MongoDB database called 'yelp'. We pulled a random sample of 10000.

In [5]:
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['yelp']

In [6]:
num_docs = 80000

In [12]:
yelp_funny = db.reviews.aggregate([
    { '$match': {
            'funny': {
                '$exists': True } } }, 
    { '$sample': { 
            'size': num_docs } }, 
    { '$project': {
            '_id': 0, 
            'funny': 1, 
            'text': 1 } }
])

In [13]:
yelp_x = []
yelp_y = []
for review in yelp_funny:
    yelp_x.append(review['text'])
    yelp_y.append(1 if review['funny'] > 0 else 0)

Data is not very balanced. Only about 20% are funny.

In [14]:
num_funny = np.array(yelp_y).sum()
num_funny

16316

Delete some not funny reviews to balance the categories

In [15]:
not_funny_indexes = [i for i, j in enumerate(yelp_y) if j == 0] 

In [16]:
not_funny_to_delete = not_funny_indexes[num_funny:]
len(not_funny_to_delete)

47368

In [17]:
not_funny_to_delete.sort(reverse=True)

In [18]:
for a in not_funny_to_delete:
    del yelp_x[a]
    del yelp_y[a]

In [19]:
print("{} {}".format(len(yelp_x), len(yelp_y)))

32632 32632


Shuffle the lists so that there aren't too many funny ones in test

In [20]:
import random

c = list(zip(yelp_x, yelp_y))
random.shuffle(c)
yelp_x, yelp_y = zip(*c)
yelp_x = list(yelp_x)
yelp_y = list(yelp_y)

In [21]:
from keras.preprocessing.text import Tokenizer

In [22]:
top_words = 10000

In [23]:
t = Tokenizer(top_words)
foo = t.fit_on_texts(yelp_x)

In [24]:
max_length = max([len(s.split()) for s in yelp_x])
max_length

1005

In [25]:
X = t.texts_to_sequences(yelp_x)

In [26]:
X = sequence.pad_sequences(X, maxlen=max_length, padding='post')

In [27]:
split = round(len(X) *.8)
split

26106

In [28]:
X_train = X[:split - 1]
X_test = X[split:]
Y_train = yelp_y[:split -1]
Y_test = yelp_y[split:]

## Part 1b: CNN for sentence classification
Adapted from https://github.com/Theo-/sentiment-analysis-keras-conv/blob/master/train_keras.py and https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [29]:
# Using embedding from Keras
embedding_vecor_length = 300
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_length))

# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, padding='same'))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Convolution1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(180,activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1005, 300)         3000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1005, 64)          57664     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1005, 32)          6176      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1005, 16)          1552      
_________________________________________________________________
flatten_1 (Flatten)          (None, 16080)             0         
_________________________________________________________________
dropout_1 (Dropout)  

In [30]:
model.fit(X_train, Y_train, epochs=3, callbacks=[tensorBoardCallback], batch_size=64)

Instructions for updating:
Use tf.cast instead.
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 61.91%


In [33]:
# Evaluation on the test set
scores = model.evaluate(X_test, Y_test, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 61.91%


## Part 2: RNN for sentence classification
Adapted from https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

Goal: classify [Yelp reviews](https://www.yelp.com/dataset) tagged as funny by at least one user. 

In [35]:
# LSTM with dropout for sequence classification

# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1005, 32)          320000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 373,301
Trainable params: 373,301
Non-trainable params: 0
_________________________________________________________________
None


In [36]:
model.fit(X_train, Y_train, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x12d014d30>

In [37]:
# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 49.40%


## Part 1.C: Extending CNN with NLP features

There are many ways to extend the Yelp review data with NLP features. One example that may help our task would be adding sentime scores to each word, forming tokens like the following: 'good_3', 'bad_1', etc., where each token is paried with a score. We could then feed these modified tokens into the CNN and again classify as funny or not funny. Similarly, we could add other charactersitics like dependency structures, named entity tags, or parts of speech tokens. This last one, using parts of speec tags, is what we do below. Basically, we use NLTL to match every word in our data to a parts of speech tag, and then tokenize the sentences and feed them into the CNN model.


Goal: classify [Yelp reviews](https://www.yelp.com/dataset) tagged as funny by at least one user. Apply parts of speech tagging to add additional information, and classidy ussing a convolutional neural net. 

In [41]:
import nltk

In [59]:
def merge_token_pos(tagged):
    if (tagged[1] == '.'):
        return(tagged[0])
    else:
        return('{}_{}'.format(tagged[0], tagged[1]))

def apply_tags(list_of_sentences):
    tagged = []
    for sentence in list_of_sentences:
        tokenized = nltk.word_tokenize(sentence)
        with_pos_tags = [merge_token_pos(tag) for tag in nltk.pos_tag(tokenized)]
        tagged.append(" ".join(with_pos_tags))
    return(tagged)
        
yelp_x_tagged = apply_tags(yelp_x)

In [60]:
# Should be the same size
print('Non-tagged: {}\tTagged: {}'.format(len(yelp_x), len(yelp_x_tagged)))

Non-tagged: 32632	Tagged: 32632


In [61]:
yelp_x_tagged[:5]

["Yuck_NN ! This_DT place_NN needs_VBZ to_TO blown_VB up_RB and_CC rebuilt_VB . Cigarette_NNP smoke_NN and_CC dirty_NN chairs_NNS throughout_IN the_DT casino_NN . They_PRP usually_RB only_RB have_VBP one_CD bar_NN open_JJ with_IN one_CD bartender_NN so_RB expect_JJ to_TO wait_VB in_IN line_NN ! So_RB many_JJ nice_JJ casino_NN 's_POS in_IN the_DT valley_NN ,_, do_VBP n't_RB bother_VB with_IN the_DT Fort_NNP",
 'Yorkdale_NN is_VBZ cleaner_JJR than_IN most_JJS shopping_NN malls_NNS and_CC is_VBZ definitely_RB stepping_VBG up_RP and_CC improving_VBG every_DT time_NN I_PRP go_VBP there_RB . It_PRP has_VBZ expanded_VBN quite_RB significantly_RB with_IN new_JJ store_NN additions_NNS as_RB well_RB as_IN parking_VBG lots_NNS . During_IN holidays_NNS it_PRP can_MD definitely_RB get_VB crowded_VBN so_RB I_PRP highly_RB suggest_VBP to_TO shop_VB as_RB early_RB as_IN possible_JJ . As_IN for_IN the_DT food_NN court_NN ,_, the_DT have_VBP change_NN a_DT few_JJ and_CC provides_VBZ a_DT variety_NN of_I

In [62]:
X = t.texts_to_sequences(yelp_x)
X = sequence.pad_sequences(X, maxlen=max_length, padding='post')

# Define split
split = round(len(X) *.8)
split

# Actually split the data
X_train = X[:split - 1]
X_test = X[split:]
Y_train = yelp_y[:split -1]
Y_test = yelp_y[split:]

In [63]:
# Using embedding from Keras
embedding_vecor_length = 300
model_tags = Sequential()
model_tags.add(Embedding(top_words, embedding_vecor_length, input_length=max_length))

# Convolutional model (3x conv, flatten, 2x dense)
model_tags.add(Convolution1D(64, 3, padding='same'))
model_tags.add(Convolution1D(32, 3, padding='same'))
model_tags.add(Convolution1D(16, 3, padding='same'))
model_tags.add(Flatten())
model_tags.add(Dropout(0.2))
model_tags.add(Dense(180,activation='sigmoid'))
model_tags.add(Dropout(0.2))
model_tags.add(Dense(1,activation='sigmoid'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model_tags.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_tags.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1005, 300)         3000000   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 1005, 64)          57664     
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1005, 32)          6176      
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 1005, 16)          1552      
_________________________________________________________________
flatten_2 (Flatten)          (None, 16080)             0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 16080)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 180)               2894580   
__________

In [64]:
model_tags.fit(X_train, Y_train, epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1489f0c88>

In [65]:
# Final evaluation of the model
scores = model_tags.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 58.75%
