![data-x](http://oi64.tinypic.com/o858n4.jpg)


# EXTRA MATERIAL: 
## Example, getting more than 90% accuracy on the IMDB data set

Source: https://richliao.github.io/supervised/classification/2016/11/26/textclassifier-convolutional/

Written up for Data-X by: Alexander Fred Ojala

** Written for Python 2.7 **

In [1]:
import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os

os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

Using Theano backend.


In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

data_train = pd.read_csv('./labeledTrainData.tsv', sep='\t')
print data_train.shape

texts = []
labels = []

(25000, 3)


In [138]:
for idx in range(data_train.review.shape[0]):
    labels.append(data_train.sentiment[idx])
    



labels = to_categorical(np.asarray(labels))

In [142]:
print(labels[:10])
print(data_train['sentiment'][:10])

[[ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]]
0    1
1    1
2    0
3    0
4    1
5    1
6    0
7    0
8    0
9    1
Name: sentiment, dtype: int64


In [3]:
for idx in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[idx],'lxml')
    texts.append(clean_str(text.get_text().encode('ascii','ignore')))
    labels.append(data_train.sentiment[idx])
    

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in traing and validation set ')
print y_train.sum(axis=0)
print y_val.sum(axis=0)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Found 80566 unique tokens.
('Shape of data tensor:', (25000, 1000))
('Shape of label tensor:', (25000, 2))
Number of positive and negative reviews in traing and validation set 
[  9890.  10110.]
[ 2610.  2390.]


In [64]:
random.seed(150) # for reproducibility

GLOVE_DIR = "/Users/FO/data/glove.6B/" # add your own glove.6B directory
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

Total 400000 word vectors in Glove 6B 100d.


In [65]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

optim = keras.optimizers.RMSprop(lr=0.0005, rho=0.9, epsilon=1e-08, decay=0.0)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer=optim,
              metrics=['acc'])

In [66]:
print("model fitting - simplified convolutional neural network")
model.summary()
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=20, batch_size=128)



model fitting - simplified convolutional neural network
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_6 (InputLayer)             (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 1000, 100)     8056700     input_6[0][0]                    
____________________________________________________________________________________________________
convolution1d_22 (Convolution1D) (None, 996, 128)      64128       embedding_4[0][0]                
____________________________________________________________________________________________________
maxpooling1d_22 (MaxPooling1D)   (None, 199, 128)      0           convolution1d_22[0][0]           
___________________________________

<keras.callbacks.History at 0x11fbd8690>

In [68]:
print(model.evaluate(x_val,y_val))
# accuracy 89%

[0.7777312150835991, 0.88959999999999995]


In [69]:
mod_preds = model.predict(x_val)

In [93]:
print 'Validation accuracy:',np.mean(mod_preds.round()==y_val)*100,'% (never reached 90% at any iteration)'
# Not there yet

Validation accuracy: 88.96 % (never reached 90% at any iteration)


In [136]:
print(train_data)

[[ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]]


# More complex CNN

In [81]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [97]:
import random
seed = 150
np.random.seed(seed)
import keras


# applying a more complex convolutional approach
convs = []
filter_sizes = [3,4,5]

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

for fsz in filter_sizes:
    l_conv = Conv1D(nb_filter=128,filter_length=fsz,activation='relu')(embedded_sequences)
    l_pool = MaxPooling1D(5)(l_conv)
    convs.append(l_pool)
    
l_merge = Merge(mode='concat', concat_axis=1)(convs)
l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(30)(l_cov2)
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(2, activation='softmax')(l_dense)

model2 = Model(sequence_input, preds)
optim = keras.optimizers.RMSprop(lr=0.00095, rho=0.9, epsilon=1e-08, decay=0.0)
model2.compile(loss='categorical_crossentropy',
              optimizer=optim,
              metrics=['accuracy'])


model2.summary()


# Checkpoint the weights for best model on validation accuracy
from keras.callbacks import ModelCheckpoint
# checkpoint
filepath="weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

print("model fitting - more complex convolutional neural network")

model2.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=10, batch_size=50, callbacks=callbacks_list)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_9 (InputLayer)             (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_6 (Embedding)          (None, 1000, 100)     8056700     input_9[0][0]                    
____________________________________________________________________________________________________
convolution1d_35 (Convolution1D) (None, 998, 128)      38528       embedding_6[2][0]                
____________________________________________________________________________________________________
convolution1d_36 (Convolution1D) (None, 997, 128)      51328       embedding_6[2][0]                
___________________________________________________________________________________________

<keras.callbacks.History at 0x13156e110>

In [99]:
best_model = Model(sequence_input, preds)
optim = keras.optimizers.RMSprop(lr=0.00095, rho=0.9, epsilon=1e-08, decay=0.0)
best_model.compile(loss='categorical_crossentropy',
              optimizer=optim,
              metrics=['accuracy'])



In [100]:
best_model.load_weights('weights.best.hdf5')

In [101]:
print(best_model.evaluate(x_val,y_val)) # 90% accuracy in third Epoch!

[0.40374871463775636, 0.90139999999999998]


In [103]:
best_mod_preds = best_model.predict(x_val)

In [104]:
print 'Validation accuracy:',np.mean(best_mod_preds.round()==y_val)*100,'% (success!)'

Validation accuracy: 90.14 % (success!)


# Just for fun -- create a Kaggle submission from the unlabeled data
Gets over 90% on the test data

In [125]:
# just for fun -- create a Kaggle submission from the unlabeled data

# Read the test data
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", \
                   quoting=3 )

# Verify that there are 25,000 rows and 2 columns
print test.shape

# Create an empty list and append the clean reviews one by one
num_reviews = len(test["review"])


(25000, 2)


In [126]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()
texts = []
labels = []

In [127]:
for idx in range(test.review.shape[0]):
    text = BeautifulSoup(test.review[idx],'lxml')
    texts.append(clean_str(text.get_text().encode('ascii','ignore')))
    

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 79661 unique tokens.


In [128]:
print('Shape of data tensor:', data.shape)

x_test = data
print(x_test.shape)

('Shape of data tensor:', (25000, 1000))
(25000, 1000)


In [129]:
test_preds = best_model.predict(x_test,verbose=1)

In [133]:
test_preds_round = test_preds.round()
print(test_preds_round[:10]) # left col = 1 gives 0 (for positive review), right col = 1 gives 1 for neg review

[[ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]]


In [146]:
result = [int(x) for x in test_preds_round[:,1]]

In [147]:
result[:10]

[1, 0, 0, 0, 0, 0, 1, 1, 0, 0]

In [149]:
test_preds = model2.predict(x_test,verbose=1)
result = [int(x) for x in test_preds_round[:,1]]



In [150]:
# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "result2.csv", index=False, quoting=3 )