In [1]:
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models
from keras import Model
from keras import layers
from keras import regularizers
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Input, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
data = []
j = 0
with open("train.ft.txt","r") as f:
    for i in f:
        if j == 30000:
            break
        data.append(i)
        j+=1

In [3]:
NB_WORDS = 50000  # Parameter indicating the number of words we'll put in the dictionary
NB_START_EPOCHS = 26  # Number of epochs we usually start to train with
BATCH_SIZE = 512  #

In [4]:
def get_labels(data) :
    labels = []
    for i in data :
        if(((i.split()[0]).replace("__label__",""))=='1'):
            labels.append([1,0])
        else :
            labels.append([0,1])
    return labels

In [62]:
def binarytoneutral(data) :
    labels = []
    for i in data:
        if list(i)==[1,0]:
            labels.append([1,0,0])
        else :
            labels.append([0,0,1])
    return labels

In [5]:
labels = get_labels(data)

In [6]:
def remove_stopwords(input_text):
    stopwords_list = stopwords.words('english')
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"]
    words = input_text.split() 
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
    return " ".join(clean_words) 

In [7]:
def remove_label(data):
    d = []
    for i in data :
        d.append((i.replace("__label__",""))[2:].replace("\n",""))
    return d

In [8]:
data = remove_label(data)

In [9]:
data[10]

"The Worst!: A complete waste of time. Typographical errors, poor grammar, and a totally pathetic plot add up to absolutely nothing. I'm embarrassed for this author and very disappointed I actually paid for this book."

In [10]:
def cleaning_sentences(data):    
    for i in range(len(data)):
        data[i] = remove_stopwords(data[i])

In [11]:
cleaning_sentences(data)

In [12]:
data[0]

'Stuning even non-gamer: This sound track beautiful! It paints senery mind well would recomend even people hate vid. game music! played game Chrono Cross games ever played best music! It backs away crude keyboarding takes fresher step grate guitars soulful orchestras. It would impress anyone cares listen! ^_^'

In [13]:
data = np.array(data)
labels = np.array(labels)

In [18]:
embeddings_index = {}
with open('glove.6B.300d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        
        coefs = [float(i) for i in values[1:]]
        embeddings_index[word] = coefs

embeddings_index['<PAD>'] = [0] * 300
embeddings_index['<UNK>'] = [1] * 300

In [19]:
tokenizer = TreebankWordTokenizer()
training = []
for i in range(len(data)):
    training.append(tokenizer.tokenize(data[i]))
for i in range(len(training)):
    training[i] = [x.lower() for x in training[i]]

In [20]:
punct = ['!','"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 
         '[', '/', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n']
train_X =[]
for i in range(len(training)):
    sentence = []
    for j in range(len(training[i])):
        if training[i][j] in punct:
            pass
        else:
            sentence.append(training[i][j])
    train_X.append(sentence)

In [17]:
MAX_SEQ = 120
for s in range(len(train_X)):
    n = MAX_SEQ - len(train_X[s])
    if n < 0:
        train_X[s] = train_X[s][:MAX_SEQ]
    else:
        for i in range(n):
            train_X[s].append('<PAD>')
    for v in range(len(train_X[s])):
        if train_X[s][v] not in embeddings_index:
            train_X[s][v] = embeddings_index['<UNK>']
        else:
            train_X[s][v] = embeddings_index[train_X[s][v]]

In [21]:
#train_X = train_X[:20000]

In [18]:
train_X = np.array(train_X)
train_X.shape

(30000, 120, 300)

In [None]:
embedding_size=32
model=Sequential()
model.add(Embedding(NB_WORDS+3, embedding_size, input_length=MAX_SEQ))
model.add(Bidirectional(LSTM(100, dropout=0.8)))
# model.add(Dense(20, kernel_regularizer=regularizers.l1_l2(0.01)))
model.add(Dense(2, activation='softmax'))
print(model.summary())

model.compile(optimizer='adam'
              , loss='binary_crossentropy'
              , metrics=['accuracy'])

In [12]:
MAX_SEQ = 120
input_layer = Input( batch_shape = (None, MAX_SEQ, 300))
lstm_layer = Bidirectional(LSTM(units=MAX_SEQ, dropout = 0.25, recurrent_dropout=0.25))(input_layer)
x = Dropout(0.25)(lstm_layer)
merged = Dense(units=20, activation='relu')(x)
merged = Dropout(0.25)(merged)
merged = BatchNormalization()(merged)
output_layer = Dense(2, activation="softmax")(merged)

model = Model(inputs=input_layer, outputs=output_layer)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 120, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 240)               404160    
_________________________________________________________________
dropout_1 (Dropout)          (None, 240)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                4820      
_________________________________________________________________
dropout_2 (Dropout)          (None, 20)                0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 20)                80        
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 42        
Total para

In [13]:
model.compile(optimizer='adam'
              , loss='binary_crossentropy'
              , metrics=['accuracy'])

In [26]:
checkpoint = ModelCheckpoint('weight_sentiment_amazon.{epoch:02d}.hdf5')
callbacks_list = [checkpoint]

In [27]:
X_valid = train_X[:BATCH_SIZE]
Y_valid = labels[:BATCH_SIZE]
train_X = train_X[BATCH_SIZE:]
y_train_oh = labels[BATCH_SIZE:]

In [28]:
train_X = np.array(train_X)
X_valid = np.array(X_valid)
Y_valid = np.array(Y_valid)
y_train_oh = np.array(y_train_oh)


In [None]:
train_X[0]
#X_valid.shape


In [35]:
history = model.fit(train_X, y_train_oh, validation_data=(X_valid, Y_valid), callbacks=callbacks_list, epochs=NB_START_EPOCHS
                       , batch_size=BATCH_SIZE)

Train on 29488 samples, validate on 512 samples
Epoch 1/26
Epoch 2/26
Epoch 3/26
Epoch 4/26
Epoch 5/26
Epoch 6/26
Epoch 7/26
Epoch 8/26
Epoch 9/26
Epoch 10/26
Epoch 11/26
Epoch 12/26
Epoch 13/26
Epoch 14/26
Epoch 15/26
Epoch 16/26
Epoch 17/26
Epoch 18/26
Epoch 19/26
Epoch 20/26
Epoch 21/26
Epoch 22/26
Epoch 23/26
Epoch 24/26
Epoch 25/26
Epoch 26/26


In [36]:
max(history.history['val_acc'])

0.90625

In [14]:
model.load_weights("weight_sentiment_amazon.26.hdf5")

In [15]:
def prep_data(inp):
    punct = ['!','"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 
         '[', '/', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n']
    #neg = negate_sequence(inp)
    for i in punct:
        inp.replace(i, '')
    tokenizer = TreebankWordTokenizer()
    inp = tokenizer.tokenize(inp)
    MAX_SEQ = 120
    n = MAX_SEQ - len(inp)
    if n < 0:
        inp = inp[:MAX_SEQ]
    else:
        for i in range(n):
            inp.append('<PAD>')
    for v in range(len(inp)):
        if inp[v] not in embeddings_index:
            inp[v] = embeddings_index['<UNK>']
        else:
            inp[v] = embeddings_index[inp[v]]
    return np.reshape(np.array(inp) , (1, MAX_SEQ, 300))

In [16]:
sent = "the food was cold but I liked the taste."

In [21]:
model.predict(prep_data(sent))

array([[ 0.41501242,  0.58498764]], dtype=float32)

In [22]:
data_test = []
with open("test.ft.txt","r") as f:
    for i in f:
        data_test.append(i)

In [23]:
test_labels = get_labels(data_test)

In [24]:
data_test = remove_label(data_test)

In [25]:
cleaning_sentences(data_test)

In [26]:
data_test = np.array(data_test)
test_labels = np.array(test_labels)

In [27]:
tokenizer = TreebankWordTokenizer()
testing = []
for i in range(len(data_test)):
    testing.append(tokenizer.tokenize(data_test[i]))
for i in range(len(testing)):
    testing[i] = [x.lower() for x in testing[i]]

In [28]:
punct = ['!','"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 
         '[', '/', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n']
test_X =[]
for i in range(len(testing)):
    sentence = []
    for j in range(len(testing[i])):
        if testing[i][j] in punct:
            pass
        else:
            sentence.append(testing[i][j])
    test_X.append(sentence)

In [29]:
testing_X = test_X

In [30]:
test_X = test_X[:8000]

In [31]:
MAX_SEQ = 120
for s in range(len(test_X)):
    n = MAX_SEQ - len(test_X[s])
    if n < 0:
        test_X[s] = test_X[s][:MAX_SEQ]
    else:
        for i in range(n):
            test_X[s].append('<PAD>')
    for v in range(len(test_X[s])):
        if test_X[s][v] not in embeddings_index:
            test_X[s][v] = embeddings_index['<UNK>']
        else:
            test_X[s][v] = embeddings_index[test_X[s][v]]

In [33]:
model.evaluate(np.array(test_X),np.array(test_labels[:8000]))



[0.29108472535014152, 0.89012500000000006]

# Testing twitter model on Amazon data

In [36]:
MAX_SEQ = 20
input_layer = Input( batch_shape = (None, MAX_SEQ, 300))
lstm_layer = Bidirectional(LSTM(units=MAX_SEQ, dropout = 0.25, recurrent_dropout=0.25))(input_layer)
x = Dropout(0.25)(lstm_layer)
merged = Dense(units=20, activation='relu')(x)
merged = Dropout(0.25)(merged)
merged = BatchNormalization()(merged)
output_layer = Dense(3, activation="softmax")(merged)

model_twitter = Model(inputs=input_layer, outputs=output_layer)
model_twitter.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 20, 300)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 40)                51360     
_________________________________________________________________
dropout_5 (Dropout)          (None, 40)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 20)                820       
_________________________________________________________________
dropout_6 (Dropout)          (None, 20)                0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 20)                80        
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 63        
Total para

In [67]:
model_twitter.compile(optimizer='adam'
              , loss='categorical_crossentropy'
              , metrics=['accuracy'])

In [37]:
model_twitter.load_weights("weight_twitter_embedding.32.hdf5")

In [63]:
labels_twitter = np.array(binarytoneutral(test_labels))

In [69]:
test_twitter = []
for i in test_X:
    test_twitter.append(i[:20])

In [70]:
model_twitter.evaluate(np.array(test_twitter),np.array(labels_twitter[:8000]))



[1.2336834318637848, 0.65800000000000003]