In [1]:
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
data = []
with open("train.ft.txt","r") as f:
    for i in f:
        data.append(i)

In [3]:
NB_WORDS = 50000  # Parameter indicating the number of words we'll put in the dictionary
NB_START_EPOCHS = 26  # Number of epochs we usually start to train with
BATCH_SIZE = 512  #

In [4]:
def get_labels(data) :
    labels = []
    for i in data :
        if(((i.split()[0]).replace("__label__",""))=='1'):
            labels.append([1,0])
        else :
            labels.append([0,1])
    return labels

In [5]:
labels = get_labels(data)

In [None]:
(data[0].replace("__label__",""))[2:]

In [6]:
def remove_stopwords(input_text):
    stopwords_list = stopwords.words('english')
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"]
    words = input_text.split() 
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
    return " ".join(clean_words) 

In [None]:
'!'.isalpha()

In [7]:
def remove_label(data):
    d = []
    for i in data :
        d.append((i.replace("__label__",""))[2:])
    return d

In [8]:
data = remove_label(data)

In [None]:
data[10]

In [9]:
data = data[:20000]
labels = labels[:20000]

In [10]:
def cleaning_sentences(data):    
    for i in range(len(data)):
        data[i] = remove_stopwords(data[i])

In [11]:
cleaning_sentences(data)

In [None]:
data[0]

In [12]:
data = np.array(data)
labels = np.array(labels)

In [13]:
tk = Tokenizer(num_words=NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ", 
               char_level=False)
tk.fit_on_texts(data)

In [14]:
tokenizer = TreebankWordTokenizer()
training = []
for i in range(len(data)):
    training.append(tokenizer.tokenize(data[i]))
for i in range(len(training)):
    training[i] = [x.lower() for x in training[i]]

In [15]:
tk.word_index['<PAD>']=NB_WORDS+1
tk.word_index['<UNK>']=0
tk.word_index['n\'t'] = NB_WORDS+2

In [16]:
punct = ['!','"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 
         '[', '/', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n']
train_X =[]
for i in range(len(training)):
    sentence = []
    for j in range(len(training[i])):
        if training[i][j] in punct:
            pass
        else:
            sentence.append(training[i][j])
    train_X.append(sentence)

In [17]:
MAX_SEQ = 120
for s in range(len(train_X)):
    n = MAX_SEQ - len(train_X[s])
    if n < 0:
        train_X[s] = train_X[s][:MAX_SEQ]
    else:
        for i in range(n):
            train_X[s].append('<PAD>')
    for v in range(len(train_X[s])):
        if train_X[s][v] not in tk.word_index:
            train_X[s][v] = tk.word_index['<UNK>']
        else:
            train_X[s][v] = tk.word_index[train_X[s][v]]

In [18]:
reverse_word_map = dict(map(reversed, tk.word_index.items()))

In [19]:
train_X = np.array(train_X)
train_X.shape

(20000, 120)

In [25]:
embedding_size=32
model=Sequential()
model.add(Embedding(NB_WORDS+3, embedding_size, input_length=MAX_SEQ))
model.add(Bidirectional(LSTM(100, dropout=0.8)))
# model.add(Dense(20, kernel_regularizer=regularizers.l1_l2(0.01)))
model.add(Dense(2, activation='softmax'))
print(model.summary())

model.compile(optimizer='adam'
              , loss='binary_crossentropy'
              , metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 120, 32)           1600096   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               106400    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 402       
Total params: 1,706,898
Trainable params: 1,706,898
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
checkpoint = ModelCheckpoint('weight_sentiment_amazon.{epoch:02d}.hdf5')
callbacks_list = [checkpoint]

In [22]:
X_valid = train_X[:BATCH_SIZE]
Y_valid = labels[:BATCH_SIZE]
train_X = train_X[BATCH_SIZE:]
y_train_oh = labels[BATCH_SIZE:]

In [23]:
train_X = np.array(train_X)
X_valid = np.array(X_valid)
Y_valid = np.array(Y_valid)
y_train_oh = np.array(y_train_oh)


In [None]:
train_X[0]
#X_valid.shape


In [27]:
history = model.fit(train_X, y_train_oh, validation_data=(X_valid, Y_valid), callbacks=callbacks_list, epochs=NB_START_EPOCHS
                       , batch_size=BATCH_SIZE)

Train on 19488 samples, validate on 512 samples
Epoch 1/26
Epoch 2/26
Epoch 3/26
Epoch 4/26
Epoch 5/26
Epoch 6/26
Epoch 7/26
Epoch 8/26
Epoch 9/26
Epoch 10/26
Epoch 11/26
Epoch 12/26
Epoch 13/26
Epoch 14/26
Epoch 15/26
Epoch 16/26
Epoch 17/26
Epoch 18/26
Epoch 19/26
Epoch 20/26
Epoch 21/26
Epoch 22/26
Epoch 23/26
Epoch 24/26
Epoch 25/26
Epoch 26/26


In [28]:
max(history.history['val_acc'])

0.884765625

In [31]:
def prep_data(inp):
    punct = ['!','"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 
         '[', '/', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n']
    #neg = negate_sequence(inp)
    for i in punct:
        inp.replace(i, '')
    tokenizer = TreebankWordTokenizer()
    inp = tokenizer.tokenize(inp)
    MAX_SEQ = 120
    n = MAX_SEQ - len(inp)
    if n < 0:
        inp = inp[:MAX_SEQ]
    else:
        for i in range(n):
            inp.append('<PAD>')
    for v in range(len(inp)):
        if inp[v] not in tk.word_index:
            inp[v] = tk.word_index['<UNK>']
        else:
            inp[v] = tk.word_index[inp[v]]
    return np.reshape(np.array(inp) , (1 , MAX_SEQ))

In [46]:
model.predict(prep_data("Hugely Disappointing: I've read all the previous entries in the Grantville Universe and enjoyed most of them quite a bit. But where the use of various writers and short stories worked well for a time, the quality of the writing has been steadily decreasing.The Ram Rebellion reads as a book totally abandoned by Eric Flint and handled almost entirely by the Regulars at Baen's Bar. I understand Baen's practice of pairing up-and-coming writers with more established writers in order to develop new talent and further the plot of popular series. It's worked well in the past.Here it simply failed. I'm about 1/5 through the book and am so tired of the lackluster writing, the sophomoric humor and the terribly slow plot progression that I'm putting it away for good. I'm probably unlikely to read the rest of the series because it's reached a point where having Flint's name on the cover is totally meaningless and not at all representative of the quality of writing to be expected within."))

array([[ 0.01202024,  0.98797977]], dtype=float32)

In [38]:
data_test = []
with open("train.ft.txt","r") as f:
    for i in f:
        data_test.append(i)

In [45]:
data_test[2000]

"__label__1 Hugely Disappointing: I've read all the previous entries in the Grantville Universe and enjoyed most of them quite a bit. But where the use of various writers and short stories worked well for a time, the quality of the writing has been steadily decreasing.The Ram Rebellion reads as a book totally abandoned by Eric Flint and handled almost entirely by the Regulars at Baen's Bar. I understand Baen's practice of pairing up-and-coming writers with more established writers in order to develop new talent and further the plot of popular series. It's worked well in the past.Here it simply failed. I'm about 1/5 through the book and am so tired of the lackluster writing, the sophomoric humor and the terribly slow plot progression that I'm putting it away for good. I'm probably unlikely to read the rest of the series because it's reached a point where having Flint's name on the cover is totally meaningless and not at all representative of the quality of writing to be expected within.

In [None]:
dataa = []
with open("train.ft.txt","r") as f:
    for i in f:
        dataa.append(i)
        
len(dataa)