In [None]:
#Dataset class

import pandas as pd
import numpy as np
from keras.preprocessing import text, sequence

# Vocabulary cap shared by the notebook: passed as `num_words` to the Keras
# Tokenizer and used as the input dimension of the Embedding layer.
vocab_size = 20000


class SentimentData():
    """Load the train/test TSV files and turn their phrases into padded id sequences.

    Parameters
    ----------
    train_file : str
        Path to a tab-separated file with at least `Phrase` and `Sentiment` columns.
    test_file : str
        Path to a tab-separated file with at least a `Phrase` column.
    max_len : int, default 100
        Length every token sequence is padded/truncated to.

    Attributes
    ----------
    train, test : pandas.DataFrame
        The raw frames; `train` additionally carries the one-hot label columns.
    train_phrases, test_phrases : numpy.ndarray
        Phrase strings with missing values replaced by "EMPTY".
    classes_names : list of str
        Names of the one-hot label columns, in output order.
    tokenizer : keras Tokenizer
        Tokenizer fitted on the training phrases only.
    x : numpy.ndarray
        Padded training sequences, one row per training phrase.
    y : numpy.ndarray
        One-hot training labels, one column per entry of `classes_names`.
    test_x : numpy.ndarray
        Padded test sequences, one row per test phrase.
    train_x : numpy.ndarray
        Backward-compatible alias of `test_x` (despite the name it holds the
        TEST set, exactly as it did before `test_x` was introduced).
    """

    def __init__(self, train_file, test_file, max_len=100):
        print("Reading files...")
        self.train = pd.read_csv(train_file, sep='\t')
        self.test = pd.read_csv(test_file, sep='\t')

        self.classes_names = ["sentiment_0", "sentiment_1", "sentiment_2", "sentiment_3", "sentiment_4"]

        # Reindex so all five label columns exist even when a class is absent
        # from the training file (otherwise the column selection below raises
        # KeyError), and pin the dtype because get_dummies' default dtype is
        # not the same across pandas versions.
        one_hot = (
            pd.get_dummies(self.train['Sentiment'], prefix='sentiment')
            .reindex(columns=self.classes_names, fill_value=0)
            .astype(np.uint8)
        )
        self.train = self.train.join(one_hot)

        self.train_phrases = self.train["Phrase"].fillna("EMPTY").values
        self.test_phrases = self.test["Phrase"].fillna("EMPTY").values

        self.y = np.array(self.train[self.classes_names].values)

        # The vocabulary is learned from the training phrases only.
        self.tokenizer = text.Tokenizer(num_words=vocab_size)
        self.tokenizer.fit_on_texts(list(self.train_phrases))

        # texts_to_sequences returns lists of differing lengths. They are kept
        # as plain lists: building an ndarray from ragged input is an error on
        # recent NumPy releases, and pad_sequences accepts lists directly.
        print("Tokenizing train set...")
        train_sequences = self.tokenizer.texts_to_sequences(self.train_phrases)

        print("Tokenizing test set...")
        test_sequences = self.tokenizer.texts_to_sequences(self.test_phrases)

        print("Padding sequences...")
        self.x = np.array(sequence.pad_sequences(train_sequences, maxlen=max_len))
        self.test_x = np.array(sequence.pad_sequences(test_sequences, maxlen=max_len))
        # Legacy name kept so existing references to `train_x` keep working.
        self.train_x = self.test_x

In [None]:
# Build the dataset object from the TSV files in the working directory.
data = SentimentData(
    train_file='./train.tsv',
    test_file='./test.tsv',
)

In [None]:
from keras.models import Sequential
from keras.callbacks import TensorBoard
from keras.layers import Dense, Dropout, LSTM, Embedding, GlobalMaxPool1D, Bidirectional

def init_model(max_len=100):
    """Build and compile the bidirectional-LSTM sentiment classifier.

    Parameters
    ----------
    max_len : int, default 100
        Length of the padded input sequences; must match the `maxlen` used
        when the sequences were padded.

    Returns
    -------
    A compiled Keras `Sequential` model whose output is a 5-way softmax.
    """
    model = Sequential([
        Embedding(vocab_size, 128, input_shape=(max_len,)),
        Bidirectional(LSTM(50, return_sequences=True)),
        GlobalMaxPool1D(),
        Dropout(.1),
        Dense(50, activation="relu"),
        Dropout(.1),
        Dense(5, activation="softmax"),
    ])
    # The output layer is a softmax over five mutually exclusive classes, so
    # the matching loss is categorical cross-entropy. Binary cross-entropy
    # would score each of the five outputs as an independent yes/no decision.
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model


In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Both callbacks watch the validation loss: a validation split is configured in
# fit() below, and stopping/checkpointing on a training metric would reward
# overfitting. 'val_loss' is also always present in the training logs when
# validation data is used, whereas the accuracy key is spelled differently
# across Keras versions.
early_callback = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=1)
save_callback = ModelCheckpoint('./sentiment.model', save_best_only=True, monitor='val_loss', save_weights_only=True)


model = init_model()

# epochs is only an upper bound; early stopping ends training once the
# validation loss stops improving.
model.fit(data.x, data.y, batch_size=32, epochs=1000, validation_split=0.1, callbacks=[early_callback, save_callback])

In [None]:
#Save tokenizer

import pickle

# Persist the fitted tokenizer so the same word->id mapping can be reused at
# inference time. SentimentData stores it as `tokenizer` (there is no `tok`
# attribute, so the previous spelling raised AttributeError).
with open('tok.pckl', 'wb') as f:
    pickle.dump(data.tokenizer, f)