In [1]:
%matplotlib inline
from __future__ import print_function, division
import utils; reload(utils)
from utils import *
from IPython.display import FileLink
from keras import backend as K

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5103)
Using Theano backend.


In [2]:
# Directory layout used throughout the notebook. Every directory string ends
# with a trailing slash because later cells build file names by plain string
# concatenation (e.g. path+'train.tsv').
data_path = 'data/rt/'
sample_path = 'data/rt/sample/'
path = data_path
model_path = path + 'models/'
results_path = path + 'results/'
# Remember the starting directory so cells that %cd elsewhere can come back.
home_dir = os.path.realpath('.')
# Create any directory that does not exist yet.
for p in [path, model_path, sample_path, results_path]:
    if not os.path.exists(p):
        os.makedirs(p)

### Download and Unzip Dataset

In [None]:
# Download
%cd {path}
!kg download -c 'sentiment-analysis-on-movie-reviews'
%cd {home_dir}

In [None]:
# Unzip training labels
import zipfile
def unzip_file(filename, path, outputpath=None):
    """Extract the zip archive ``filename`` located in ``path``.

    Args:
        filename: name of the archive inside ``path``.
        path: directory holding the archive (with or without a trailing
            separator).
        outputpath: directory to extract into; defaults to ``path``. It is
            created when missing.

    A missing archive is reported and skipped rather than raising, so the
    cell stays safe to re-run when a download did not happen.
    """
    outputpath = outputpath or path
    # os.path.join copes with `path` given without a trailing separator.
    filepath = os.path.join(path, filename)
    if not os.path.exists(outputpath):
        os.makedirs(outputpath)
    if not os.path.isfile(filepath):
        print('Zip file not found, skipping:', filepath)
        return
    print('Unzipping file:', filepath)
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(outputpath)

unzip_file('test.tsv.zip', path)
unzip_file('train.tsv.zip', path)

### Prepare Dataset - index words

In [3]:
reviews = pd.read_csv(path+'train.tsv', sep='\t')

In [None]:
print(reviews.shape)
reviews[:5]

In [None]:
# For every SentenceId take the smallest PhraseId, then keep only the rows
# carrying one of those ids.
# NOTE(review): presumably the lowest PhraseId of a sentence is the complete
# sentence rather than a sub-phrase - confirm against the dataset.
full_phrase_idx = reviews.groupby('SentenceId')['PhraseId'].min()
# print(full_phrase_idx.values)
full_phrases_df = reviews.loc[reviews['PhraseId'].isin(full_phrase_idx.values)]
print(type(full_phrases_df))

In [None]:
# train_df = full_phrases_df.copy()
# Work on a copy so the raw `reviews` frame stays untouched.
train_df = reviews.copy()

train_phrases = train_df['Phrase']
# Lower-case every phrase. Built as a concrete list (not a bare map()) so the
# slice below, len() and repeated iteration also work where map() is lazy.
norm_phrases = [str.lower(phrase) for phrase in train_phrases.tolist()]

norm_phrases[:5]

### Tokenizer

In [None]:
from keras.preprocessing.text import Tokenizer
nb_words=8000

### Word to index

In [None]:
from collections import Counter
def sorted_wordlist(phrases):
    """Return every distinct word in ``phrases``, most frequent first.

    Each phrase is split on whitespace; the number of distinct words is
    printed as a side effect.
    """
    wordcounts = Counter()
    for phrase in phrases:
        # Counting phrase by phrase visits the words in the same order as
        # counting one flattened list would.
        wordcounts.update(str.split(phrase))
    print('Words:', len(wordcounts))
    return [word for word, _ in wordcounts.most_common()]

In [None]:
wordlist = sorted_wordlist(norm_phrases)

In [None]:
# Word ids are 1-based: 0 is kept free for the padding value.
word2idx = dict((word, rank) for rank, word in enumerate(wordlist, 1))
idx2word = dict((rank, word) for rank, word in enumerate(wordlist, 1))
len(idx2word)

In [None]:
from functools import partial

def map_sentence2idx(sentences, word2idx):
    """Convert every sentence into its list of word ids.

    Returns a list with one list of ids per sentence. A concrete list is
    returned (rather than a lazy map object) so callers can take len() of
    the result and iterate over it more than once.
    """
    return [sentence2idx(word2idx, sentence) for sentence in sentences]
# Map one sentence to the ids of its words
def sentence2idx(wordMap, sentence):
    """Split ``sentence`` on whitespace and look each word up in ``wordMap``.

    Words missing from ``wordMap`` get the id ``len(wordMap)``. With the
    1-based mapping built in this notebook that is the id of the least
    frequent word, i.e. unknown words land among the rare ones.

    Returns a list of ids, one per word (empty for an empty sentence).
    """
    unknown_idx = len(wordMap)
    return [wordMap.get(word, unknown_idx) for word in sentence.split()]

idx_sentencelist = map_sentence2idx(norm_phrases, word2idx)

In [None]:
# Sentence-length statistics (max, min, mean), used to choose a padding length
lens = np.array([len(sentence) for sentence in idx_sentencelist])
(lens.max(), lens.min(), lens.mean())

In [None]:
def norm_idx(wordidx, vocab_size=8000, seq_len=55):
    """Clip word ids to the vocabulary and bring every sentence to ``seq_len``.

    Any id at or above ``vocab_size - 1`` is replaced by ``vocab_size - 1``.
    The clipped sentences are then handed to ``sequence.pad_sequences`` with
    ``maxlen=seq_len`` and 0 as the fill value.

    Returns a list with one numpy array per sentence.
    """
    rare_id = vocab_size - 1
    clipped = []
    for sentence in wordidx:
        clipped.append(np.array([min(idx, rare_id) for idx in sentence]))
    padded = sequence.pad_sequences(clipped, maxlen=seq_len, value=0)
    return [np.array(row) for row in padded]

In [None]:
# Limit vocab size
vocab_size = 8000
# Pad (with zero) or truncate to max sentence length
seq_len = 55

normalized_word_idx = norm_idx(idx_sentencelist, vocab_size, seq_len)

In [None]:
train_df['word_idx'] = pd.Series(normalized_word_idx, index=train_df.index)

In [None]:
train_df[:10]

### Save DF

In [None]:
train_df = pd.read_pickle(results_path+'train_idx.p')

In [None]:
pd.to_pickle(train_df, results_path+'train_idx.p')

### Creating training and test set

In [None]:
# A fixed seed makes the 80/20 split - and every validation score computed
# from it - reproducible across kernel restarts.
split_seed = 42
shuffle = train_df.sample(frac=1, random_state=split_seed)
train = shuffle.sample(frac=0.8, random_state=split_seed)
# Everything not drawn for training becomes the held-out set.
test = shuffle.loc[~shuffle.index.isin(train.index)]

In [None]:
# Stack the per-row id arrays into model inputs and encode the Sentiment
# column with onehot (a helper pulled in by `from utils import *`).
# NOTE(review): onehot is applied to train and test separately - confirm both
# calls yield the same number of label columns.
x_train = np.array(train.word_idx.tolist())
labels_train = onehot(np.array(train.Sentiment.tolist()))
x_test = np.array(test.word_idx.tolist())
labels_test = onehot(np.array(test.Sentiment.tolist()))

In [None]:
(x_train.shape, labels_train.shape)

### Load Cached Data

In [None]:
# NOTE(review): get_data comes from utils; no cell visible in this notebook
# uses `trn` or `val` for training (the models below are fitted on
# x_train / x_test) - confirm whether this section is still needed.
trn = get_data(path+'train')
val = get_data(path+'valid')

In [None]:
save_array(results_path+'val.dat', val)
save_array(results_path+'trn.dat', trn)

In [None]:
val = load_array(results_path+'val.dat')
trn = load_array(results_path+'trn.dat')

### Simple NN

In [None]:
# Baseline: a learned 32-dimensional embedding per word id, flattened over the
# whole sequence and classified through one hidden dense layer.
model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    # One output unit per one-hot label column.
    Dense(labels_train.shape[1], activation='softmax')
])

In [None]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=2, batch_size=64)

### Conv layer

In [None]:
# Same classifier head as the baseline, with a 1-D convolution (32 filters of
# width 5) and max-pooling inserted between the embedding and the dense layers.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=32, input_length=seq_len, dropout=0.2),
    Dropout(0.2),
    Conv1D(nb_filter=32, filter_length=5, border_mode='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    # One output unit per one-hot label column.
    Dense(labels_train.shape[1], activation='softmax')
    
])

In [None]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

In [None]:
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=2, batch_size=64)

In [None]:
# Raise the learning rate for further training. The value has to be written
# into the optimizer's backend variable: the training function built by the
# previous fit() reads that variable, so rebinding `model.optimizer.lr` to a
# plain float would leave the compiled training step unchanged.
K.set_value(model.optimizer.lr, 0.01)
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=5, batch_size=64)

### Using pretrained embeddings

In [None]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.

    Args:
        dataset: dataset name such as '6B.50d'. Names listed in the md5
            table below are passed to get_file with their md5; any other
            name is passed with md5_hash=None.

    Returns:
        Whatever get_file returns for the downloaded archive.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    # Plain-Python equivalent of `mkdir -p`: an IPython magic inside a function
    # body is not valid Python once this code leaves the notebook.
    if not os.path.isdir(glove_path):
        os.makedirs(glove_path)
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

In [None]:
def load_vectors(loc):
    """Load the three files stored under the common prefix ``loc``.

    Returns:
        A tuple of (load_array(loc + '.dat'), the object unpickled from
        loc + '_words.pkl', the object unpickled from loc + '_idx.pkl').
    """
    vectors = load_array(loc+'.dat')
    # NOTE: unpickling can execute arbitrary code - only point this at files
    # from a trusted source.
    # `with` closes each file handle, which the bare open() calls never did.
    with open(loc+'_words.pkl','rb') as words_file:
        words = pickle.load(words_file)
    with open(loc+'_idx.pkl','rb') as idx_file:
        word_index = pickle.load(idx_file)
    return (vectors, words, word_index)

In [None]:
vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))

In [None]:
def create_emb():
    """Build the initial embedding matrix from the loaded GloVe vectors.

    Reads the notebook globals vecs, wordidx, idx2word and vocab_size.
    Row 0 stays zero. Rows 1..vocab_size-1 take the GloVe vector of the word
    with that id when the word is in wordidx and consists only of letters,
    digits and '-'; otherwise the row is drawn from normal(scale=0.6) and the
    word is printed. The last row is then redrawn at random and the whole
    matrix is divided by 3.

    Returns:
        Array of shape (vocab_size, vecs.shape[1]).
    """
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))
    plain_token = re.compile(r"^[a-zA-Z0-9\-]*$")

    for row in range(1, len(emb)):
        word = idx2word[row]
        if word in wordidx and plain_token.match(word):
            emb[row] = vecs[wordidx[word]]
            continue
        # No usable GloVe vector for this word: start from random values
        print('Could not find word in glove:', word)
        emb[row] = normal(scale=0.6, size=(n_fact,))

    # The last row is the "rare word" id and always starts out random
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb /= 3
    return emb

In [None]:
emb = create_emb()

In [None]:
# Frozen pretrained embeddings feeding one convolutional block and a dense
# classifier.
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, dropout=0.05,
              weights=[emb], trainable=False),
    BatchNormalization(),
#     Dropout(0.15),
    Convolution1D(64, 6, border_mode='same', activation='relu'),
    BatchNormalization(),
#     Dropout(0.15),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    BatchNormalization(),
#     Dropout(0.3),
    # softmax (not sigmoid): the labels are one-hot and the loss is
    # categorical_crossentropy, which expects the outputs to form a single
    # probability distribution over the classes - same as the earlier models.
    Dense(5, activation='softmax')
])

In [None]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [None]:
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=4, batch_size=64)

In [None]:
# Raise the learning rate. It must be written into the optimizer's backend
# variable: the training function built by the previous fit() reads that
# variable, so rebinding `model.optimizer.lr` to a float changes nothing.
K.set_value(model.optimizer.lr, 0.01)
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=4, batch_size=64)

In [None]:
# Train first layer
model.layers[0].trainable = True
# A change to `trainable` (and to the learning rate) only reaches training
# after compile() is called again; fit() otherwise keeps using the training
# function it built earlier, with the embedding still frozen.
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-4), metrics=['accuracy'])
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=2, batch_size=64)

### Using multi-convnets

In [None]:
# Sub-model running three parallel convolutions (filter widths 3, 4 and 5)
# over the embedded sequence and concatenating their flattened outputs.
# Its input is the embedding output: seq_len timesteps of 50 features each,
# so the first dimension is seq_len, not vocab_size.
graph_in = Input((seq_len, 50))
convs = [ ]
for fsz in range(3, 6):
    x = Convolution1D(64, fsz, border_mode='same', activation='relu')(graph_in)
    x = MaxPooling1D()(x)
    x = Flatten()(x)
    convs.append(x)
out = merge(convs, mode='concat')
# Merge even more convnets using Keras.Merge?
graph = Model(graph_in, out)

In [None]:
emb = create_emb()

In [None]:
# Pretrained embeddings (left trainable here), the multi-width conv sub-model
# and two small dense layers.
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, dropout=.1, weights=[emb]),
#     BatchNormalization(),
    Dropout(0.05),
    graph,
    BatchNormalization(),
    Dropout(0.1),
    Dense(50, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(50, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    # softmax (not sigmoid): the labels are one-hot and the loss is
    # categorical_crossentropy, which expects the outputs to form a single
    # probability distribution over the classes.
    Dense(5, activation='softmax')
])

In [None]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=6, batch_size=64)

In [None]:
# Lower the learning rate. It must be written into the optimizer's backend
# variable: the training function built by the previous fit() reads that
# variable, so rebinding `model.optimizer.lr` to a float changes nothing.
K.set_value(model.optimizer.lr, 1e-4)
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=4, batch_size=64)

In [None]:
# Train first layer
# (this model's Embedding was created without trainable=False, so the flag is
# already on; it is set again only to make the intent explicit)
model.layers[0].trainable = True
# Recompile so the new learning rate is actually used: fit() otherwise keeps
# the training function it built earlier.
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-5), metrics=['accuracy'])
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=2, batch_size=64)

### Submission

In [None]:
test_df = pd.read_csv(path+'test.tsv', sep='\t')
print(test_df.head())

# Index the phrases of the TEST file. Reading them from train_df would produce
# predictions for the training phrases, which neither match test_df.PhraseId
# nor have the same number of rows.
test_phrases = test_df['Phrase']
norm_test_phrases = map(str.lower, test_phrases.tolist())

# Reuse the training vocabulary, clipping and padding so the ids mean the same
# thing as during training.
test_idx_sentencelist = map_sentence2idx(norm_test_phrases, word2idx)
test_word_idx = norm_idx(test_idx_sentencelist, vocab_size, seq_len)

In [None]:
test_df.shape

In [None]:
print(np.array(test_word_idx).shape)
predictions = model.predict_classes(np.array(test_word_idx), batch_size=60)

In [None]:
import math
# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm

def predict_with_progress(model, sentences, batch_size):
    """Predict one value per sample, batch by batch, with a progress bar.

    Args:
        model: object exposing ``predict_on_batch(batch)``.
        sentences: array-like of model inputs, indexed along its first axis.
        batch_size: number of samples sent to the model per step (>= 1).

    Returns:
        1-D numpy array with one entry per input sample. When the model
        returns several columns per sample the entry is the index of the
        largest column (the predicted class); otherwise it is the model's
        output flattened.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got {}'.format(batch_size))
    sentences = np.asarray(sentences)
    num_samples = sentences.shape[0]
    # Start offset of every batch; the last batch may be shorter.
    starts = range(0, num_samples, batch_size)

    chunks = []
    for start in tqdm(starts, total=len(starts)):
        p = np.asarray(model.predict_on_batch(sentences[start:start + batch_size]))
        if p.ndim > 1 and p.shape[-1] > 1:
            # Per-class scores: reduce each row to its best class.
            p = p.argmax(axis=-1)
        chunks.append(p.reshape(-1))
    if not chunks:
        return np.zeros((0,))
    return np.concatenate(chunks)

In [None]:
# `predictions` comes from predict_classes above and is therefore already a
# 1-D array of class ids, on which argmax(axis=1) raises. Only reduce when a
# 2-D matrix of per-class scores is supplied instead.
predictions if predictions.ndim == 1 else predictions.argmax(axis=1)

In [None]:
predictions

In [None]:
# Assemble the submission table: one predicted Sentiment per test PhraseId.
# NOTE(review): the frame is only built here - no cell visible in this notebook
# writes it to disk.
agg = pd.DataFrame({'PhraseId': test_df.PhraseId, 'Sentiment': predictions})
# agg = agg[agg.columns[::-1]]