In [1]:
%matplotlib inline
from __future__ import print_function, division
import utils; reload(utils)
from utils import *
from IPython.display import FileLink

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5103)
Using Theano backend.


In [2]:
# Dataset locations. Every directory string ends with '/' because the cells
# below build file names by plain string concatenation.
data_path = 'data/rt/'
sample_path = 'data/rt/sample/'
path = data_path
model_path = path + 'models/'
results_path = path + 'results/'
home_dir = os.path.realpath('.')

# Create whichever working directories are still missing.
for directory in (path, model_path, sample_path, results_path):
    if os.path.exists(directory):
        continue
    os.makedirs(directory)

### Download and Unzip Dataset

In [None]:
# Download the competition files into the data directory with the `kg`
# command-line tool, then change back to the directory the notebook started in.
%cd {path}
!kg download -c 'sentiment-analysis-on-movie-reviews'
%cd {home_dir}

In [None]:
# Unzip the downloaded train and test archives
import zipfile
def unzip_file(filename, path, outputpath=None):
    """Extract the zip archive ``path + filename`` into ``outputpath``.

    Args:
        filename: Name of the archive; it is appended verbatim to ``path``.
        path: Directory prefix of the archive. It is concatenated with
            ``filename`` as-is, so it must end with a path separator.
        outputpath: Directory to extract into. Defaults to ``path``. It is
            created when it does not exist yet.

    When the archive does not exist nothing is extracted and no error is
    raised (the output directory is still created).
    """
    outputpath = outputpath or path
    filepath = path + filename
    if not os.path.exists(outputpath):
        os.makedirs(outputpath)
    if os.path.isfile(filepath):
        print('Unzipping file:', filepath)
        # The context manager closes the archive even when extraction fails.
        with zipfile.ZipFile(filepath, 'r') as zip_ref:
            zip_ref.extractall(outputpath)

# Extract both competition archives in place (outputpath defaults to `path`).
unzip_file('test.tsv.zip', path)
unzip_file('train.tsv.zip', path)

### Prepare Dataset - index words

In [3]:
reviews = pd.read_csv(path+'train.tsv', sep='\t')

In [4]:
print(reviews.shape)
reviews[:5]

(156060, 4)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
# For every sentence, pick the phrase with the smallest PhraseId. In the sample
# rows shown above that phrase is the complete sentence - presumably this holds
# for the whole file (verify). The result is only used by the commented-out
# alternative in the next cell.
full_phrase_idx = reviews.groupby('SentenceId')['PhraseId'].min()
is_full_phrase = reviews['PhraseId'].isin(full_phrase_idx.values)
full_phrases_df = reviews.loc[is_full_phrase]
print(type(full_phrases_df))

<class 'pandas.core.frame.DataFrame'>


In [6]:
# Train on every phrase; swap in the commented line to use full sentences only.
# train_df = full_phrases_df.copy()
train_df = reviews.copy()

train_phrases = train_df['Phrase']
# Lower-case every phrase. Built as a real list (not `map`) so that it can be
# sliced and iterated more than once on Python 3 as well as Python 2.
norm_phrases = [phrase.lower() for phrase in train_phrases.tolist()]

norm_phrases[:5]

['a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
 'a series of escapades demonstrating the adage that what is good for the goose',
 'a series',
 'a',
 'series']

## Future improvements:

Use `keras.preprocessing.text.Tokenizer` instead of tokenizing everything manually.

### Word to index

In [7]:
from collections import Counter
def sorted_wordlist(phrases):
    """Return the distinct words of ``phrases``, most frequent first.

    Every phrase is split on whitespace and the occurrences of each word are
    counted over all phrases. The number of distinct words is printed as a
    side effect.
    """
    wordcounts = Counter()
    for tokens in map(str.split, phrases):
        wordcounts.update(tokens)
    print('Words:', len(wordcounts))
    return [word for word, _count in wordcounts.most_common()]

In [8]:
wordlist = sorted_wordlist(norm_phrases)

Words: 16531


In [9]:
# Word indices start at 1 because 0 is reserved for padding.
word2idx = {word: idx for idx, word in enumerate(wordlist, 1)}
idx2word = dict(enumerate(wordlist, 1))
len(idx2word)

16531

In [10]:
from functools import partial

def map_sentence2idx(sentences, word2idx):
    """Convert every sentence in ``sentences`` to a list of word indices.

    Returns a list (one entry per sentence) of lists of ints, so the result
    can be measured with ``len`` and iterated repeatedly on Python 2 and 3.
    """
    return [sentence2idx(word2idx, sentence) for sentence in sentences]


def sentence2idx(wordMap, sentence):
    """Map the whitespace-separated words of ``sentence`` to their indices.

    Args:
        wordMap: dict from word to integer index.
        sentence: string to convert.

    Returns:
        A list with one index per word. Words missing from ``wordMap`` get
        ``len(wordMap)``. With a 1-based map such as ``word2idx`` that value
        is also the index of the last (least frequent) word; ``norm_idx``
        later clips every index >= vocab_size - 1 to the shared rare-word id.
    """
    unknown_idx = len(wordMap)
    return [wordMap.get(word, unknown_idx) for word in sentence.split()]

idx_sentencelist = map_sentence2idx(norm_phrases, word2idx)

In [11]:
# Look at the distribution of sentence lengths (in tokens). A list
# comprehension is used because `np.array(map(...))` wraps the lazy map object
# in a 0-d object array on Python 3 instead of producing a vector of lengths.
lens = np.array([len(s) for s in idx_sentencelist])
(lens.max(), lens.min(), lens.mean())

(52, 0, 7.2033640907343326)

In [12]:
def norm_idx(wordidx, vocab_size=8000, seq_len=55):
    """Clip word indices to the vocabulary and pad sequences to a fixed length.

    Args:
        wordidx: iterable of index sequences, one per sentence.
        vocab_size: every index >= vocab_size - 1 is replaced by
            vocab_size - 1, which therefore acts as the shared rare-word id.
        seq_len: length passed as ``maxlen`` to ``sequence.pad_sequences``;
            the padding value is 0.

    Returns:
        A list with one numpy array per sentence, copied from the rows that
        ``pad_sequences`` returns.
    """
    rare_idx = vocab_size - 1
    clipped = []
    for sentence in wordidx:
        clipped.append(np.array([min(i, rare_idx) for i in sentence]))
    padded = sequence.pad_sequences(clipped, maxlen=seq_len, value=0)
    return [np.array(row) for row in padded]

In [13]:
# Limit vocab size
vocab_size = 8000
# Pad (with zero) or truncate to max sentence length
seq_len = 55

normalized_word_idx = norm_idx(idx_sentencelist, vocab_size, seq_len)

In [14]:
train_df['word_idx'] = pd.Series(normalized_word_idx, index=train_df.index)

In [15]:
train_df[:10]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,word_idx
0,1,1,A series of escapades demonstrating the adage ...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,1,A series of escapades demonstrating the adage ...,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,1,A series,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,1,A,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,1,series,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,6,1,of escapades demonstrating the adage that what...,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,7,1,of,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,8,1,escapades demonstrating the adage that what is...,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,9,1,escapades,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,10,1,demonstrating the adage that what is good for ...,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Save DF

In [None]:
# Reload the DataFrame cached by the `pd.to_pickle` cell below. This replaces
# the in-memory `train_df` and needs the pickle to exist already, so run it only
# when skipping the preprocessing cells above.
train_df = pd.read_pickle(results_path+'train_idx.p')

In [None]:
# Cache the indexed DataFrame (including the `word_idx` column) on disk.
pd.to_pickle(train_df, results_path+'train_idx.p')

### Creating training and test set

In [161]:
# Split by sentence, not by phrase: all phrases of a sentence land on the same
# side, so sub-phrases of a training sentence never appear in the held-out set.
perm = np.random.permutation(train_df.SentenceId.unique())
train_sentence_ids = perm[:int(len(perm)*.8)]
in_train = train_df.SentenceId.isin(train_sentence_ids)
# Shuffle the training rows; the held-out rows are everything else.
train = train_df.loc[in_train].sample(frac=1)
# The original cell read from `shuffle`, a name that is not defined anywhere in
# this notebook; the complement is taken from `train_df` itself instead.
test = train_df.loc[~in_train]

In [168]:
(train.shape, test.shape)

((124885, 5), (31175, 5))

In [163]:
def _to_arrays(frame):
    """Return (inputs, one-hot labels) built from a frame's word_idx / Sentiment columns."""
    inputs = np.array(frame.word_idx.tolist())
    labels = onehot(np.array(frame.Sentiment.tolist()))
    return inputs, labels

x_train, labels_train = _to_arrays(train)
x_test, labels_test = _to_arrays(test)

In [164]:
(x_train.shape, labels_train.shape)

((124885, 55), (124885, 5))

### Load Cached Data

In [None]:
# NOTE(review): `trn` and `val` are only touched by the save/load cells right
# below and by no other cell visible in this notebook; `get_data` presumably
# comes from `utils`. Confirm these cells are still needed for this dataset.
trn = get_data(path+'train')
val = get_data(path+'valid')

In [None]:
save_array(results_path+'val.dat', val)
save_array(results_path+'trn.dat', trn)

In [None]:
val = load_array(results_path+'val.dat')
trn = load_array(results_path+'trn.dat')

### Simple NN

In [None]:
# Baseline: learn a 32-d embedding per word index, flatten the padded sequence
# and classify it with one hidden layer; one softmax unit per label column.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=32, input_length=seq_len),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(labels_train.shape[1], activation='softmax'),
])

In [None]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=2, batch_size=64)

### Conv layer

In [None]:
# Convolutional variant of the baseline: 32 filters spanning 5 tokens each,
# followed by max-pooling, with dropout around the convolution.
model = Sequential([
    Embedding(vocab_size, 32, input_length=seq_len, dropout=0.2),
    Dropout(0.2),
    Conv1D(32, 5, border_mode='same', activation='relu'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(labels_train.shape[1], activation='softmax'),
])

In [None]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

In [None]:
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=2, batch_size=64)

In [None]:
# Continue training with a learning rate of 0.01. Assigning a float to
# `model.optimizer.lr` after the model has been fitted does not reach the
# training function that was already built, so the model is recompiled with a
# fresh optimizer instead (this also resets Adam's running moment estimates).
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=5, batch_size=64)

### Using pretrained embeddings

In [19]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.

    Args:
        dataset: Dataset name such as '6B.100d'. It is used as the file name
            and to build the download URL; names without a known md5 sum are
            downloaded without hash verification.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    # Pure-Python replacement for the `%mkdir -p` shell magic, so the function
    # is valid Python and does not depend on IPython or a shell.
    if not os.path.isdir(glove_path):
        os.makedirs(glove_path)
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset),
                    untar=True)

In [20]:
def load_vectors(loc):
    """Load the three files that make up a word-vector dataset.

    Args:
        loc: Path prefix; the files read are ``loc + '.dat'``,
            ``loc + '_words.pkl'`` and ``loc + '_idx.pkl'``.

    Returns:
        A tuple ``(vectors, words, word_index)``: the array returned by
        ``load_array`` followed by the two unpickled objects, in that order.
    """
    vectors = load_array(loc + '.dat')
    # NOTE(review): pickle.load can execute arbitrary code; only use this with
    # files from a trusted source.
    with open(loc + '_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file)
    with open(loc + '_idx.pkl', 'rb') as idx_file:
        word_index = pickle.load(idx_file)
    return vectors, words, word_index

In [113]:
vecs, words, wordidx = load_vectors(get_glove_dataset('6B.100d'))

Downloading data from http://files.fast.ai/models/glove/6B.100d.tgz
Untaring file...


In [114]:
def create_emb():
    """Build the initial embedding matrix for the notebook vocabulary.

    Reads the notebook globals ``vecs``, ``wordidx``, ``idx2word`` and
    ``vocab_size`` (``re`` and ``normal`` presumably come from the ``utils``
    star import - confirm). ``idx2word`` must contain every index from 1 to
    ``vocab_size - 1``.

    Returns:
        Array of shape ``(vocab_size, vecs.shape[1])``. Row 0 (the padding
        index) stays all zeros; the whole matrix is divided by 3 at the end.
    """
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    # Index 0 is skipped: it is the padding index and has no entry in idx2word.
    for i in range(1,len(emb)):
        word = idx2word[i]
        # Use the pretrained vector only for words that are in the glove index
        # AND consist solely of ASCII letters, digits and hyphens.
        if word in wordidx and re.match(r"^[a-zA-Z0-9\-]*$", word):
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # Word missing from glove or rejected by the pattern above:
            # randomly initialize
            print('Could not find word in glove:', word)
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    # (the last row is the index norm_idx clips out-of-vocabulary words to)
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    # Scales every row, pretrained and random alike; the reason for the factor
    # 3 is not stated in this notebook.
    emb/=3
    return emb

In [None]:
emb = create_emb()

In [None]:
# Convolutional classifier on top of frozen pretrained embeddings.
model = Sequential([
    # The embedding width is read from `emb` so that it always matches the
    # pretrained matrix (a hard-coded 50 does not fit the 6B.100d vectors
    # loaded above).
    Embedding(vocab_size, emb.shape[1], input_length=seq_len, dropout=0.05,
              weights=[emb], trainable=False),
    BatchNormalization(),
    Convolution1D(64, 6, border_mode='same', activation='relu'),
    BatchNormalization(),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    BatchNormalization(),
    # Softmax (as in the earlier models): the five sentiment classes are
    # mutually exclusive and the loss is categorical cross-entropy.
    Dense(5, activation='softmax')
])

In [None]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [None]:
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=4, batch_size=64)

In [None]:
# Continue training with a learning rate of 0.01. Assigning a float to
# `model.optimizer.lr` after the model has been fitted does not reach the
# training function that was already built, so the model is recompiled with a
# fresh optimizer instead (this also resets Adam's running moment estimates).
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=4, batch_size=64)

In [None]:
# Fine-tune the embedding layer as well, at a lower learning rate. Both the
# `trainable` flag and the learning rate are only picked up when the model is
# compiled, so compile again before fitting.
model.layers[0].trainable = True
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-4), metrics=['accuracy'])
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=2, batch_size=64)

### Using multi-convnets

In [171]:
latent_factors = 100
# This sub-model is applied to the output of the Embedding layer, whose
# per-sample shape is (seq_len, latent_factors) - not (vocab_size, ...).
graph_in = Input((seq_len, latent_factors))
convs = []
# One convolution branch per filter width (2 to 5 tokens); each branch is
# pooled and flattened, then all branches are concatenated.
for fsz in range(2, 6):
    x = Convolution1D(100, fsz, border_mode='same', activation='relu')(graph_in)
    x = MaxPooling1D(pool_length=2)(x)
    x = Flatten()(x)
    convs.append(x)
out = merge(convs, mode='concat')

graph = Model(graph_in, out)

In [172]:
emb = create_emb()

Could not find word in glove: ,
Could not find word in glove: .
Could not find word in glove: 's
Could not find word in glove: n't
Could not find word in glove: '
Could not find word in glove: -rrb-
Could not find word in glove: -lrb-
Could not find word in glove: `
Could not find word in glove: ...
Could not find word in glove: ``
Could not find word in glove: ''
Could not find word in glove: 're
Could not find word in glove: :
Could not find word in glove: 've
Could not find word in glove: 'll
Could not find word in glove: ;
Could not find word in glove: ?
Could not find word in glove: mr.
Could not find word in glove: 'd
Could not find word in glove: !
Could not find word in glove: 'm
Could not find word in glove: ms.
Could not find word in glove: &
Could not find word in glove: vs.
Could not find word in glove: $
Could not find word in glove: 'em
Could not find word in glove: '70s
Could not find word in glove: \/
Could not find word in glove: j.
Could not find word in glove: '60s
C

In [173]:
# Pretrained (trainable) embeddings -> multi-width convolution sub-model ->
# two dense layers -> 5-way classifier.
model = Sequential([
    Embedding(vocab_size, latent_factors, input_length=seq_len, dropout=.2, weights=[emb]),
    Dropout(0.1),
    graph,
    BatchNormalization(),
    Dropout(0.1),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.15),
    Dense(70, activation='relu'),
    BatchNormalization(),
    Dropout(0.15),
    # Softmax (as in the earlier models): the five sentiment classes are
    # mutually exclusive and the loss is categorical cross-entropy.
    Dense(5, activation='softmax')
])

In [174]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=5, batch_size=64)

Train on 124885 samples, validate on 31175 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0a3fcae890>

In [175]:
# Continue training with a learning rate of 1e-4. Assigning a float to
# `model.optimizer.lr` after the model has been fitted does not reach the
# training function that was already built, so the model is recompiled with a
# fresh optimizer instead (this also resets Adam's running moment estimates).
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-4), metrics=['accuracy'])
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=2, batch_size=64)

Train on 124885 samples, validate on 31175 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f0a941eab90>

In [132]:
# Keep training with the first (embedding) layer trainable, at a learning rate
# of 1e-5. Both the `trainable` flag and the learning rate are only picked up
# when the model is compiled, so compile again before fitting.
model.layers[0].trainable = True
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-5), metrics=['accuracy'])
model.fit(x_train, labels_train, validation_data=(x_test, labels_test), nb_epoch=2, batch_size=64)

Train on 124848 samples, validate on 31212 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f0a8c738b10>

In [182]:
model.save_weights(model_path+'conv1-627pct.h5')

### Submission

In [72]:
test_df = pd.read_csv(path+'test.tsv', sep='\t')
print(test_df.head())

# test_phrases = train_df['Phrase'][:10]
test_phrases = test_df['Phrase']
# Lower-case every phrase. Built as a real list (not `map`) so that the result
# is not a one-shot iterator on Python 3.
norm_test_phrases = [phrase.lower() for phrase in test_phrases.tolist()]

# Same indexing and padding as for the training phrases, reusing the training
# vocabulary (`word2idx`).
test_idx_sentencelist = map_sentence2idx(norm_test_phrases, word2idx)
test_word_idx = norm_idx(test_idx_sentencelist, vocab_size, seq_len)

   PhraseId  SentenceId                                             Phrase
0    156061        8545  An intermittently pleasing but mostly routine ...
1    156062        8545  An intermittently pleasing but mostly routine ...
2    156063        8545                                                 An
3    156064        8545  intermittently pleasing but mostly routine effort
4    156065        8545         intermittently pleasing but mostly routine


In [74]:
np.array(test_word_idx).shape

(66292, 55)

In [75]:
# print(np.array(test_word_idx).shape)
# predictions = model.predict_classes(np.array(test_word_idx), batch_size=60)

In [76]:
import math
# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm

# def batchify(sentences, batch_size):
#     itertools.islice(batch, batch_size)


def split_every(n, iterable):
    """Yield consecutive chunks of at most ``n`` items as numpy arrays.

    The last chunk may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        chunk = list(itertools.islice(source, n))
        if not chunk:
            return
        yield np.array(chunk)
        
def predict_with_progress(model, sentences, batch_size):
    """Run ``model.predict_on_batch`` over ``sentences`` with a progress bar.

    Args:
        model: Object exposing ``output_shape`` and ``predict_on_batch``.
        sentences: Array of inputs; ``sentences.shape[0]`` is the sample count.
        batch_size: Maximum number of samples per prediction call.

    Returns:
        Array of shape ``(num_samples,) + model.output_shape[1:]`` holding the
        predictions in input order.
    """
    num_samples = sentences.shape[0]
    # tqdm expects an integer total; float() keeps the division exact even
    # where true division is not enabled.
    num_batches = int(math.ceil(num_samples / float(batch_size)))
    p_results = np.zeros((num_samples,) + model.output_shape[1:])
    current_index = 0
    # split_every only yields non-empty arrays and stops on its own after the
    # last sample, so the loop runs to exhaustion and lets tqdm finish its bar.
    for batch in tqdm(split_every(batch_size, sentences), total=num_batches):
        p = model.predict_on_batch(batch)
        new_index = current_index + p.shape[0]
        p_results[current_index:new_index] = p
        current_index = new_index
    return p_results

In [193]:
predictions = predict_with_progress(model, np.array(test_word_idx), batch_size=60)

In [194]:
cat_predictions = predictions.argmax(axis=1)

In [79]:
(cat_predictions.shape, test_df.PhraseId.shape)

((66292,), (66292,))

In [195]:
agg = pd.DataFrame({'PhraseId': test_df.PhraseId, 'Sentiment': cat_predictions})
# agg = agg[agg.columns[::-1]]
agg.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,3


In [196]:
agg.to_csv(path+'submission2.csv', index=False)

In [197]:
!kg submit {path+'submission2.csv'} -c 'sentiment-analysis-on-movie-reviews'


1099/|/ 99%|| 1099/1105.0 [00:25<00:00, 43.87it/s]

## Pseudo labeling

In [190]:
# Pseudo-labelling: extend the training set with the test inputs, labelled with
# the classes the model itself predicted for them.
test_inputs = np.array(test_word_idx)
pseudo_labels = onehot(cat_predictions)
ps_x_train = np.concatenate([x_train, test_inputs])
ps_labels_train = np.concatenate([labels_train, pseudo_labels])


In [184]:
(x_train.shape, np.array(test_word_idx).shape)

(124885, 55)

In [189]:
(labels_train.shape, onehot(cat_predictions).shape)

((124885, 5), (66292, 5))

In [192]:
model.fit(ps_x_train, ps_labels_train, validation_data=(x_test, labels_test), nb_epoch=2, batch_size=64)

Train on 191177 samples, validate on 31175 samples
Epoch 1/1


<keras.callbacks.History at 0x7f0a934f9390>