## Default import

In [20]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import numpy as np
import tensorflow as tf

from keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense

from slacker import Slacker

In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3381833774549083369
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 7524958208
locality {
  bus_id: 1
}
incarnation: 11321584302467032114
physical_device_desc: "device: 0, name: GeForce GTX 1080, pci bus id: 0000:03:00.0, compute capability: 6.1"
]


In [22]:
slack = Slacker('xoxp-554173958562-554173959170-555244937223-1f3cfc06ff8cc48d3a2ea00e6c682a7c')

if slack.api.test().successful:
    print(
        f"Connected to {slack.team.info().body['team']['name']}.")
else:
    print('Try Again!')
        
def report_stats(text, channel):
    """Report training stats"""
    r = slack.chat.post_message(channel=channel, text=text,
                                username='Code Report',
                                icon_emoji=':running:')

Connected to anneke@iitml.


## Default functions

In [3]:
def open_pickle(path):
    import pickle
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

### Load data

In [4]:
X_train_sequence = open_pickle('../../data/imdb/X_tr_sample_original.pkl')
X_test_sequence = open_pickle('../../data/imdb/X_te_sample_original.pkl')
y_train_target = open_pickle('../../data/imdb/y_tr_target_original.pkl')
y_test_target = open_pickle('../../data/imdb/y_te_target_original.pkl')

In [5]:
# y_train_target[1000:1100]

In [6]:
GLOVE_DIR = "../../data/glove.6B/"
GLOVE_DIM = 100

def extract_glove_index(file):
    embeddings_index = {}
    f = open(os.path.join(GLOVE_DIR, file), 'r')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    return embeddings_index

embeddings_index = extract_glove_index('glove.6B.100d.txt')
print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [7]:
max_encoder_seq_length = 81
max_decoder_seq_length = 5

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_sequence)

X_tr_padded = pad_sequences(tokenizer.texts_to_sequences(X_train_sequence), maxlen=81, padding='post', truncating='post')
y_tr_padded = pad_sequences(tokenizer.texts_to_sequences(y_train_target), maxlen=5, padding='post', truncating='post')

encoder_input_data = np.zeros(
    (len(X_train_sequence), max_encoder_seq_length, GLOVE_DIM),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(y_train_target), max_decoder_seq_length, GLOVE_DIM),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(y_train_target), max_decoder_seq_length, len(tokenizer.word_index)),
    dtype='float32')

train_sequence = []
target_sequence = []

for sample in X_train_sequence:
    train_sequence.append(sample.split())
for target in y_train_target:
    target_sequence.append(target.split())
    
# 100-dim -> input sequence, input decoder
# 42K-dim -> output sequence.

for i, (input_text, target_text, target_padded) in enumerate(zip(train_sequence, target_sequence, y_tr_padded)):
    for t, word in enumerate(input_text):
        try:
            encoder_input_data[i, t, :] = embeddings_index[word]
        except KeyError as error:
            continue
    
    for t, word in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        try:
            decoder_input_data[i, t, :] = embeddings_index[word]
        except KeyError as error:
            continue
        
    for t, word in enumerate(target_padded):
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, word] = 1.
    

In [20]:
X_tr_padded[5]

array([  34,   78,   38,   40,   12,  375,    1, 4285,  215,    2,    7,
        455,    9,   83, 2190,    5,    7,  291,    1,  153,  215,   62,
         25,   83,  442,  781,   43,   39,    4,   24,   60,   57,  246,
         76,   12,  328,    6,  475,   18,   10,  214,  781,    9,    1,
         98,  249,  953,   69,   33,  218,   67,   22,    3,  391,    5,
        126, 2220,   83, 4580,  705,  100,   62,   69,   49,  706,  111,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0], dtype=int32)

### Global Variable

In [8]:
LATENT_DIM = 100
NUM_ENCODER_TOKENS = np.max(X_tr_padded)
NUM_DECODER_TOKENS = np.max(X_tr_padded)
max_encoder_seq_length = X_tr_padded.shape[1]
max_decoder_seq_length = X_tr_padded.shape[1]

In [9]:
NUM_DECODER_TOKENS

42406

In [10]:
len(tokenizer.word_index)

42406

### Train the model 

In [2]:
# from keras.models import Model
# from keras.layers import Input, LSTM, Dense


# encoder_inputs = Input(shape=(None, GLOVE_DIM))
# encoder = LSTM(LATENT_DIM, return_state=True)
# encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# encoder_states = [state_h, state_c]

# decoder_inputs = Input(shape=(None, GLOVE_DIM))
# decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
# decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
#                                      initial_state=encoder_states)
# decoder_dense = Dense(NUM_DECODER_TOKENS, activation='softmax')
# decoder_outputs = decoder_dense(decoder_outputs)

# model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# model.compile(optimizer='adam', loss='categorical_crossentropy')

from keras.models import Model, load_model

model = load_model('100_glove_s2s_overfit.h5')

Using TensorFlow backend.


In [3]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None, 100)    0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None, 100)    0                                            
__________________________________________________________________________________________________
lstm_5 (LSTM)                   [(None, 100), (None, 80400       input_5[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, None, 100),  80400       input_6[0][0]                    
                                                                 lstm_5[0][1]                     
          

In [22]:
# epochs = 500
# batch_size = 128

In [23]:
# model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
#           batch_size=batch_size,
#           epochs=epochs,
#           verbose=1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 226/500
Epoch 227/500
Epoch 228/500
Epoch 229/500
Epoch 230/500
Epoch 231/500
Epoch 232/500
Epoch 233/500
Epoch 234/500
Epoch 235/500
Epoch 236/500
Epoch 237/500
Epoch 238/500
Epoch 239/500
Epoch 240/500
Epoch 241/500
Epoch 242/500
Epoch 243/500
Epoch 244/500
Epoch 245/500
Epoch 246/500
Epoch 247/500
Epoch 248/500
Epoch 249/500
Epoch 250/500
Epoch 251/500
Epoch 252/500
Epoch 253/500
Epoch 254/500
Epoch 255/500
Epoch 256/500
Epoch 257/500
Epoch 

KeyboardInterrupt: 

In [46]:
# save model

# model.save('100_glove_s2s.h5')

### Inference

In [11]:
# sampling models

# https://nlp.stanford.edu/~johnhew/public/14-seq2seq.pdf
# https://medium.com/machine-learning-bites/deeplearning-series-sequence-to-sequence-architectures-4c4ca89e5654

model = load_model('100_glove_s2s_overfit.h5')

encoder_inputs = model.input[0]
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]
decoder_state_input_h = Input(shape=(LATENT_DIM,), name='input_3')
decoder_state_input_c = Input(shape=(LATENT_DIM,), name='input_4')

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm = model.layers[3]

decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
decoder_inputs, initial_state=decoder_states_inputs)

decoder_states = [state_h_dec, state_c_dec]

decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
                    [decoder_inputs] + decoder_states_inputs,
                    [decoder_outputs] + decoder_states
                    )



In [12]:
word_index = tokenizer.word_index
reverse_word_index = dict((i,word) for word,i in word_index.items())

### Candidate Scoring

In [13]:
def generate_candidate_list(X):
    y_candidate = []
    
    for i in range(X.shape[0]-1-5):
        y_candidate.append(X[i:i+5])
    
    return np.asarray(y_candidate)

def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 

def target_index(doc_idx, candidate_seq, y):
    for i,j in enumerate(candidate_seq):
        if len(intersection(j, y)) == len(y):
            return i
    return -1

# doc num, doc index argmax

def to_sequence(int_sequence):
    decoded = ''
    for i,intnum in enumerate(int_sequence):
        if intnum == 0:
            word = '<PAD>'
        else:
            word = reverse_word_index[intnum]
        
        if i == len(int_sequence):
            decoded += word
        else:
            decoded += word + ' '
    return decoded

def rouge_one(true, candidate, start_index):
    
    if isinstance(true, str) and isinstance(candidate, str):
        true = true.split()
        candidate = candidate.split()
    
    overlap = [value for value in true[start_index:] if value in candidate[start_index:]] 

    
    if len(true[start_index:]) != 0:
        recall = len(overlap)/len(true[start_index:])
    else:
        recall = 0
    
    if len(candidate[start_index:]):
        precision = len(overlap)/len(candidate[start_index:])
    else:
        precision = 0
    
    if (recall+precision) != 0:    
        f1 = 2*((recall*precision)/(recall+precision))
    else:
        f1 = 0
    
    return recall, precision, f1

In [14]:
index_word = tokenizer.index_word

In [15]:
# Play with candidate

def decode_sequence_target(candidate_states_value, candidate_target_seq):
#     candidate_states_value = encoder_model.predict(input_seq)

    from_candidate_target_seq = np.zeros((1,1, GLOVE_DIM))
    
    candidate_token_index = candidate_target_seq[0,0]
    try:
        from_candidate_target_seq[0,0,:] = embeddings_index[index_word[candidate_token_index]]
    except KeyError as error:
        pass
    
    candidate_joint_log_prob = 0
    
    for i in range(1,5):
        from_candidate_output_tokens, h_true, c_true = decoder_model.predict([from_candidate_target_seq] + candidate_states_value)
    
        candidate_target_prob = from_candidate_output_tokens[0,-1, candidate_target_seq[0,i]]
        candidate_joint_log_prob += np.log(candidate_target_prob)
        
        # get the t+1 input
        
        candidate_token_index = candidate_target_seq[0,i]
        from_candidate_target_seq = np.zeros((1,1,GLOVE_DIM))
        try:
            from_candidate_target_seq[0,0,:] = embeddings_index[index_word[candidate_token_index]]
        except KeyError as error:
            pass
        
        
        candidate_states_value = [h_true, c_true]

    return candidate_joint_log_prob, candidate_target_prob

In [16]:
def generate_candidate_list(X):
    y_candidate = []
    
    for i in range(X.shape[0]-1-5):
        y_candidate.append(X[i:i+5])
    
    return np.asarray(y_candidate)

In [17]:
# get the overlap words. 
t = [0, 5, 6, 7]
s = [0, 6, 7, 3]

len(intersection(t,s))/len(t)

0.75

### Get overlap here

In [25]:
i = 0

# r = Rouge()
file = open("candidate_jll_glove_100_f1_update.csv", "w")

In [26]:
for doc in X_tr_padded:
    y_candidate = generate_candidate_list(doc)
    
    candidate_jll_per_doc = []
    input_seq = encoder_input_data[i:i+1]
    
    true_target_index = target_index(i, y_candidate, y_tr_padded[i])
#     print(y_candidate)
    # Encode
    candidate_states_value = encoder_model.predict(input_seq)
    
    for j in range(y_candidate.shape[0]):
        candidate_seq = y_candidate[j:j+1]
        candidate_jll_slide, candidate_last_prob = decode_sequence_target(candidate_states_value, candidate_seq)
        candidate_jll_per_doc.append(candidate_jll_slide)

    candidate_jll_per_doc = np.asarray(candidate_jll_per_doc)
    max_jll_index = np.argmax(candidate_jll_per_doc)
    true_target_jll = np.around(candidate_jll_per_doc[true_target_index],5)
    max_candidate_jll = np.around(candidate_jll_per_doc[max_jll_index],5)
    
    # get recall here
    [precision, recall, f_score] = rouge_one(y_train_target[i], to_sequence(y_candidate[max_jll_index]), 1)
    
    file.write('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n' %(i, true_target_index, y_train_target[i],
                                                            max_jll_index, to_sequence(y_candidate[max_jll_index]),
                                                            -(true_target_index-max_jll_index),
                                                            true_target_jll, max_candidate_jll,
                                                            np.absolute(true_target_jll-max_candidate_jll),
                                                            len(intersection(y_tr_padded[i], y_candidate[max_jll_index])),
                                                            np.exp(true_target_jll/4), np.exp(max_candidate_jll/4),
                                                            precision, recall, f_score))
    
#     print('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\n' %(i, true_target_index, y['text'][i],
#                                                             max_jll_index, to_sequence(y_candidate[max_jll_index]),
#                                                             -(true_target_index-max_jll_index),
#                                                             true_target_jll, max_candidate_jll,
#                                                             np.absolute(true_target_jll-max_candidate_jll),
#                                                             len(intersection(y['padded'][i], y_candidate[max_jll_index]))))

#     print('%s\t%s\t%.1f\n' %(y_train_target[i], to_sequence(y_candidate[max_jll_index]), precision, recall))
    if i % 1000 == 0:
#         print('Processing document %d...' %(i))
        msg = 'glove 100: processing document ' + str(i)
        report_stats(msg, 'deep-learning')
        print(msg)
        
    i += 1
    
file.close()
report_stats('Processing DONE', 'deep-learning')



glove 100: processing document 0
glove 100: processing document 1000
glove 100: processing document 2000
glove 100: processing document 3000
glove 100: processing document 4000
glove 100: processing document 5000
glove 100: processing document 6000
glove 100: processing document 7000
glove 100: processing document 8000
glove 100: processing document 9000
glove 100: processing document 10000
glove 100: processing document 11000
glove 100: processing document 12000
glove 100: processing document 13000
glove 100: processing document 14000
glove 100: processing document 15000
glove 100: processing document 16000
glove 100: processing document 17000
glove 100: processing document 18000
glove 100: processing document 19000
glove 100: processing document 20000
glove 100: processing document 21000
glove 100: processing document 22000


IndexError: index 42406 is out of bounds for axis 2 with size 42406

In [43]:
# file.close()

In [None]:
# from keras.callbacks import CSVLogger

# csv_logger = CSVLogger('training.log')

### Test on Test data

In [42]:
# start test data preprocessing

# X_te = open_pickle('../../data/imdb_sequence/3000_one_hot/X_te_seq_set.pkl')
# y_te = open_pickle('../../data/imdb_sequence/3000_one_hot/y_te_seq_set.pkl')

In [30]:
max_encoder_seq_length = 81
max_decoder_seq_length = 5

X_te_padded = pad_sequences(tokenizer.texts_to_sequences(X_test_sequence), maxlen=81, padding='post', truncating='post')
y_te_padded = pad_sequences(tokenizer.texts_to_sequences(y_test_target), maxlen=5, padding='post', truncating='post')

test_encoder_input_data = np.zeros(
    (len(X_test_sequence), max_encoder_seq_length, GLOVE_DIM),
    dtype='float32')
test_decoder_input_data = np.zeros(
    (len(y_test_target), max_decoder_seq_length, GLOVE_DIM),
    dtype='float32')
test_decoder_target_data = np.zeros(
    (len(y_test_target), max_decoder_seq_length, len(tokenizer.word_index)),
    dtype='float32')

test_sequence = []
test_target_sequence = []

for sample in X_test_sequence:
    test_sequence.append(sample.split())
for target in y_test_target:
    test_target_sequence.append(target.split())
    
# 100-dim -> input sequence, input decoder
# 42K-dim -> output sequence.

for i, (input_text, target_text, target_padded) in enumerate(zip(test_sequence, test_target_sequence, y_tr_padded)):
    for t, word in enumerate(input_text):
        try:
            test_encoder_input_data[i, t, :] = embeddings_index[word]
        except KeyError as error:
            continue
    
    for t, word in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        try:
            test_decoder_input_data[i, t, :] = embeddings_index[word]
        except KeyError as error:
            continue
        
    for t, word in enumerate(target_padded):
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            test_decoder_target_data[i, t - 1, word] = 1.
    

In [31]:
# Play with candidate

def test_decode_sequence_target(candidate_states_value, candidate_target_seq):
#     candidate_states_value = encoder_model.predict(input_seq)

    from_candidate_target_seq = np.zeros((1,1, GLOVE_DIM))
    
    candidate_token_index = candidate_target_seq[0,0]
    try:
        from_candidate_target_seq[0,0,:] = embeddings_index[index_word[candidate_token_index]]
    except KeyError as error:
        pass
    
    candidate_joint_log_prob = 0
    
    for i in range(1,5):
        from_candidate_output_tokens, h_true, c_true = decoder_model.predict([from_candidate_target_seq] + candidate_states_value)
    
        candidate_target_prob = from_candidate_output_tokens[0,-1, candidate_target_seq[0,i]]
        candidate_joint_log_prob += np.log(candidate_target_prob)
        
        # get the t+1 input
        
        candidate_token_index = candidate_target_seq[0,i]
        from_candidate_target_seq = np.zeros((1,1,GLOVE_DIM))
        try:
            from_candidate_target_seq[0,0,:] = embeddings_index[index_word[candidate_token_index]]
        except KeyError as error:
            pass
        
        
        candidate_states_value = [h_true, c_true]

    return candidate_joint_log_prob, candidate_target_prob

### Run Test

In [34]:
# i = 15380
# start = 15380
file = open("test_candidate_jll_glove_100_f1_update.csv", "w")

In [35]:
for i,doc in enumerate(X_te_padded):
    y_candidate = generate_candidate_list(doc)
    
    candidate_jll_per_doc = []
    input_seq = test_encoder_input_data[i:i+1]
    
    true_target_index = target_index(i, y_candidate, y_te_padded[i])
    
    # Encode
    candidate_states_value = encoder_model.predict(input_seq)
    
    for j in range(y_candidate.shape[0]):
        candidate_seq = y_candidate[j:j+1]
        candidate_jll_slide, candidate_last_prob = test_decode_sequence_target(candidate_states_value, candidate_seq)
        candidate_jll_per_doc.append(candidate_jll_slide)
   
    candidate_jll_per_doc = np.asarray(candidate_jll_per_doc)
    max_jll_index = np.argmax(candidate_jll_per_doc)
    true_target_jll = np.around(candidate_jll_per_doc[true_target_index],5)
    max_candidate_jll = np.around(candidate_jll_per_doc[max_jll_index],5)
    
    [precision, recall, f_score] = rouge_one(y_test_target[i], to_sequence(y_candidate[max_jll_index]), 1)
    
    file.write('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n' %(i, true_target_index, y_test_target[i],
                                                            max_jll_index, to_sequence(y_candidate[max_jll_index]),
                                                            -(true_target_index-max_jll_index),
                                                            true_target_jll, max_candidate_jll,
                                                            np.absolute(true_target_jll-max_candidate_jll),
                                                            len(intersection(y_te_padded[i], y_candidate[max_jll_index])),
                                                            np.exp(true_target_jll/4), np.exp(max_candidate_jll/4),
                                                            precision, recall, f_score))
    
    

    
#     print('%d\t%d\t%s\t%d\t%s\t%d\t%.5f\t%.5f\t%.5f\t%d\n' %(i, true_target_index, y['text'][i],
#                                                             max_jll_index, to_sequence(y_candidate[max_jll_index]),
#                                                             -(true_target_index-max_jll_index),
#                                                             true_target_jll, max_candidate_jll,
#                                                             np.absolute(true_target_jll-max_candidate_jll),
#                                                             len(intersection(y['padded'][i], y_candidate[max_jll_index]))))
    if i % 1000 == 0:
#         print('Processing document %d...' %(i))
        msg = 'glove 100 test: processing document ' + str(i)
        report_stats(msg, 'deep-learning')
        print(msg)
        
#     i += 1
    
file.close()
report_stats('Processing DONE', 'deep-learning')



glove 100 test: processing document 0
glove 100 test: processing document 1000
glove 100 test: processing document 2000
glove 100 test: processing document 3000
glove 100 test: processing document 4000
glove 100 test: processing document 5000
glove 100 test: processing document 6000
glove 100 test: processing document 7000
glove 100 test: processing document 8000
glove 100 test: processing document 9000
glove 100 test: processing document 10000
glove 100 test: processing document 11000
glove 100 test: processing document 12000
glove 100 test: processing document 13000
glove 100 test: processing document 14000
glove 100 test: processing document 15000
glove 100 test: processing document 16000
glove 100 test: processing document 17000
glove 100 test: processing document 18000
glove 100 test: processing document 19000
glove 100 test: processing document 20000
glove 100 test: processing document 21000
glove 100 test: processing document 22000


In [49]:
file.close()