In [None]:
from __future__ import print_function


import io, os
import urllib2

import numpy as np
np.random.seed(1337)  # for reproducibility

In [None]:
from nltk import word_tokenize
from keras.preprocessing import sequence
from passage.preprocessing import Tokenizer

# ANSI escape sequences: `red` switches the terminal to bold red text,
# `native` resets it to the default rendition.
red, native = '\033[01;31m', '\033[m'

## Get some data

First, let's download some text files to play with.

In [None]:
def get_and_save_file(url, filename):
    """Download `url` and store its content in `filename` as UTF-8 text.

    The response body is decoded as UTF-8 and written back out UTF-8 encoded.
    The destination is only opened once the download and decoding succeeded,
    so a failed fetch does not leave a truncated file behind.

    :param url: Address of the resource to fetch.
    :param filename: Path of the local file to create or overwrite.
    :raises UnicodeDecodeError: If the response body is not valid UTF-8.
    :raises IOError: If the URL cannot be opened or the file cannot be
        written (urllib's URLError is a subclass of IOError/OSError).
    """
    # Imported locally so the helper runs on Python 3 (urllib.request) as
    # well as on Python 2 (urllib2) without touching the notebook's imports.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib2 import urlopen
    from contextlib import closing

    # closing() releases the connection even if reading or decoding fails.
    with closing(urlopen(url)) as response:
        text = response.read().decode('utf-8')

    with io.open(filename, 'w', encoding='utf8') as fout:
        fout.write(text)

# Remote locations of the two text files used in this notebook.
input_lemmatized_url = 'https://raw.githubusercontent.com/alvations/stubboRNNess/master/cwi_inputs.lemmatized.txt'
output_label_url = 'https://raw.githubusercontent.com/alvations/stubboRNNess/master/cwi_labels.txt'

# Fetch each file and store it in the working directory under its remote name.
for _url, _filename in ((input_lemmatized_url, 'cwi_inputs.lemmatized.txt'),
                        (output_label_url, 'cwi_labels.txt')):
    get_and_save_file(_url, _filename)

## Now, we take a look at what's inside.

In [None]:
# Show only the first record of the inputs file: the raw line, then the text
# before the first ' <s> ' separator (focus word) and the text after it (context).
with io.open('cwi_inputs.lemmatized.txt', 'r', encoding='utf8') as fin:
    first = next(fin, None)  # None when the file is empty -> print nothing
    if first is not None:
        line = first.strip()

        print(red + 'Line:' + native)
        print(line, end='\n\n')

        fields = line.split(' <s> ')
        print(red + 'Focus Word:' + native)
        print(fields[0], end='\n\n')
        print(red + 'Context:' + native)
        print(fields[1], end='\n\n')
        
# Show only the first line of the labels file, stripped of surrounding whitespace.
with io.open('cwi_labels.txt', 'r', encoding='utf8') as fin:
    first = next(fin, None)  # None when the file is empty -> print nothing
    if first is not None:
        print(red + 'Label:' + native)
        line = first.strip()
        print(line, end='\n\n')

## Converting strings to ids

In [None]:
# Collect the context part (the text after the first ' <s> ' separator) of every
# input line, lower-cased and stripped.
# There are repeated sentences in this dataset, so duplicates are dropped.
# First-seen order is kept instead of using list(set(...)): the iteration order
# of a set of strings can change between interpreter runs (hash randomisation),
# which would make train_text[0] / train_text[3:5] below differ from run to run.
train_text = []
seen_contexts = set()
with io.open('cwi_inputs.lemmatized.txt', 'r', encoding='utf8') as fin:
    for line in fin:
        context = line.split(' <s> ')[1].lower().strip()
        if context not in seen_contexts:
            seen_contexts.add(context)
            train_text.append(context)

In [None]:
# Create a Tokenizer with default settings and fit it on the de-duplicated
# context sentences, keeping the transformed corpus in `train_tokens`.
# NOTE(review): presumably fit_transform learns the vocabulary and returns one
# sequence of token IDs per sentence — confirm against passage.preprocessing.
tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)

In [None]:
# Display the fitted tokenizer's mapping from tokens to their "dictionary" IDs.
tokenizer.encoder

In [None]:
# Display the reverse mapping: from the "dictionary" IDs back to the tokens.
tokenizer.decoder

In [None]:
# Let's look at a sentence: the first entry of train_text, still a plain string.
train_text[0]

In [None]:
# This is how you convert a SINGLE sentence to its IDs.
# Note that transform() is given a one-element slice (a list), not a bare string.
tokenizer.transform(train_text[0:1])

In [None]:
# This is how you convert a list of sentences to their IDs
# (here the two sentences at indices 3 and 4).
tokenizer.transform(train_text[3:5])

## Limiting the vocabulary size

We can limit the vocabulary size of our tokenizer by using the `max_features` parameter.

In [None]:
# Rebuild the tokenizer with max_features=20 and refit it on the same sentences.
# This rebinds both `tokenizer` and `train_tokens` from the earlier cell.
tokenizer = Tokenizer(max_features = 20)
train_tokens = tokenizer.fit_transform(train_text)

In [None]:
# Re-encode the first sentence with the size-limited tokenizer, for comparison
# with the output of the unrestricted tokenizer above.
tokenizer.transform(train_text[0:1])

In [None]:
# Display the ID-to-token mapping of the size-limited tokenizer.
tokenizer.decoder

## The IMDB dataset

Now that we know how to preprocess our corpus into IDs quickly, we'll look at a pre-processed dataset used for sentiment analysis. 

In [None]:
from keras.datasets import imdb

# Vocabulary cap; passed to imdb.load_data() as `nb_words` below.
max_features = 20000
maxlen = 80  # cut texts after this number of words (among top max_features most common words)
# NOTE(review): batch_size is not referenced in the cells shown here —
# presumably meant for a later training step; confirm or remove.
batch_size = 32

print('Loading data...')
# load_data returns the (inputs, labels) pairs for the train and the test part.
# NOTE(review): `nb_words` / `test_split` are the keyword names this notebook
# was written against; newer Keras releases may name them differently — confirm
# against the installed version.
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features,
                                                      test_split=0.2)

In [None]:
# Let's take a look at the first two training sequences.
X_train[:2] 

In [None]:
# Lengths of the first two training sequences.
print(list(map(len, X_train[:2])))

In [None]:
# First two entries of y_train (presumably the sentiment labels belonging to
# the two training sequences shown above — confirm).
y_train[:2]

In [None]:
# First two sequences of the test part.
X_test[:2]

In [None]:
# First two entries of y_test (presumably the labels of the two test sequences
# shown above — confirm).
y_test[:2]

## Now what we want to do is truncate pathologically long sentences (and pad the short ones) so that every sequence has the same length.

In [None]:
print('Pad sequences (samples x time)')
# Bring every sequence to `maxlen` entries (80, set in the loading cell):
# longer texts are cut, shorter ones are padded. See the Keras pad_sequences
# documentation for which end is padded/truncated by default.
# NOTE: X_train / X_test are overwritten here, so the original variable-length
# sequences are no longer available after this cell has run.
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [None]:
# First two training rows after pad_sequences, for comparison with the
# variable-length sequences displayed earlier.
X_train[:2]

In [None]:
# Let's try redoing pad_sequences() with a larger maxlen.
max_features = 20000
maxlen = 170  # cut texts after this number of words (among top max_features most common words)

# The data is loaded again because the earlier padding cell overwrote
# X_train / X_test with their maxlen=80 versions; padding those to 170 would
# not bring back the words that were already cut.
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features,
                                                      test_split=0.2)
print('Pad sequences (samples x time)')
# As before, X_train / X_test are replaced by their fixed-length versions.
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [None]:
# First two training rows after re-padding with maxlen=170.
X_train[:2]

In [None]:
# Lengths of the first two training rows after padding with maxlen=170.
print(list(map(len, X_train[:2])))