In [2]:
import os
import re
import tarfile
import tqdm

import requests

from pugnlp.futil import path_status, find_files
import numpy as np  # Keras takes care of most of this but it likes to see Numpy arrays
from keras.preprocessing import sequence    # A helper module to handle padding input
from keras.models import Sequential         # The base keras Neural Network model
from keras.layers import Dense, Dropout, Activation   # The layer objects we will pile into the model
from keras.layers import Conv1D, GlobalMaxPooling1D

In [3]:
# From the nlpia package for downloading data too big for the repo

BIG_URLS = {
    'w2v': (
        'https://www.dropbox.com/s/965dir4dje0hfi4/GoogleNews-vectors-negative300.bin.gz?dl=1',
        1647046227,
    ),
    'slang': (
        'https://www.dropbox.com/s/43c22018fbfzypd/slang.csv.gz?dl=1',
        117633024,
    ),
    'tweets': (
        'https://www.dropbox.com/s/5gpb43c494mc8p0/tweets.csv.gz?dl=1',
        311725313,
    ),
    'lsa_tweets': (
        'https://www.dropbox.com/s/rpjt0d060t4n1mr/lsa_tweets_5589798_2003588x200.tar.gz?dl=1',
        3112841563,  # 3112841312,
    ),
    'imdb': (
        'https://www.dropbox.com/s/yviic64qv84x73j/aclImdb_v1.tar.gz?dl=1',
        3112841563,  # 3112841312,
    ),
}

In [4]:
# These functions are part of the nlpia package which can be pip installed and run from there.
def dropbox_basename(url):
    filename = os.path.basename(url)
    match = re.findall(r'\?dl=[0-9]$', filename)
    if match:
        return filename[:-len(match[0])]
    return filename

def download_file(url, data_path='.', filename=None, size=None, chunk_size=4096, verbose=True):
    """Uses stream=True and a reasonable chunk size to be able to download large (GB) files over https"""
    if filename is None:
        filename = dropbox_basename(url)
    file_path = os.path.join(data_path, filename)
    if url.endswith('?dl=0'):
        url = url[:-1] + '1'  # noninteractive download
    if verbose:
        tqdm_prog = tqdm
        print('requesting URL: {}'.format(url))
    else:
        tqdm_prog = no_tqdm
    r = requests.get(url, stream=True, allow_redirects=True)
    size = r.headers.get('Content-Length', None) if size is None else size
    print('remote size: {}'.format(size))

    stat = path_status(file_path)
    print('local size: {}'.format(stat.get('size', None)))
    if stat['type'] == 'file' and stat['size'] == size:  # TODO: check md5 or get the right size of remote file
        r.close()
        return file_path

    print('Downloading to {}'.format(file_path))

    with open(file_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)

    r.close()
    return file_path

def untar(fname):
    if fname.endswith("tar.gz"):
        with tarfile.open(fname) as tf:
            tf.extractall()
    else:
        print("Not a tar.gz file: {}".format(fname))

In [5]:
download_file(BIG_URLS['w2v'][0])

requesting URL: https://www.dropbox.com/s/965dir4dje0hfi4/GoogleNews-vectors-negative300.bin.gz?dl=1
remote size: 1647046227
local size: None
Downloading to ./GoogleNews-vectors-negative300.bin.gz


'./GoogleNews-vectors-negative300.bin.gz'

In [6]:
untar(download_file(BIG_URLS['imdb'][0]))

requesting URL: https://www.dropbox.com/s/yviic64qv84x73j/aclImdb_v1.tar.gz?dl=1
remote size: 84125825
local size: None
Downloading to ./aclImdb_v1.tar.gz


In [3]:
import glob
import os

from random import shuffle

def pre_process_data(filepath):
    """
    This is dependent on your training data source but we will try to generalize it as best as possible.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    
    pos_label = 1
    neg_label = 0
    
    dataset = []
    
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((pos_label, f.read()))
            
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((neg_label, f.read()))
    
    shuffle(dataset)
    
    return dataset

dataset = pre_process_data('./aclImdb/train')
print(dataset[0])

(1, 'Richard Chamberlain is David Burton, a tax lawyer living in Sydney, Australia who is drawn into a murder trial defending five Aboriginal men accused of murdering a fellow native in Peter Weir\'s apocalyptic 1977 thriller The Last Wave. Taking up where Picnic at Hanging Rock left off, the film goes deeper into exploring the unknown and, in the process, shows the gulf between two cultures who live side by side but lack understanding of each others culture and traditions. Weir shows how white society considers the native beliefs to be primitive superstitions and believes that since they are living in the cities and have been "domesticated", their tribal laws and culture no longer apply. <br /><br />From the start, Burton is drawn deeper and deeper into a strange web of visions and symbols where the line between real time and "dream time" evaporates. Water plays an important symbolic role in the film from the opening sequence in which a sudden thunder and hailstorm interrupts a peacef

In [22]:
len(dataset)

25000

In [26]:
dataset[16000]

(0,
 'I\'m glad I rented this movie for one reason: its shortcomings made me want to read Allende\'s book and get the full story. <br /><br />Pros: the movie is beautiful, the period is depicted well and consistently (to the best of my knowledge), and Meryl and Glenn do good jobs.<br /><br />Cons: This is the worst acting job I\'ve ever seen from Jeremy Irons--I kept wondering if something was wrong with his mouth. (And I hate the terribly English way he says "Transito.") Winona Ryder does nothing believable except look young and idealistic. Most of the other performances are OK, but so few things hang together in the character arcs and the relationship development that I was frustrated and angry well before the end. <br /><br />I\'m very curious now whether this movie is typical of Bille August\'s work. I may have to drop another couple of bucks to rent Smilla\'s Sense of Snow.')

# Data Status 1
We read in the imdb training data.  This is a balanced set 12,500 pos and 12,500 neg movie reviews.

The data is in a single list of tuples of format: (label(0 or 1), review(string))

The data has been shuffled using random.shuffle

In [5]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz',
                                                 binary=True,
                                                 limit=200000)

def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    expected = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])

            except KeyError:
                pass  # No matching token in the Google w2v vocab
            
        vectorized_data.append(sample_vecs)

    return vectorized_data

def collect_expected(dataset):
    """ Peel of the target values from the dataset """
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected

In [7]:
%%time
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

CPU times: user 27.8 s, sys: 193 ms, total: 28 s
Wall time: 28 s


In [28]:
# Primary shape is still the number of documents (in this case movie reviews)  There are 25000
# second print line just proves that the label list is the same length
print(len(vectorized_data))
print(len(expected))

25000
25000


In [30]:
# average number of tokens per movie review
len(vectorized_data[1])

444

In [36]:
avgLength = sum([len(x) for x in vectorized_data])/len(vectorized_data)
maxLength = max([len(x) for x in vectorized_data])
minLength = min([len(x) for x in vectorized_data])

print('Average review length is {}'.format(avgLength))
print('Max review length is {}'.format(maxLength))
print('Min review length is {}'.format(minLength))

Average review length is 202.54368
Max review length is 2083
Min review length is 7


In [37]:
f10 = vectorized_data[:10]

In [41]:
[x for x in vec for vec in f10]

NameError: name 'vec' is not defined

In [None]:
[y for x in non_flat for y in x]


In [44]:
len(f10[0][0])

300

In [None]:
for review in f10:
    print(len(review))
    for tokenVec in review:
        print(' '+str(len(tokenVec)))

In [51]:
print(max([len(tokenVec) for review in f10 for tokenVec in review]))
print(min([len(tokenVec) for review in f10 for tokenVec in review]))

300
300


# Data Status 2
Data is now a list of lists of arrays
The outer list is 1 entry per movie review
The 1' list is 1 entry per token in the movie review*
The value of each entry in the 1' list is the 300 dimension vector from the word2Vec transforms

* assuming token was in the word2vec vectors we loaded up

In [52]:
split_point = int(len(vectorized_data)*.8)

x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

In [53]:
print(len(x_train))
print(len(x_test))

20000
5000


In [56]:
len(x_train[1])

444

In [57]:
maxlen = 400
batch_size = 32         # How many samples to show the net before backpropogating the error and updating the weights
embedding_dims = 300    # Length of the token vectors we will create for passing into the Convnet
filters = 250           # Number of filters we will train
kernel_size = 3         # The width of the filters, actual filters will each be a matrix of weights of size: embedding_dims x kernel_size or 50 x 3 in our case
hidden_dims = 250       # Number of neurons in the plain feed forward net at the end of the chain
epochs = 2              # Number of times we will pass the entire training dataset through the network

In [58]:
# Must manually pad/truncate

def pad_trunc(data, maxlen):
    """ For a given dataset pad with zero vectors or truncate to maxlen """
    new_data = []

    # Create a vector of 0's the length of our word vectors
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)

    for sample in data:
 
        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)
    return new_data

In [60]:
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)
# this reshape command made a 3 dimensional ndarray
# num docs x max length X w2v size
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [61]:
type(x_train)

numpy.ndarray

In [62]:
x_train.shape

(20000, 400, 300)

In [64]:
np.reshape?

In [42]:
print('Build model...')
model = Sequential()

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1,
                 input_shape=(maxlen, embedding_dims)))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file:
    json_file.write(model_structure)

model.save_weights("cnn_weights.h5")
print('Model saved.')

W0726 15:39:47.361477 4418233792 deprecation_wrapper.py:119] From /Users/ariedl/dev/pyvenv/nlpia-WGCXBMQL/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0726 15:39:47.410978 4418233792 deprecation_wrapper.py:119] From /Users/ariedl/dev/pyvenv/nlpia-WGCXBMQL/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0726 15:39:47.437633 4418233792 deprecation_wrapper.py:119] From /Users/ariedl/dev/pyvenv/nlpia-WGCXBMQL/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0726 15:39:47.501786 4418233792 deprecation_wrapper.py:119] From /Users/ariedl/dev/pyvenv/nlpia-WGCXBMQL/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default 

Build model...


W0726 15:39:47.578506 4418233792 deprecation_wrapper.py:119] From /Users/ariedl/dev/pyvenv/nlpia-WGCXBMQL/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0726 15:39:47.605026 4418233792 deprecation_wrapper.py:119] From /Users/ariedl/dev/pyvenv/nlpia-WGCXBMQL/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W0726 15:39:47.612179 4418233792 deprecation.py:323] From /Users/ariedl/dev/pyvenv/nlpia-WGCXBMQL/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2
Model saved.


In [43]:
from keras.models import model_from_json
with open("cnn_model.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)

model.load_weights('cnn_weights.h5')

In [44]:
sample_1 = "I'm hate that the dismal weather that had me down for so long, when will it break! Ugh, when does happiness return?  The sun is blinding and the puffy clouds are too thin.  I can't wait for the weekend."

In [45]:
# We pass a dummy value in the first element of the tuple just because our helper expects it from the way processed the initial data.  That value won't ever see the network, so it can be whatever.
vec_list = tokenize_and_vectorize([(1, sample_1)])

# Tokenize returns a list of the data (length 1 here)
test_vec_list = pad_trunc(vec_list, maxlen)

test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
model.predict(test_vec)

array([[0.26302156]], dtype=float32)

In [46]:
model.predict_classes(test_vec)

array([[0]], dtype=int32)