In [1]:
import pickle
import itertools
import numpy as np
from scipy import spatial
from scipy.stats import norm
import nltk.data
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import reuters
from nltk. corpus import gutenberg
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize
from gensim.models import KeyedVectors
from keras.layers import Input, Dense, Lambda, Layer, LSTM, Reshape, TimeDistributed, Dropout
from keras.callbacks import ModelCheckpoint
from keras.models import Model
from keras import backend as K
from keras import metrics

from gensim.models.word2vec import Word2Vec
import pandas as pd
import re

Using TensorFlow backend.


# Preprocessing Text

The preprocessing code is data specific.  
  
It is an example of how one can use a pre-trained word2vec to embed sentences into a vector space.

# TODO train w2v model

In [2]:
dataset = pd.read_csv("jokes.csv")

In [3]:
sentences = (dataset["Question"] + " " + dataset["Answer"]).tolist()

In [8]:
dataset = pd.read_csv("shortjokes.csv")

In [9]:
sentences += dataset["Joke"].tolist()

In [10]:
def preprocess_text(text):
    return re.sub(
        r"[^\w\s]", 
        "", 
        text
    ).lower().split()

In [11]:
preprocessed_sentences = [preprocess_text(t) for t in sentences]

In [21]:
len(sentences)

269926

In [20]:
w2v_model = Word2Vec(preprocessed_sentences, size=100, window=10, workers=4, iter=100)

In [25]:
w2v_model.wv.most_similar(positive=["hooker"], topn=10)

[('prostitute', 0.8248806595802307),
 ('crabs', 0.5551421642303467),
 ('hookers', 0.5246987342834473),
 ('prostitutes', 0.5014286637306213),
 ('chick', 0.4723253548145294),
 ('lobster', 0.46344321966171265),
 ('woman', 0.4505305588245392),
 ('prostitue', 0.44164422154426575),
 ('rooster', 0.4328981935977936),
 ('whore', 0.4174628257751465)]

In [26]:
w2v = w2v_model.wv

In [27]:
def vectorize_sentence(sentence):
    concat_vector = []
    for word in sentence:
        try:
            concat_vector.append(w2v[word])
        except:
            pass
    return [a for vector in concat_vector for a in vector]

Preprocessing text from a variety of different sources.

In [30]:
vectorized_sentences = [vectorize_sentence(sentence) for sentence in preprocessed_sentences if len(sentence) < 15]

In [31]:
len(vectorized_sentences)

110440

In [32]:
original_dim = 1500

In [34]:
from keras.preprocessing import sequence
vectorized_padded = sequence.pad_sequences(vectorized_sentences, maxlen=original_dim, padding="post", truncating="post")

It's important to shuffle the text vectors before splitting them into test and train samples.   
  
This is done to avoid clumping text with similar context and style in the dataset because it can confuse the neural network during training.

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
vectorized_train, vectorized_test = train_test_split(vectorized_padded, test_size=0.3)

In [37]:
batch_size = 200
def cut_dataset(dataset, batch_size):
    rest = len(dataset) % batch_size
    return dataset[:-rest]

vectorized_train = cut_dataset(vectorized_train, batch_size)
vectorized_test = cut_dataset(vectorized_test, batch_size)

Get w2v embeddings for text with fixed length

# Variational Autoencoder

In [38]:
from keras import callbacks

In [39]:
latent_dim = 1000
intermediate_dim = 1200
lstm_intermediate_dim = 100
epochs = 200
epsilon_std = 1.0

In [40]:
x = Input(shape=(original_dim,))
h = Dense(intermediate_dim, activation='sigmoid')(x)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

In [41]:
def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
                              stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

In [42]:
# we instantiate these layers separately so as to reuse them later
decoder_h = Dense(intermediate_dim, activation='sigmoid')
decoder_mean = Dense(original_dim, activation='sigmoid')
h_decoded = decoder_h(z)
x_decoded_mean = decoder_mean(h_decoded)

In [43]:
def vae_loss(y_true, y_pred):
    xent_loss = K.sum(K.binary_crossentropy(y_pred, y_true), axis=-1)
    kl_loss = 0.5 * K.sum(K.square(z_mean) + K.exp(z_log_var) - 1. - z_log_var, axis=-1)
    return xent_loss + kl_loss

In [44]:
vae = Model(x, x_decoded_mean)
vae.compile(optimizer='adam', loss=vae_loss)

In [None]:
#checkpoint
cp = [callbacks.ModelCheckpoint(filepath="/tmp/model.h5", verbose=1)]

#train
vae.fit(vectorized_train, vectorized_train,
        shuffle=True,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(vectorized_test, vectorized_test),
        callbacks=cp)

Train on 77200 samples, validate on 33000 samples
Epoch 1/200

Epoch 00001: saving model to /tmp/model.h5
Epoch 2/200

Epoch 00002: saving model to /tmp/model.h5
Epoch 3/200

Epoch 00003: saving model to /tmp/model.h5
Epoch 4/200

Epoch 00004: saving model to /tmp/model.h5
Epoch 5/200

Epoch 00005: saving model to /tmp/model.h5
Epoch 6/200

Epoch 00006: saving model to /tmp/model.h5
Epoch 7/200

Epoch 00007: saving model to /tmp/model.h5
Epoch 8/200

Epoch 00008: saving model to /tmp/model.h5
Epoch 9/200

Epoch 00009: saving model to /tmp/model.h5
Epoch 10/200

Epoch 00010: saving model to /tmp/model.h5
Epoch 11/200

Epoch 00011: saving model to /tmp/model.h5
Epoch 12/200

Epoch 00012: saving model to /tmp/model.h5
Epoch 13/200

Epoch 00013: saving model to /tmp/model.h5
Epoch 14/200

Epoch 00014: saving model to /tmp/model.h5
Epoch 15/200

Epoch 00015: saving model to /tmp/model.h5
Epoch 16/200

Epoch 00016: saving model to /tmp/model.h5
Epoch 17/200

Epoch 00017: saving model to /tmp

In [74]:
# build a model to project inputs on the latent space
encoder = Model(x, z_mean)

# build a generator that can sample from the learned distribution
decoder_input = Input(shape=(latent_dim,))
_h_decoded = decoder_h(decoder_input)
_x_decoded_mean = decoder_mean(_h_decoded)
generator = Model(decoder_input, _x_decoded_mean)

# Generating Text From Latent Space

In [None]:
# some matrix magic
def sent_parse(sentence, mat_shape):
    data_concat = []
    word_vecs = vectorize_sentences(sentence)
    for x in word_vecs:
        data_concat.append(list(itertools.chain.from_iterable(x)))
    zero_matr = np.zeros(mat_shape)
    zero_matr[0] = np.array(data_concat)
    return zero_matr

In [49]:
def print_sentence_with_w2v(sent_vect):
    word_sent = ''
    tocut = sent_vect
    for i in range (int(len(sent_vect)/100)):
        word_sent += w2v.most_similar(positive=[tocut[:100]], topn=1)[0][0]
        word_sent += ' '
        tocut = tocut[100:]
    print(word_sent)

In [None]:
# input: encoded sentence vector
# output: encoded sentence vector in dataset with highest cosine similarity
def find_similar_encoding(sent_vect):
    all_cosine = []
    for sent in sent_encoded:
        result = 1 - spatial.distance.cosine(sent_vect, sent)
        all_cosine.append(result)
    data_array = np.array(all_cosine)
    maximum = data_array.argsort()[-3:][::-1][1]
    new_vec = sent_encoded[maximum]
    return new_vec

In [None]:
# input: two points, integer n
# output: n equidistant points on the line between the input points (inclusive)
def shortest_homology(point_one, point_two, num):
    dist_vec = point_two - point_one
    sample = np.linspace(0, 1, num, endpoint = True)
    hom_sample = []
    for s in sample:
        hom_sample.append(point_one + s * dist_vec)
    return hom_sample

In [None]:
# input: two written sentences, VAE batch-size, dimension of VAE input
# output: the function embeds the sentences in latent-space, and then prints their generated text representations
# along with the text representations of several points in between them
def sent_2_sent(sent1,sent2, batch, dim):
    a = sent_parse([sent1], (batch,dim))
    b = sent_parse([sent2], (batch,dim))
    encode_a = encoder.predict(a, batch_size = batch)
    encode_b = encoder.predict(b, batch_size = batch)
    test_hom = hom_shortest(encode_a[0], encode_b[0], 5)
    
    for point in test_hom:
        p = generator.predict(np.array([point]))[0]
        print_sentence(p)

Printing sentences from the training set and comparing them with the original will test whether the custom print function works properly.

In [50]:
for i in range(0, 10):
    print(preprocessed_x_sentences[i])
    print_sentence_with_w2v(x_vectorized_padded[i])

['he', 'nearly', 'drown', 'in', 'his', 'own', 'tea', 'pee']
he nearly drown in his own tea pee the the the the the the the the the the the the 
['mycheexarphlexin']
the the the the the the the the the the the the the the the the the the the the 
['matt']
matt the the the the the the the the the the the the the the the the the the the 
['jeanluc', 'pickacard']
the the the the the the the the the the the the the the the the the the the the 
['a', 'bullet', 'doesnt', 'miss', 'harambe']
a bullet doesnt miss harambe the the the the the the the the the the the the the the the 
['he', 'was', 'having', 'a', 'midlife', 'crisis']
he was having a midlife crisis the the the the the the the the the the the the the the 
['one', 'shucks', 'between', 'fits']
one shucks between fits the the the the the the the the the the the the the the the the 
['kevin', 'durant', 'or', 'bernie', 'sanders']
kevin or bernie sanders the the the the the the the the the the the the the the the the 
['because', 'the', 'sh

The encoder takes the training set of sentence vectors (concatenanted word vectors) and embeds them into a lower dimensional vector space.

In [75]:
sent_encoded = encoder.predict(x_vectorized_padded[0:10])

The decoder takes the list of latent dimensional encodings from above and turns them back into vectors of their original dimension.

In [76]:
sent_decoded = generator.predict(sent_encoded)

In [80]:
print_sentence_with_w2v(sent_decoded[3])

brides equal purchase manage purchase purchase purchase purchase purchase um purchase purchase purchase purchase purchase purchase purchase purchase purchase purchase 


The encoder trained above embeds sentences (concatenated word vetors) into a lower dimensional space. The code below takes two of these lower dimensional sentence representations and finds five points between them. It then uses the trained decoder to project these five points into the higher, original, dimensional space. Finally, it reveals the text represented by the five generated sentence vectors by taking each word vector concatenated inside and finding the text associated with it in the word2vec used during preprocessing.

In [None]:
test_hom = shortest_homology(sent_encoded[3], sent_encoded[10], 5)
for point in test_hom:
    p = generator.predict(np.array([point]))[0]
    print_sentence_with_w2v(p)

The code below does the same thing, with one important difference. After sampling equidistant points in the latent space between two sentence embeddings, it finds the embeddings from our encoded dataset those points are most similar to. It then prints the text associated with those vectors.
  
This allows us to explore how the Variational Autoencoder clusters our dataset of sentences in latent space. It lets us investigate whether sentences with similar concepts or grammatical styles are represented in similar areas of the lower dimensional space.

In [None]:
test_hom = shortest_homology(sent_encoded[2], sent_encoded[1500], 20)
for point in test_hom:
    p = generator.predict(np.array([find_similar_encoding(point)]))[0]
    print_sentence_with_w2v(p)