In [137]:
import pandas as pd
from keras.preprocessing import sequence
import re
from tqdm import tqdm_notebook as tqdm

In [138]:
dataset = pd.read_csv("jokes.csv")

In [139]:
sequence_length = 30

In [163]:
x_train = dataset["Answer"]
y_train = dataset["Question"]

In [164]:
def preprocess_text(text):
    return re.sub(
        r"[.,:?!\()[]-]", 
        "", 
        text
    ).lower().split()

In [165]:
def preprocess_texts(texts, vocabulary={}):
    preprocessed_texts = []
    for text in tqdm(texts):
        preprocessed_text = []
        for word in text:
            if word not in vocabulary.keys():
                vocabulary[word] = len(vocabulary) + 1
            preprocessed_text.append(vocabulary[word])
        preprocessed_texts.append(preprocessed_text)
    return preprocessed_texts, vocabulary

In [166]:
x_cleaned_texts = x_train.apply(preprocess_text)
x_preprocessed_texts, vocabulary = preprocess_texts(x_cleaned_texts)

HBox(children=(IntProgress(value=0, max=38269), HTML(value='')))




In [167]:
y_cleaned_texts = y_train.apply(preprocess_text)
y_preprocessed_texts, vocabulary = preprocess_texts(y_cleaned_texts, vocabulary)

HBox(children=(IntProgress(value=0, max=38269), HTML(value='')))




In [168]:
x_sequences = sequence.pad_sequences(x_preprocessed_texts, maxlen=sequence_length, padding="post", truncating="post")
y_sequences = sequence.pad_sequences(y_preprocessed_texts, maxlen=sequence_length, padding="post", truncating="post")

Encoder:
- Embeddings
- Dropout
Multiple filters
- Convolution1D
- MaxPooling1D
- Flatten
Concatenate
- Dropout
- Dense
- Deconvolution1D
- Reversed embeddings?
- Dense

Reversed embeddings:
- Input = vector
- Output = one-hot encoded word

In [169]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate

In [299]:
input_shape = (sequence_length,)
embedding_dim = 50
dropout_prob = (0.5, 0.8, 0.5)
num_filters = 10
filter_size = 5
hidden_dims = 50

In [310]:
model_input = Input(shape=input_shape)
z = Embedding(len(vocabulary), embedding_dim, input_length=sequence_length, name="embedding")(model_input)
z = Dropout(dropout_prob[0])(z)
conv = Convolution1D(filters=num_filters,
                     kernel_size=filter_size,
                     padding="valid",
                     activation="relu",
                     strides=1)(z)
z = MaxPooling1D(pool_size=2)(z)
z = Flatten()(z)
z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)

In [311]:
def convolution(size, kernel_size, stride=1):
    return size - kernel_size + stride

def pooling(size, pool):
    return int(size/pool)

def deconvolution(size, kernel_size, stride=1):
    return size + kernel_size - stride

In [312]:
a = convolution(sequence_length, 5)
a = pooling(a, 2)
a = deconvolution(a, 3)
a = deconvolution(a, 5)
a = deconvolution(a, 6)
a = deconvolution(a, 7)
a

30

In [313]:
from keras.engine.topology import Layer
import keras.backend as K
from keras.layers import Lambda, Conv2DTranspose, Reshape

import keras.backend as K
from keras.layers import Conv2DTranspose


def Conv1DTranspose(input_tensor, filters, kernel_size, strides=2, padding='same'):
    x = Lambda(lambda x: K.expand_dims(x, axis=2))(input_tensor)
    print(x.shape)
    x = Conv2DTranspose(filters=filters, kernel_size=(kernel_size, 1), strides=(strides, 1), padding=padding)(x)
    x = Lambda(lambda x: K.squeeze(x, axis=2))(x)
    return x

In [314]:
deconv_parameters = [(3, 16), (3, 8), (3, 4), (3, 1)]

In [315]:
z = Reshape((hidden_dims, 1))(z)

In [316]:
for size, num_filters in deconv_parameters:
    print("Ok@")
    z = Conv1DTranspose(z, filters=num_filters, kernel_size=size)
z = Flatten()(z)
z = Dropout(dropout_prob[2])(z)
z = Dense(sequence_length, activation="relu")(z)
autoencoder = Model(model_input, z)

Ok@
(?, 50, 1, 1)
Ok@
(?, ?, 1, 16)
Ok@
(?, ?, 1, 8)
Ok@
(?, ?, 1, 4)


In [317]:
batch_size = 64
num_epochs = 10

In [318]:
for l in autoencoder.layers:
    print(l.name)

input_16
embedding
dropout_30
max_pooling1d_14
flatten_16
dropout_31
dense_16
reshape_15
lambda_53
conv2d_transpose_35
lambda_54
lambda_55
conv2d_transpose_36
lambda_56
lambda_57
conv2d_transpose_37
lambda_58
lambda_59
conv2d_transpose_38
lambda_60
flatten_17
dropout_32
dense_17


In [319]:
autoencoder.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
autoencoder.fit(x_sequences, y_sequences, batch_size=batch_size, epochs=num_epochs, verbose=2)

Epoch 1/10
 - 34s - loss: -3.2702e+04 - acc: 0.4758
Epoch 2/10


KeyboardInterrupt: 