### Train a bilstm to generate jokes in forward and reverse based on a controlled bit

In [1]:
import pickle
import importlib
from library import data_preprocess as dp
importlib.reload(dp)
import random
from time import time

import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Input
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM, Bidirectional
from keras.layers import Embedding, TimeDistributed, Flatten, Merge, Concatenate
from keras import regularizers
from keras.metrics import sparse_categorical_accuracy, sparse_categorical_crossentropy
from keras.models import load_model
from keras.optimizers import Adam
from keras.models import Model

from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
import tensorflow as tf
from keras import backend as K
from keras.utils import multi_gpu_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Variables

In [2]:
# http://nlp.stanford.edu/data/glove.6B.zip
DATA_PATH = './datasets/combined.pickle'
VOCAB_PATH = './datasets/combined_vocabulary.pickle'
MODELS_PATH = './models/'

GLOVE_PATH = './data/glove/glove.6B.200d.txt'

MODEL_PREFIX = 'combined_controlled_stacked_lstm_glove'

MAX_SEQUENCE_LENGTH = 13
VALIDATION_SPLIT = 0.2

GLOVE_EMBEDDING_DIM = 200
EMBEDDING_DIM1 = 512
EMBEDDING_DIM2 = 512
HIDDEN_DIM1 = 1024
HIDDEN_DIM2 = 512
DEEPER_DIM = 512
DROPOUT_FACTOR = 0.2
REGULARIZATION = 0.00001
LEARNING_RATE = 0.003

DATA_PERCENT = 0.1

RUN_INDEX = 1

In [3]:
with open(DATA_PATH, 'rb') as pickleFile:
    sentences = pickle.load(pickleFile)

with open(VOCAB_PATH, 'rb') as pickleFile:
    vocab = pickle.load(pickleFile)
    
random.shuffle(sentences)

print("Number of sentences = ", len(sentences))
print(sentences[:2])
print("Vocab size = ", len(vocab))
print(vocab[:10])

Number of sentences =  270543
[[0, 'sos hillary and trump crash in a plane , who survives ? the united states eos'], [0, 'sos keep your friends close , and your enemies closer . marry your enemy . grow old together . watch your enemy die . eos']]
Vocab size =  12614
['liberate', 'savings', 'clip', 'rhino', 'cross', 'encouragement', 'viewed', 'womens', 'lap', 'challenges']


In [4]:
NUM_JOKES = len([0 for item in sentences if item[0] == 0])
NUM_QUOTES = len([1 for item in sentences if item[0] == 1])
NUM_TWEETS = len([2 for item in sentences if item[0] == 2])
print("Number of Jokes: ", NUM_JOKES)
print("Number of Quotes: ", NUM_QUOTES)
print("Number of Tweets: ", NUM_TWEETS)
TOTAL_SENTENCES = NUM_JOKES + NUM_QUOTES + NUM_TWEETS
JOKES_WEIGHT = TOTAL_SENTENCES/NUM_JOKES
QUOTES_WEIGHT = TOTAL_SENTENCES/NUM_QUOTES
TWEETS_WEIGHT = TOTAL_SENTENCES/NUM_TWEETS
print("JOKES_WEIGHT: ", JOKES_WEIGHT)
print("QUOTES_WEIGHT: ", QUOTES_WEIGHT)
print("TWEETS_WEIGHT: ", TWEETS_WEIGHT)

CLASS_WEIGHTS = [JOKES_WEIGHT, QUOTES_WEIGHT, TWEETS_WEIGHT]
CLASSES = [item[0] for item in sentences]
sentences = [item[1] for item in sentences]

Number of Jokes:  96910
Number of Quotes:  43383
Number of Tweets:  130250
JOKES_WEIGHT:  2.7916933237024044
QUOTES_WEIGHT:  6.236152409930157
TWEETS_WEIGHT:  2.07710556621881


In [5]:
# tokenize data
num_words = len(vocab)

with open("../combined_controlled_stacked_lstm_glove_tokenizer_1.pickle","rb") as fip:
    tokenizer = pickle.load(fip)
# tokenizer = Tokenizer(num_words=None, filters='', lower=True, split=' ', 
#                       char_level=False, oov_token=None)
tokenizer.fit_on_texts(sentences)
assert num_words == len(tokenizer.word_index)

encoded_sentences = tokenizer.texts_to_sequences(sentences)
print(encoded_sentences[:5])

VOCAB_SIZE = len(tokenizer.word_index) + 1
print(VOCAB_SIZE)

[[1, 1549, 9, 641, 2013, 17, 7, 824, 8, 83, 5318, 13, 4, 1466, 1272, 2], [1, 248, 54, 184, 661, 8, 9, 54, 2597, 1795, 3, 1667, 54, 2159, 3, 866, 244, 514, 3, 286, 54, 2159, 442, 3, 2], [1, 152, 36, 59, 2073, 41, 59, 19, 426, 16, 407, 2], [1, 90, 5, 75, 6, 52, 6, 81, 90, 11, 4, 157, 640, 17, 565, 3, 179, 332, 18, 74, 55, 24, 8, 170, 255, 5, 291, 17, 7336, 8, 44, 4, 67, 7007, 2], [1, 5, 32, 352, 7, 129, 4081, 1194, 56, 51, 4573, 38, 1117, 514, 3, 18, 34, 9623, 16, 2]]
12615


In [6]:
# saving
with open(MODELS_PATH + MODEL_PREFIX + '_tokenizer_' + str(RUN_INDEX) + '.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Preparing Training Data

In [14]:
TAG_SIZE = 3
X_data = []
y_data = []
tag_data = []
weight_data = []

for idx, sentence in enumerate(encoded_sentences):
    l = len(sentence)
    sliding_window_length = min(l-3, MAX_SEQUENCE_LENGTH)
    step_size = 1
    for i in range(0, l - sliding_window_length, step_size):
        X_data.append(sentence[i:i+sliding_window_length])
        y_data.append(sentence[i+1:i+sliding_window_length+1])
        tag_data.append(CLASSES[idx])
        weight_data.append(CLASS_WEIGHTS[CLASSES[idx]])
        
print("Total training data size = ", len(X_data))
MAX_SEQ_LEN = max([len(seq) for seq in X_data])
print("Max seq len = ", MAX_SEQ_LEN)

X_data = pad_sequences(X_data, maxlen=MAX_SEQ_LEN, padding='pre')
y_data = pad_sequences(y_data, maxlen=MAX_SEQ_LEN, padding='pre').reshape(-1, MAX_SEQ_LEN, 1)
tag_data = to_categorical([[[tag]]*MAX_SEQ_LEN for tag in tag_data], TAG_SIZE)
weight_data = np.array(weight_data).reshape(-1,)
#y_data = np.array(y_data).reshape(-1,1)

# shuffle
perm = np.random.permutation(X_data.shape[0])
X_data = X_data[perm]
y_data = y_data[perm]
tag_data = tag_data[perm]
weight_data = weight_data[perm]
print(X_data.shape)
print(X_data[:5])
print(y_data.shape)
print(y_data[:5])
print(tag_data.shape)
print(tag_data[:5])
print(weight_data.shape)
print(weight_data[:5])

Total training data size =  2244907
Max seq len =  13
(2244907, 13)
[[  14  213  285   24    4  324    8   28   18  310  297   49  153]
 [   0    0    0    0   36   91  207  128  682   16   33   61   16]
 [   9    7 1085  865  493   13    7 1085  865  493   60 1421    7]
 [  21    6  123  452   15   10  214 7845  388    5  154  132  663]
 [2712    8 9635   20    5  105   73  163   92  131  927   39 1884]]
(2244907, 13, 1)
[[[ 213]
  [ 285]
  [  24]
  [   4]
  [ 324]
  [   8]
  [  28]
  [  18]
  [ 310]
  [ 297]
  [  49]
  [ 153]
  [ 862]]

 [[   0]
  [   0]
  [   0]
  [   0]
  [  91]
  [ 207]
  [ 128]
  [ 682]
  [  16]
  [  33]
  [  61]
  [  16]
  [   2]]

 [[   7]
  [1085]
  [ 865]
  [ 493]
  [  13]
  [   7]
  [1085]
  [ 865]
  [ 493]
  [  60]
  [1421]
  [   7]
  [ 240]]

 [[   6]
  [ 123]
  [ 452]
  [  15]
  [  10]
  [ 214]
  [7845]
  [ 388]
  [   5]
  [ 154]
  [ 132]
  [ 663]
  [  24]]

 [[   8]
  [9635]
  [  20]
  [   5]
  [ 105]
  [  73]
  [ 163]
  [  92]
  [ 131]
  [ 927]
  [  39]

In [13]:
print('Indexing glove word vectors')
#Glove Vectors
glove_embeddings_index = {}
f = open(GLOVE_PATH)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    glove_embeddings_index[word] = coefs
f.close()
print('Total %s word vectors.' % len(glove_embeddings_index))

Indexing glove word vectors
Total 400000 word vectors.


In [14]:
print('Preparing glove embedding matrix')
glove_embedding_matrix = np.zeros((VOCAB_SIZE, GLOVE_EMBEDDING_DIM))
for word,i in tokenizer.word_index.items():
    embedding_vector = glove_embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        glove_embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(glove_embedding_matrix, axis=1) == 0))
print(glove_embedding_matrix.shape)

Preparing glove embedding matrix
Null word embeddings: 298
(12615, 200)


In [15]:
from keras.layers import merge
from keras.layers.core import Lambda
from keras.models import Model

import tensorflow as tf

def make_parallel(model, gpu_count):
    def get_slice(data, idx, parts):
        shape = tf.shape(data)
        size = tf.concat([ shape[:1] // parts, shape[1:] ],axis=0)
        stride = tf.concat([ shape[:1] // parts, shape[1:]*0 ],axis=0)
        start = stride * idx
        return tf.slice(data, start, size)

    outputs_all = []
    for i in range(len(model.outputs)):
        outputs_all.append([])

    #Place a copy of the model on each GPU, each getting a slice of the batch
    for i in range(gpu_count):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('tower_%d' % i) as scope:

                inputs = []
                #Slice each input into a piece for processing on this GPU
                for x in model.inputs:
                    input_shape = tuple(x.get_shape().as_list())[1:]
                    slice_n = Lambda(get_slice, output_shape=input_shape, arguments={'idx':i,'parts':gpu_count})(x)
                    inputs.append(slice_n)                

                outputs = model(inputs)
                
                if not isinstance(outputs, list):
                    outputs = [outputs]
                
                #Save all the outputs for merging back together later
                for l in range(len(outputs)):
                    outputs_all[l].append(outputs[l])

    # merge outputs on CPU
    with tf.device('/cpu:0'):
        merged = []
        for outputs in outputs_all:
            merged.append(merge(outputs, mode='concat', concat_axis=0))
            
        return Model(input=model.inputs, output=merged)

In [40]:
# define model
def StackedLSTM(vocab_size, glove_embedding_dim, glove_embedding_matrix, embedding_dim1, embedding_dim2,
           hidden_dim1, hidden_dim2, deeper_dim, max_seq_len, tag_size,
           dropout_factor=0.5, regularization=0.00001, learning_rate=0.001):
    
    inputs = Input(shape=(None,))
    tag_inputs = Input(shape=(None,tag_size))
    
    glove_embedding = Embedding(vocab_size, glove_embedding_dim, #input_length=max_seq_len,
                                  weights=[glove_embedding_matrix],
                                  mask_zero=True,trainable=False)(inputs)
    
    word_embedding = Embedding(vocab_size, embedding_dim1, #input_length=max_seq_len, 
                               mask_zero=True, embeddings_regularizer=regularizers.l2(regularization))(inputs)
    
    #tag_embedding = Embedding(tag_size, tag_size, embeddings_regularizer=regularizers.l2(regularization))(tag_inputs)
    
    concat_embeds = Concatenate(axis=-1)([glove_embedding, word_embedding, tag_inputs])
    
    final_embed = Dense(units=embedding_dim2, activation='tanh',
                        kernel_regularizer=regularizers.l2(regularization))(concat_embeds)
    
    lstm1 = LSTM(hidden_dim1, activation='tanh', 
                   kernel_regularizer=regularizers.l2(regularization), 
                   recurrent_regularizer=regularizers.l2(regularization), #unroll=True, 
                   return_sequences = True, dropout=dropout_factor, recurrent_dropout=dropout_factor)(final_embed)
    
    lstm2 = LSTM(hidden_dim2, activation='tanh', 
                   kernel_regularizer=regularizers.l2(regularization), 
                   recurrent_regularizer=regularizers.l2(regularization), #unroll=True, 
                   return_sequences = True, dropout=dropout_factor, recurrent_dropout=dropout_factor)(lstm1)
    
    timedist_dropout = TimeDistributed(Dropout(dropout_factor))(lstm2)
    
    deep_dense = Dense(units=deeper_dim, activation='tanh', 
                       kernel_regularizer=regularizers.l2(regularization))(timedist_dropout)
    
    dropout_layer1 = Dropout(dropout_factor)(deep_dense)
    
    outputs = Dense(units=vocab_size, activation='softmax', 
                    kernel_regularizer=regularizers.l2(regularization))(dropout_layer1)
    
    model = Model(inputs=[inputs,tag_inputs], outputs=outputs)
    #model = make_parallel(Model(inputs=[inputs,tag_inputs], outputs=outputs), 2)
    #model = multi_gpu_model(model, gpus=2)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=learning_rate),
                  metrics=[sparse_categorical_crossentropy, sparse_categorical_accuracy]#, sample_weight_mode='temporal'
                 )
    return model

In [41]:
K.clear_session()
config = tf.ConfigProto(allow_soft_placement=True)
sess = tf.Session(config=config)
K.set_session(sess)

model = StackedLSTM(vocab_size=VOCAB_SIZE, glove_embedding_dim=GLOVE_EMBEDDING_DIM,
                    glove_embedding_matrix=glove_embedding_matrix, 
                    embedding_dim1=EMBEDDING_DIM1, embedding_dim2=EMBEDDING_DIM2,
                    hidden_dim1=HIDDEN_DIM1, hidden_dim2=HIDDEN_DIM2,
                    deeper_dim=DEEPER_DIM, max_seq_len=MAX_SEQ_LEN, dropout_factor=DROPOUT_FACTOR, 
                    regularization=REGULARIZATION, learning_rate=LEARNING_RATE, tag_size=TAG_SIZE)
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    2523000     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 512)    6458880     input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 3)      0                                            
__________________________________________________________________________________________________
concatenat

In [42]:
class TB(TensorBoard):
    def __init__(self, log_every=1, **kwargs):
        super().__init__(**kwargs)
        self.log_every = log_every
        self.counter = 0
    
    def on_batch_end(self, batch, logs=None):
        self.counter+=1
        if self.counter%self.log_every==0:
            for name, value in logs.items():
                if name in ['batch', 'size']:
                    continue
                summary = tf.Summary()
                summary_value = summary.value.add()
                summary_value.simple_value = value.item()
                summary_value.tag = name
                self.writer.add_summary(summary, self.counter)
            self.writer.flush()
        
        super().on_batch_end(batch, logs)

In [None]:
start_time = time()
tensorboard = TB(log_dir="./logs/" + MODEL_PREFIX + "/{}".format(time()), 
                          histogram_freq=0, write_graph=True, write_images=False, log_every=10)

callbacks=[tensorboard, 
           EarlyStopping(patience=5, monitor='val_loss'),
           ModelCheckpoint(filepath=MODELS_PATH + 'checkpoints/'+ MODEL_PREFIX + '_gen' + str(RUN_INDEX) + '.{epoch:02d}-{val_loss:.2f}.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1), 
           ModelCheckpoint(filepath=MODELS_PATH + MODEL_PREFIX + '_gen'+str(RUN_INDEX)+'.hdf5', 
                           monitor='val_loss', verbose=1, mode='auto', period=1, save_best_only=True)]

model.fit([X_data, tag_data], y_data, epochs=10, batch_size=1024, shuffle=True, 
          verbose=1, validation_split=0.2, callbacks=callbacks, sample_weight=weight_data)

print("Total elapsed time: ", time()-start_time)

Train on 1795925 samples, validate on 448982 samples
Epoch 1/10

Epoch 00001: saving model to ./models/checkpoints/combined_controlled_stacked_lstm_glove_gen1.01-15.35.hdf5

Epoch 00001: val_loss improved from inf to 15.35020, saving model to ./models/combined_controlled_stacked_lstm_glove_gen1.hdf5
Epoch 2/10

Epoch 00002: saving model to ./models/checkpoints/combined_controlled_stacked_lstm_glove_gen1.02-14.68.hdf5

Epoch 00002: val_loss improved from 15.35020 to 14.68375, saving model to ./models/combined_controlled_stacked_lstm_glove_gen1.hdf5
Epoch 3/10
  10240/1795925 [..............................] - ETA: 42:08 - loss: 15.1677 - sparse_categorical_crossentropy: 4.1939 - sparse_categorical_accuracy: 0.2334

In [7]:
#model.save_weights(MODELS_PATH + 'checkpoints/'+ MODEL_PREFIX + '_gen' + str(RUN_INDEX)+'_epoch1.hdf5')
model = load_model('../combined_controlled_stacked_lstm_glove_gen1.hdf5')

In [12]:
# generate a sequence from a language model
def generate_categorical(model, tokenizer, seed_text, maxlen, probabilistic=False, exploration_factor=1.0, tag=0):
    
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    seq = tokenizer.texts_to_sequences([seed_text])[0]
    print(seq)
    
    while True:
        encoded_seq = seq
        if len(seq) > MAX_SEQUENCE_LENGTH:
            encoded_seq = encoded_seq[-1*MAX_SEQUENCE_LENGTH:]
            
        #padded_seq = pad_sequences([encoded_seq], maxlen=MAX_SEQ_LEN, padding='pre')
        padded_seq = np.array([seq])
        tags = to_categorical(np.full((1, padded_seq[0].shape[0]), tag), TAG_SIZE)
        y_prob = model.predict([padded_seq,tags])[0][-1].reshape(1,-1)#[3:].reshape(-1,1)
        
        if np.random.rand() <= exploration_factor:
            probabilistic = True
        else:
            probabilistic = False
            
        if probabilistic:
            y_class = np.argmax(np.random.multinomial(1,y_prob[0]/(np.sum(y_prob[0])+1e-5),1))
        else:
            y_class = y_prob.argmax(axis=-1)[0]
        
        if y_class == 0:
            break
        out_word = reverse_word_map[y_class]
        seq.append(y_class)
        if out_word == 'eos' or len(seq) > maxlen or out_word == 'sos':
            break
    
    words = [reverse_word_map[idx] for idx in seq]
    
    return ' '.join(words)

In [17]:
joke = generate_categorical(model, tokenizer, "sos a doctor told the boy ", maxlen=40, tag=2, exploration_factor=0.3)
print(joke)

[1, 7, 397, 251, 4, 403]
sos a doctor told the boy i was going to get appointment today . i miss him so much . eos


In [18]:
random = np.random.randint(0,len(sentences),100)
random.shape

(100,)

In [65]:
def unigram_list(sentence):
    return sentence.split(' ')
def bigrams_list(sentence):
    words = sentence.split(' ')
    bigrams = []
    for i in range(0, len(words)-1):
        bigrams.append(words[i]+' '+words[i+1])
    return bigrams

print(bigrams_list("sos hello , i'm a dinosaur . eos"))
def trigrams_list(sentence):
    words = sentence.split(' ')
    trigrams = []
    for i in range(0, len(words)-2):
        trigrams.append(words[i]+' '+words[i+1]+' '+words[i+2])
    return trigrams

print(trigrams_list("sos hello , i'm a dinosaur . eos"))
def fourgrams_list(sentence):
    words = sentence.split(' ')
    fourgrams = []
    for i in range(0, len(words)-3):
        fourgrams.append(words[i]+' '+words[i+1]+' '+words[i+2]+words[i+3])
    return fourgrams

print(fourgrams_list("sos hello , i'm a dinosaur . eos"))

def phraseOverlap(sent1,sent2,flist):
    score = 0
    for i in range(0,len(flist),1):
        l1 = flist[i](sent1)
        l2 = set(flist[i](sent2))
        for l in l1:
            if l in l2:
                #print(l)
                score += (i+1)**2
        #print(score)
                
    return np.tanh(float(score)/(len(sent1)+len(sent2)))

['sos hello', 'hello ,', ", i'm", "i'm a", 'a dinosaur', 'dinosaur .', '. eos']
['sos hello ,', "hello , i'm", ", i'm a", "i'm a dinosaur", 'a dinosaur .', 'dinosaur . eos']
["sos hello ,i'm", "hello , i'ma", ", i'm adinosaur", "i'm a dinosaur.", 'a dinosaur .eos']


In [62]:
flist = [unigram_list,bigrams_list,trigrams_list,fourgrams_list]

In [63]:
phraseOverlap("this is good stuff","is this good stuff",flist)

4
8
8
8


0.21863508368712128

sos what do you do when you lose your voice ? i don't know what to do threedots then hit me before it ever hurt eos ==> voice ? i don't know what to do threedots then hit me before it ever hurt eos
sos what do you do when you lose your === sos what do you do when you lose your voice ? i don't know what to do threedots then hit me before it ever hurt eos
0.7979789325668545 sos i don't know what to do threedots i love him eos
sos i'm what you might call ' internet pretty ' threedots eos ==> ' threedots eos
sos i'm what you might call ' internet pretty === sos i'm what you might call ' internet pretty ' threedots eos
0.2744686218100928 sos i want to listen to something ' sexy ' threedots eos
sos how many feminists does it take to change a lightbulb ? i am in new home . i fills it all alone eos ==> a lightbulb ? i am in new home . i fills it all alone eos
sos how many feminists does it take to change === sos how many feminists does it take to change a lightbulb ? i am in new home . i fills

0.4236843241877823 sos ugh . i don't want to go to work ! eos
sos at home , watching nothing . i had so much fun with it all day again my poor little guy looks pressure . eos ==> i had so much fun with it all day again my poor little guy looks pressure . eos
sos at home , watching nothing . === sos at home , watching nothing . i had so much fun with it all day again my poor little guy looks pressure . eos
0.7039056039366212 sos i had so much fun with my friends today ! eos
sos god has no phone , but i talk to him . he has no facebook , but he can see me not looking forward . eos ==> can see me not looking forward . eos
sos god has no phone , but i talk to him . he has no facebook , but he === sos god has no phone , but i talk to him . he has no facebook , but he can see me not looking forward . eos
0.3759622303493606 sos i'm up i have to go to graduation watch my friends leave me not looking forward to it eos
sos what is a mattress ' but i can't draw it ! cat , dishes , and i don't wan

In [125]:
np.mean(np.array(score_list))

0.49642574464227296

In [66]:
joke = "sos i love the music that i love because i love it . i can't tell you how to live without it . eos"
seed = "sos i love"
#phraseOverlap(joke,sent)
n = len(joke)
score = 0
print(seed,"===",joke)
for sent in sentences:
#         if sent.find(seed) != -1:
#             continue
    newscore = phraseOverlap(joke,sent,flist)
    if newscore >= score:
        score = newscore
        match_joke = sent
    
    if newscore >= 0.9:
        print(sent,phraseOverlap(joke,sent,flist))
print(score,match_joke)
score_list.append(score)

sos i love === sos i love the music that i love because i love it . i can't tell you how to live without it . eos
0.668606699416691 sos i am always training because i love it . eos


In [120]:
def k_jaccard(sent1,sent2,k):
    
    w_1 = sent1.split()
    w_2 = sent2.split()
    
    w_1_list = []
    for i in range(len(w_1)-k+1):
        w_1_list.append(' '.join(w_1[i:i+k]))
        
    w_2_list = []
    for i in range(len(w_2)-k+1):
        w_2_list.append(' '.join(w_2[i:i+k]))
        
    #print(w_1_list)
    #print(w_2_list)
    w_2_set = set(w_2_list)
    w_1_set = set(w_1_list)
    #print(len(w_2_set.union(w_1_set)),len(w_2_set.intersection(w_1_set)))
    return len(w_1_set.intersection(w_2_set))*1.0/(len(w_2_set.union(w_1_set)))
        
    

In [117]:
k_jaccard('sos what do you call a vegetarian ? a rip - off . eos','sos what do you call a cheap circumcision ? a rip - off . eos', 2)

['sos what', 'what do', 'do you', 'you call', 'call a', 'a vegetarian', 'vegetarian ?', '? a', 'a rip', 'rip -', '- off', 'off .', '. eos']
['sos what', 'what do', 'do you', 'you call', 'call a', 'a cheap', 'cheap circumcision', 'circumcision ?', '? a', 'a rip', 'rip -', '- off', 'off .', '. eos']
16 11


0.6875

In [74]:
joke_list = []
for i in random[:50]:
    sent = sentences[i]
    seed_list = sent.split()
    seed = seed_list[:int(len(seed_list)/3)]
    joke = generate_categorical(model, tokenizer,' '.join(seed), maxlen=40, tag=2, exploration_factor=0.1)
    joke_list.append((' '.join(seed),joke))

[1, 7, 1234, 8, 63, 2969, 8, 9, 7, 386]
[1, 31, 165, 166, 7981, 10, 427, 3197, 110]
[1, 51, 172, 275, 38]
[1, 5, 941, 7]
[1, 250, 585, 39, 264]
[1, 50, 38, 218, 66, 25, 61, 36, 1677, 13]
[1, 36, 4, 343, 897, 8, 5, 352, 7, 1276, 204]
[1, 5, 47, 461, 25, 20, 31, 92]
[1, 68, 291, 17, 7, 67, 169, 647, 11, 33, 7, 1658, 145]
[1, 27, 14, 35, 10, 2575]
[1, 75, 6, 205]
[1, 64, 6, 313, 6, 408, 229, 74, 4]
[1, 14, 146, 101, 1667]
[1, 46, 347, 51, 67]
[1, 49, 2039, 376, 10597]
[1, 49, 5, 171, 563, 7]
[1, 1471, 50, 28, 1593, 31, 534, 56, 137, 133, 55]
[1, 169, 4]
[1, 146, 14, 111, 348, 132, 544, 49]
[1, 31, 333, 3, 83, 361]
[1, 631, 158, 9, 218]
[1, 7, 1027, 15, 4910, 11, 360, 7, 2092]
[1, 10, 434, 3693]
[1, 10, 253, 1240, 22, 63, 1248, 1028, 71, 8, 1214]
[1, 5, 589, 41, 10, 1472, 19, 7]
[1, 49, 14, 1087, 37, 406, 17, 7]
[1, 463, 67, 3, 3, 3]
[1, 9680, 3709, 48, 15, 4, 418, 4660]
[1, 18, 85, 120, 285]
[1, 7, 61, 872]
[1, 755, 6, 1094, 385, 25, 31, 17]
[1, 49, 153, 33, 738, 6, 4105]
[1, 66, 74, 340,

In [68]:
import re
parser_sent = []

In [82]:
joke_list = []
for i in random[:50]:
    sent = sentences[i]
    seed_list = sent.split()
    seed = seed_list[:int(len(seed_list)/2)]
    joke = generate_categorical(model, tokenizer,' '.join(seed), maxlen=40, tag=0, exploration_factor=0.5)
    quote = generate_categorical(model, tokenizer,' '.join(seed), maxlen=40, tag=1, exploration_factor=0.5)
    tweet = generate_categorical(model, tokenizer,' '.join(seed), maxlen=40, tag=2, exploration_factor=0.5)
    joke_list.append((' '.join(seed),joke,quote,tweet))

[1, 7, 1234, 8, 63, 2969, 8, 9, 7, 386, 8531, 307, 106, 4, 230]
[1, 7, 1234, 8, 63, 2969, 8, 9, 7, 386, 8531, 307, 106, 4, 230]
[1, 7, 1234, 8, 63, 2969, 8, 9, 7, 386, 8531, 307, 106, 4, 230]
[1, 31, 165, 166, 7981, 10, 427, 3197, 110, 1512, 73, 404, 73, 4]
[1, 31, 165, 166, 7981, 10, 427, 3197, 110, 1512, 73, 404, 73, 4]
[1, 31, 165, 166, 7981, 10, 427, 3197, 110, 1512, 73, 404, 73, 4]
[1, 51, 172, 275, 38, 81, 17, 2231]
[1, 51, 172, 275, 38, 81, 17, 2231]
[1, 51, 172, 275, 38, 81, 17, 2231]
[1, 5, 941, 7, 502, 74, 4830]
[1, 5, 941, 7, 502, 74, 4830]
[1, 5, 941, 7, 502, 74, 4830]
[1, 250, 585, 39, 264, 3, 208, 6]
[1, 250, 585, 39, 264, 3, 208, 6]
[1, 250, 585, 39, 264, 3, 208, 6]
[1, 50, 38, 218, 66, 25, 61, 36, 1677, 13, 4, 4733, 4719, 15, 4]
[1, 50, 38, 218, 66, 25, 61, 36, 1677, 13, 4, 4733, 4719, 15, 4]
[1, 50, 38, 218, 66, 25, 61, 36, 1677, 13, 4, 4733, 4719, 15, 4]
[1, 36, 4, 343, 897, 8, 5, 352, 7, 1276, 204, 586, 4, 147, 349, 3892]
[1, 36, 4, 343, 897, 8, 5, 352, 7, 1276, 204,

In [83]:
joke_list

[('sos a priest , an astronaut , and a hair dresser walk into the bar',
  'sos a priest , an astronaut , and a hair dresser walk into the bar . the bartender says " hey i don\'t care . " eos',
  'sos a priest , an astronaut , and a hair dresser walk into the bar and walk with a blank sheet . eos',
  "sos a priest , an astronaut , and a hair dresser walk into the bar . i don't have to go to work . eos"),
 ("sos i'm sad after discovering my awesome headphones only reach as far as the",
  "sos i'm sad after discovering my awesome headphones only reach as far as the determination was in its hands . if i didn't show up and got a good job , i wasn't too good . eos",
  "sos i'm sad after discovering my awesome headphones only reach as far as the camera and the camera on my films . eos",
  "sos i'm sad after discovering my awesome headphones only reach as far as the same as i are that no good will i have to wake up out of her soul but im not realizing threedots eos"),
 ('sos how many days are 

In [85]:
with open('evaluate_jokes.txt','w') as fop:
    for joke in joke_list:
        fop.write(joke[0]+"\t"+joke[1]+"\t"+joke[2]+"\n")

In [77]:
for joke in joke_list:
    print(joke[1])
    temp = joke[1]
    temp = temp.replace("sos ","").replace(" eos","")`
    temp = re.split("[.]+",temp)
    for t in temp:
        sent = (((t.strip().capitalize()+".").replace("?.","?")).replace("\"","")).replace(" i "," I ")
        if len(sent) >= 3:
            print("sent: ",sent)
            parser_sent.append(sent)

sos a priest , an astronaut , and a hair . i don't know what to do . i don't want to go to work eos
sent:  A priest , an astronaut , and a hair.
sent:  I don't know what to do.
sent:  I don't want to go to work.
sos i'm sad after discovering my awesome headphones only it's not working . i don't want to go to school tomorrow eos
sent:  I'm sad after discovering my awesome headphones only it's not working.
sent:  I don't want to go to school tomorrow.
sos how many days are you ? i don't know what to do with my life . eos
sent:  How many days are you ? I don't know what to do with my life.
sos i needed a new phone . i don't want to go threedots i hate it . i hate it . eos
sent:  I needed a new phone.
sent:  I don't want to go threedots I hate it.
sent:  I hate it.
sos watching x - men origins : wolverine , and the movie awards . i love the movie awards ! eos
sent:  Watching x - men origins : wolverine , and the movie awards.
sent:  I love the movie awards !.
sos why are black people so go

In [78]:
print(len(parser_sent))
for p in parser_sent:
    print(p)

345
A priest , an astronaut , and a hair walk into a bar.
The bartender says ,  what is this , some kind of joke ? .
I'm sad after discovering my awesome headphones only to lead my life to the middle of the night.
How many days are you ?   no , i'm not going to get a job.
I needed a new job threedots I didn't know what to do with my hands.
Watching x - men origins :  I don't know , but I don't know why.
Why are black people so good at basketball ? because they always get a kick out of the crowd.
At the coffee shop , I saw a german guy in the restaurant last night.
He was a good guy.
I don't drink so that i'm more likely to be a man than I am.
We live in a time where government is not a leadership thing , but it is a shame to be a crime.
Do you like my suit ?   I dunno , but I don't know how to get it.
Want to hear a joke about lentil ? nevermind , I don't know , I just think it's funny.
Going to try to stay away from the internet , but I don't want to be a celebrity.
You should never m

In [None]:
#process joke list from parser

In [145]:
#score = []#sos i love my job as a road worker . i don't know what he laced them with but i've been tripping all day . eos
score_list = []
for seed,joke in joke_list[:30]:
    n = len(joke)
    joke_rep = ' '.join(joke.split()[len(seed.split()):])
    print(joke+" ==> "+joke_rep)
    print(seed,"===",joke)
    score = 0
    for sent in sentences:
        newscore = phraseOverlap(joke_rep,sent,flist)
        if newscore >= score:
            score = newscore
            match_joke = sent
    if newscore >= 0.9:
        print(sent,phraseOverlap(joke_rep,sent,flist))
    print(score,match_joke)
    
    #print(score)
    score_list.append(score)
np.mean(np.array(score_list))

sos what do you do when you lose your voice ? i don't know if you can get it . eos ==> voice ? i don't know if you can get it . eos
sos what do you do when you lose your === sos what do you do when you lose your voice ? i don't know if you can get it . eos
0.7401539091275828 sos i don't know if you can change things , but it's a drop in the ocean . eos
sos i'm what you might call ' internet pretty ' threedots i don't know what to do threedots eos ==> ' threedots i don't know what to do threedots eos
sos i'm what you might call ' internet pretty === sos i'm what you might call ' internet pretty ' threedots i don't know what to do threedots eos
0.914532364949822 sos damn i miss him and i don't know what to do threedots eos
sos how many feminists does it take to change a light bulb ? i don't know what to do . eos ==> a light bulb ? i don't know what to do . eos
sos how many feminists does it take to change === sos how many feminists does it take to change a light bulb ? i don't know what 

0.8080256920461416

In [146]:
#score = []#sos i love my job as a road worker . i don't know what he laced them with but i've been tripping all day . eos
score_list = []
for seed,joke in joke_list[:20]:
    n = len(joke)
    joke_rep = ' '.join(joke.split()[len(seed.split()):])
    print(joke+" ==> "+joke_rep)
    print(seed,"===",joke)
    score = 0
    for sent in sentences:
        newscore = k_jaccard(joke_rep,sent,4)
        if newscore >= score:
            score = newscore
            match_joke = sent
    if newscore >= 0.9:
        print(sent,k_jaccard(joke_rep,sent,4))
    print(score,match_joke)
    
    #print(score)
    score_list.append(score)
np.mean(np.array(score_list))

sos what do you do when you lose your voice ? i don't know if you can get it . eos ==> voice ? i don't know if you can get it . eos
sos what do you do when you lose your === sos what do you do when you lose your voice ? i don't know if you can get it . eos
0.13636363636363635 sos i don't know if you can change things , but it's a drop in the ocean . eos
sos i'm what you might call ' internet pretty ' threedots i don't know what to do threedots eos ==> ' threedots i don't know what to do threedots eos
sos i'm what you might call ' internet pretty === sos i'm what you might call ' internet pretty ' threedots i don't know what to do threedots eos
0.38461538461538464 sos damn i miss him and i don't know what to do threedots eos
sos how many feminists does it take to change a light bulb ? i don't know what to do . eos ==> a light bulb ? i don't know what to do . eos
sos how many feminists does it take to change === sos how many feminists does it take to change a light bulb ? i don't know wh

0.2617494964186141