In [24]:
from keras.models import Sequential
from keras.layers import Dense, Activation
import os
import json
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from pathlib import Path
import keras

import tensorflow as tf
print(tf.__version__)



1.0.1


In [2]:
tokenize = lambda x: simple_preprocess(x,min_len=1)
EOS = "$$"
UNK = "###"
DIM_SIZE=75
class SentenceGenerator(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        yield [UNK]
        for line in open(os.path.join(self.dirname)):
            who, what = line.split(":",1)
            t=tokenize(what.strip())
            t.append(EOS)
            yield t

class DialogGenerator(object):
    def __init__(self, dirname, who):
        self.dirname = dirname
        self.who = who

    def __iter__(self):
        last_what = None
        for line in open(os.path.join(self.dirname)):
            who, what = line.split(":",1)
            t=tokenize(what.strip())
            t.append(EOS)
            if who == self.who:
                yield (last_what,t)
            else:
                last_what=t

            
def create_embeddings(data_dir, output_path, **params):
    sentences = SentenceGenerator(data_dir)

    model = Word2Vec(sentences, **params)
    model.save(output_path)
    return model

def w2v(m, word):
    if word in m:
        return m[word]
    else:
        return m["UNK"]
    

def _batch(tokenized_dialog_lines, batch_size=10):
    batch = []

    for line in tokenized_dialog_lines:
        batch.append(line)
        if len(batch) == batch_size:
            yield batch
            batch = []
    


In [3]:
if Path("vocab.dat").is_file():
    m=Word2Vec.load("vocab.dat")
else:
    m=create_embeddings("dialog-parsed.txt","vocab.dat",size=DIM_SIZE, window=5, min_count=1, workers=4, iter=10, sg=1)

In [26]:
import seq2seq
from seq2seq.models import AttentionSeq2Seq
from seq2seq.models import SimpleSeq2Seq
from seq2seq.models import Seq2Seq

MAX_LEN=100

def sentence(words, max_len=20):
    pad_len= MAX_LEN - len(words)
    ret=[w2v(m.wv,x) for x in words[:MAX_LEN]]
    if pad_len > 0:
        padding = np.zeros(DIM_SIZE * pad_len)
        padding.shape = (pad_len, DIM_SIZE)
        ret = np.concatenate((ret,padding))
    return ret

keras.backend()._LEARNING_PHASE = True

model = Seq2Seq(input_dim=DIM_SIZE, input_length=MAX_LEN, hidden_dim=15, output_length=MAX_LEN, output_dim=DIM_SIZE, dropout=0.4, depth=3)
model.compile(loss='mse', optimizer='rmsprop', metrics=["accuracy"])

TypeError: 'module' object is not callable

In [19]:
for b in _batch(DialogGenerator("dialog-parsed.txt","SHELDON"),146*64):
    x=np.stack([sentence(i[0], max_len=MAX_LEN) for i in b])
    y=np.stack([sentence(i[1], max_len=MAX_LEN) for i in b])
    model.fit(x,y,batch_size=128, nb_epoch=3, verbose=1)

Epoch 1/3
Epoch 2/3
 640/9344 [=>............................] - ETA: 31s - loss: 0.0144 - acc: 0.0564

KeyboardInterrupt: 

In [7]:
response=model.predict(np.stack([sentence("hi there".split(" "))]))[0]

In [8]:
[m.similar_by_vector(r)[0][0] for r in response]


['overreacting',
 'overreacting',
 'overreacting',
 'overreacting',
 'cents',
 'cents',
 'cents',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'lean',
 'glamorous',
 'glamorous',
 'glamorous',
 'lunatics',
 'lunatics',
 'lunatics',
 'lunatics',
 'lunatics',
 'lunatics',
 'birds',
 'birds',
 'birds',
 'birds',
 'effects',
 'effects',
 'path',
 'which',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',
 'sheesh',

In [9]:
len([i for i in DialogGenerator("dialog-parsed.txt","SHELDON")])

9455

In [12]:
147*64

9408