In [12]:
from sklearn.metrics import accuracy_score

In [36]:
from __future__ import print_function
from functools import reduce
import re
import tarfile

import numpy as np

def tokenize(sent):
    """Split a sentence into word and punctuation tokens.

    The sentence is split on runs of non-word characters. The capturing
    group keeps those runs in the result, so punctuation is returned as its
    own token, while pieces that are empty or whitespace-only are dropped.

    Args:
        sent: The sentence to tokenize (str).

    Returns:
        list of str: The non-empty tokens, each stripped of surrounding
        whitespace, in their original order.
    """
    # The separator group must not be optional: a pattern such as '(\W+)?'
    # can match the empty string, and re.split on Python 3.7+ also splits on
    # empty matches. That breaks the sentence into single characters and
    # yields None for the unmatched group, which has no .strip().
    # A raw string is used so the backslash-W escape reaches the regex
    # engine untouched.
    pieces = (piece.strip() for piece in re.split(r'(\W+)', sent))
    return [piece for piece in pieces if piece]

def parse_stories(lines):
    """Parse numbered story lines into (supporting facts, question, answer) samples.

    Every line starts with an integer id followed by a space; an id of 1
    starts a new story. A line containing tab characters is a question made
    of three tab-separated fields: the question text, the answer, and a
    space-separated list of the ids of its supporting lines. Any other line
    is a statement that is tokenized and added to the current story.

    Args:
        lines: Iterable of lines, either UTF-8 encoded bytes or text.
            Blank lines are ignored.

    Returns:
        list of (substory, question, answer) tuples, where substory is the
        list of story entries referenced by the question's supporting ids,
        question is the tokenized question and answer is the raw answer
        string.

    Raises:
        ValueError: if a non-blank line has no id/text separator, the id or
            a supporting id is not an integer, or a question line does not
            have exactly three tab-separated fields.
        IndexError: if a supporting id points past the lines seen so far in
            the current story.
    """
    data = []
    story = []
    for line in lines:
        # Lines read from a tar member are bytes; accept decoded text too.
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        line = line.strip()
        if not line:
            # A blank line has no id and would break the split below.
            continue
        nid, line = line.split(' ', 1)
        if int(nid) == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            # Supporting ids are 1-based positions within the current story.
            substory = [story[int(i) - 1] for i in supporting.split()]
            data.append((substory, q, a))
            # Placeholder so later ids keep matching list positions.
            story.append('')
        else:
            story.append(tokenize(line))
    return data


def get_stories(file):
    """Read a task file and return its samples with each story flattened.

    Args:
        file: File-like object whose readlines() result is handed to
            parse_stories (the notebook passes members extracted from the
            dataset tar archive).

    Returns:
        list of (story, question, answer) tuples in which story is one flat
        list of tokens built from the sample's supporting sentences, and
        question and answer are passed through from parse_stories unchanged.
    """
    samples = parse_stories(file.readlines())
    # Flatten with a nested comprehension rather than reduce(): reduce()
    # without an initial value raises TypeError when a story has no
    # sentences, and repeated list concatenation copies the growing list on
    # every step.
    return [
        ([token for sentence in story for token in sentence], q, answer)
        for story, q, answer in samples
    ]


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    """Turn tokenized samples into padded index sequences plus a label array.

    Args:
        data: Iterable of (story, query, answer) tuples; story and query are
            sequences of tokens, answer is the label for the sample.
        word_idx: Mapping from token to integer index.
        story_maxlen: Value passed as maxlen when padding the stories.
        query_maxlen: Value passed as maxlen when padding the queries.

    Returns:
        A 3-tuple (stories, queries, answers): the results of pad_sequences
        for the index-encoded stories and queries, and np.array of the raw
        answers. Answers are returned as given; they are not looked up in
        word_idx or one-hot encoded.

    Raises:
        KeyError: if a story or query token is missing from word_idx.
    """
    # NOTE(review): pad_sequences is not imported in the visible cells --
    # presumably Keras' sequence-padding utility; confirm it is imported
    # before this function is called on a fresh kernel.
    xs, xqs, ys = [], [], []
    for story, query, answer in data:
        xs.append([word_idx[w] for w in story])
        xqs.append([word_idx[w] for w in query])
        # Keep the raw answer as the label; the classifiers and the accuracy
        # computation later in the notebook consume a 1-D label array.
        ys.append(answer)
    return (pad_sequences(xs, maxlen=story_maxlen),
            pad_sequences(xqs, maxlen=query_maxlen),
            np.array(ys))

# Download the dataset
# NOTE(review): get_file is not imported in the visible cells -- presumably
# Keras' download/cache helper; confirm it is imported before this cell runs
# on a fresh kernel.
try:
    path = get_file('babi-tasks-v1-2.tar.gz', origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
except Exception:
    # Catch Exception rather than everything, so that an interrupt or
    # interpreter exit is not reported as a download failure.
    print('Error downloading dataset, please download it manually:\n'
          '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n'
          '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
    raise

# Archive member template; '{}' is filled with 'train' or 'test'.
task = 'tasks_1-20_v1-2/en/qa5_three-arg-relations_{}.txt'

with tarfile.open(path) as tar:
    train = get_stories(tar.extractfile(task.format('train')))
    test = get_stories(tar.extractfile(task.format('test')))

# Vocabulary: every story token, question token and answer seen in either
# split, sorted so that the index assignment below is deterministic.
vocab = set()
for story, q, answer in train + test:
    vocab.update(story, q, [answer])
vocab = sorted(vocab)


# Indices start at 1, so vocab_size is one larger than the vocabulary itself.
vocab_size = len(vocab) + 1
word_idx = {word: index for index, word in enumerate(vocab, start=1)}
story_maxlen = max(len(story_tokens) for story_tokens, _, _ in train + test)
query_maxlen = max(len(query_tokens) for _, query_tokens, _ in train + test)

x, xq, y = vectorize_stories(train, word_idx, story_maxlen, query_maxlen)
tx, txq, ty = vectorize_stories(test, word_idx, story_maxlen, query_maxlen)

print('vocab = {}'.format(vocab))
print('x.shape = {}'.format(x.shape))
print('xq.shape = {}'.format(xq.shape))
print('y.shape = {}'.format(y.shape))
print('story_maxlen, query_maxlen = {}, {}'.format(story_maxlen, query_maxlen))

  return _compile(pattern, flags).split(string, maxsplit)


vocab = ['.', '?', 'Bill', 'Fred', 'Jeff', 'Mary', 'What', 'Who', 'apple', 'did', 'football', 'gave', 'give', 'handed', 'milk', 'passed', 'received', 'the', 'to']
x.shape = (1000, 7)
xq.shape = (1000, 8)
y.shape = (1000,)
story_maxlen, query_maxlen = 7, 8


In [29]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the story and question index matrices joined
# column-wise; the cell's value is the accuracy on the test split.
LogReg = LogisticRegression()
y_pred = LogReg.fit(np.concatenate((x, xq), axis=1), y).predict(
    np.concatenate((tx, txq), axis=1)
)
accuracy_score(ty, y_pred, normalize=True)

0.667

In [4]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the story and question index matrices joined
# column-wise; the cell's value is the accuracy on the test split.
gnb = GaussianNB()
gnb.fit(np.concatenate((x, xq), axis=1), y)
y_pred = gnb.predict(np.concatenate((tx, txq), axis=1))
accuracy_score(ty, y_pred, normalize=True)

0.576

In [5]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes on the story and question index matrices joined
# column-wise; the cell's value is the accuracy on the test split.
bNB = BernoulliNB()
bNB.fit(np.concatenate((x, xq), axis=1), y)
y_pred = bNB.predict(np.concatenate((tx, txq), axis=1))
accuracy_score(ty, y_pred, normalize=True)

0.294

In [6]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the story and question index matrices joined
# column-wise; the cell's value is the accuracy on the test split.
mNB = MultinomialNB()
mNB.fit(np.concatenate((x, xq), axis=1), y)
y_pred = mNB.predict(np.concatenate((tx, txq), axis=1))
accuracy_score(ty, y_pred, normalize=True)

0.427

In [39]:
from sklearn import svm

# Support vector classifier with default settings on the story and question
# index matrices joined column-wise; the cell's value is the test accuracy.
clf = svm.SVC()
clf.fit(np.concatenate((x, xq), axis=1), y)
y_pred = clf.predict(np.concatenate((tx, txq), axis=1))
accuracy_score(ty, y_pred, normalize=True)

0.9