## Scope

This notebook walks through how to create a training set from a set of questions and answers.

## Optional: further research
* **OOB text cleaning with spaCy:** https://towardsdatascience.com/machine-learning-for-text-classification-using-spacy-in-python-b276b4051a49
* 1) BLEU score for evaluation: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
* 2) Stanford SQuAD (question/answer scoring datasets): https://github.com/obryanlouis/qa
* 3) https://github.com/facebookresearch/ParlAI
* 4) Seq2Seq: https://docs.chainer.org/en/stable/examples/seq2seq.html

## Initialize

In [1]:
# Data
import numpy as np

# Utility
import time
import math
import sys
import os
import glob
import pickle

# NLP
import tensorflow as tf
import re
import spacy


# Custom libraries
# from seq2seq_model import Seq2SeqModel
# from corpora_tools import *

## Import Data

In [2]:
def retrieve_corpora(path_to_pickle):
    """Load question/answer utterance pairs from a pickled corpus.

    The pickle is expected to hold an iterable of records in which
    ``record[0]`` and ``record[1]`` are mappings with an ``'utterance'`` key;
    the first utterance is used as the question, the second as the answer.

    Args:
        path_to_pickle: Path of the pickle file to read.

    Returns:
        A ``(questions, answers)`` pair of equally long tuples. Both tuples
        are empty when the corpus contains no records.
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the file handle is closed again.
    with open(path_to_pickle, 'rb') as handle:
        data = pickle.load(handle)

    qa = [(pair[0]['utterance'], pair[1]['utterance']) for pair in data]
    if not qa:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise.
        return (), ()

    # Split pairs into question and answer
    question, answer = zip(*qa)
    return question, answer

def tokenize(txt):
    """Lower-case ``txt`` (coerced to ``str``) and split it on single spaces."""
    lowered = str(txt).lower()
    return lowered.split(' ')

def clean_sentences(txt):
    """Tokenize ``txt`` and keep at most its first 20 tokens."""
    # Longer comments are truncated to their leading tokens, not dropped.
    return tokenize(txt)[:20]

In [3]:
# Option 1: All responses are equal to the original query except for the comment by the author
# retrieve_corpora returns the corpus split into two parallel sequences:
# question[i] is paired with answer[i].
question,answer = retrieve_corpora('../data/all_responses_equal.p')

In [4]:
# Spot-check one raw question/answer pair before any cleaning is applied.
idx = 1
print("Q:", question[idx])
print("A:", answer[idx])

Q: Husband deteriorating before my eyes, doctors at a loss, no one will help; Reddit docs, I need you.
A: I really do wish the best for your husband and hope you eventually get through this! keep your and your husband's head up and tell him he has people cheering for him sure, including me! :)


## Create Trainable dataset

In [5]:
%%time
# Tokenize and truncate every question and every answer
clean_question = list(map(clean_sentences, question))
clean_answer = list(map(clean_sentences, answer))

# Report the corpus size, then check that both sides are still aligned
print(len(clean_question))
assert len(clean_answer) == len(clean_question)

108825
CPU times: user 996 ms, sys: 76.1 ms, total: 1.07 s
Wall time: 1.14 s


In [96]:
# Spot-check one cleaned pair: both sides are now lower-cased token lists
# truncated to at most 20 tokens.
idx = 2
print("Q:", clean_question[idx])
print("A:", clean_answer[idx])

Q: ['husband', 'deteriorating', 'before', 'my', 'eyes,', 'doctors', 'at', 'a', 'loss,', 'no', 'one', 'will', 'help;', 'reddit', 'docs,', 'i', 'need', 'you.']
A: ['a', 'few', 'weeks', 'ago', 'i', 'was', 'suffering', 'from', 'the', 'same', 'symptoms.', 'severe', 'groin', 'pain', 'that', 'radiated', 'through', 'my', 'testicles', 'and']


In [14]:
def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):
    """Keep the sentence pairs whose two sides both have an allowed length.

    The pair at position ``i`` survives only when the lengths of
    ``sentences_l1[i]`` and ``sentences_l2[i]`` both lie in the inclusive
    range ``[min_len, max_len]``.

    Returns:
        Two parallel lists holding the surviving sentences of each side.
    """
    kept_l1, kept_l2 = [], []
    for pos, first in enumerate(sentences_l1):
        # Reject on the first side before even looking at the second one.
        if not (min_len <= len(first) <= max_len):
            continue
        second = sentences_l2[pos]
        if not (min_len <= len(second) <= max_len):
            continue
        kept_l1.append(first)
        kept_l2.append(second)
    return kept_l1, kept_l2

import data_utils

def create_indexed_dictionary(sentences, dict_size=10000, storage_path=None):
    """Build a word -> integer id vocabulary from tokenized sentences.

    The four special symbols of ``data_utils`` (``_PAD``, ``_GO``, ``_EOS``,
    ``_UNK``) are mapped to their ``*_ID`` constants. The ``dict_size`` most
    frequent words then receive consecutive ids starting at
    ``len(data_utils.OP_DICT_IDS)``, most frequent word first.

    Args:
        sentences: Iterable of token sequences.
        dict_size: Maximum number of corpus words to keep.
        storage_path: If truthy, the dictionary is also pickled to this path.

    Returns:
        The ``dict`` mapping each special symbol / word to its id.
    """
    # Imported locally so the function also works when this cell is run
    # before the later cell that imports Counter at notebook level.
    from collections import Counter

    count_words = Counter(word for sen in sentences for word in sen)

    # Ids below this offset are reserved for the special symbols.
    opt_dict_size = len(data_utils.OP_DICT_IDS)
    dict_words = {
        data_utils._PAD: data_utils.PAD_ID,
        data_utils._GO: data_utils.GO_ID,
        data_utils._EOS: data_utils.EOS_ID,
        data_utils._UNK: data_utils.UNK_ID,
    }

    ranked_words = count_words.most_common(dict_size)
    for idx, (word, _count) in enumerate(ranked_words, start=opt_dict_size):
        dict_words[word] = idx

    if storage_path:
        # Context manager so the file is flushed and closed deterministically.
        with open(storage_path, "wb") as handle:
            pickle.dump(dict_words, handle)
    return dict_words

def sentences_to_indexes(sentences, indexed_dictionary):
    """Translate tokenized sentences into lists of dictionary ids.

    Words missing from ``indexed_dictionary`` are replaced by
    ``data_utils.UNK_ID``; the total number of replacements is printed.

    Returns:
        A list with one list of ids per input sentence, in input order.
    """
    missing_total = 0
    encoded = []
    for tokens in sentences:
        ids = []
        for token in tokens:
            try:
                token_id = indexed_dictionary[token]
            except KeyError:
                # Out-of-vocabulary word: fall back to the unknown-word id.
                token_id = data_utils.UNK_ID
                missing_total += 1
            ids.append(token_id)
        encoded.append(ids)

    print('[sentences_to_indexes] Did not find {} words'.format(missing_total))
    return encoded

def prepare_sentences(sentences_l1, sentences_l2, len_l1, len_l2):
    """Pad indexed sentence pairs and add the GO/EOS markers.

    Each first-side sentence gets ``len_l1 - len(sentence)`` ``PAD_ID``
    entries prepended. Each second-side sentence is wrapped as
    ``[GO_ID] + sentence + [EOS_ID]`` and then gets
    ``len_l2 - len(sentence)`` ``PAD_ID`` entries appended.

    Returns:
        A list of ``[padded_l1, padded_l2]`` pairs, one per input pair.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(sentences_l1) == len(sentences_l2)
    data_set = []
    for pos, source in enumerate(sentences_l1):
        target = sentences_l2[pos]
        left_pad = [data_utils.PAD_ID] * (len_l1 - len(source))
        right_pad = [data_utils.PAD_ID] * (len_l2 - len(target))
        padded_source = left_pad + source
        padded_target = [data_utils.GO_ID] + target + [data_utils.EOS_ID] + right_pad
        data_set.append([padded_source, padded_target])
    return data_set

def extract_max_length(corpora):
    """Return the length of the longest sentence in ``corpora``.

    Args:
        corpora: Iterable of sized sentences (e.g. lists of tokens or ids).

    Returns:
        The largest ``len(sentence)``, or ``0`` when ``corpora`` is empty
        (a bare ``max`` over an empty sequence would raise ``ValueError``).
    """
    return max((len(sentence) for sentence in corpora), default=0)

In [98]:
# Drop every pair in which either side falls outside the default length bounds
filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(
    clean_question,
    clean_answer,
)
print("# Filtered Corpora length (i.e. number of sentences)")
print(len(filt_clean_sen_l1))
# Both sides must have lost exactly the same pairs
assert len(filt_clean_sen_l2) == len(filt_clean_sen_l1)

# Filtered Corpora length (i.e. number of sentences)
108825


In [101]:
# Build one vocabulary per side (l1 = questions, l2 = answers). Because a
# storage_path is given, each call also pickles its dictionary to that path.
dict_l1 = create_indexed_dictionary(filt_clean_sen_l1, dict_size=15000, storage_path="/tmp/l1_dict.p")
dict_l2 = create_indexed_dictionary(filt_clean_sen_l2, dict_size=15000, storage_path="/tmp/l2_dict.p")
# Replace every token by its id; each call prints how many tokens were not
# found in the dictionary (those are encoded as data_utils.UNK_ID).
idx_sentences_l1 = sentences_to_indexes(filt_clean_sen_l1, dict_l1)
idx_sentences_l2 = sentences_to_indexes(filt_clean_sen_l2, dict_l2)
print("# Same sentences as before, with their dictionary ID")
# Show the first pair as (token, id) tuples.
print("Q:", list(zip(filt_clean_sen_l1[0], idx_sentences_l1[0])))
print("A:", list(zip(filt_clean_sen_l2[0], idx_sentences_l2[0])))

[sentences_to_indexes] Did not find 28670 words
[sentences_to_indexes] Did not find 98910 words
# Same sentences as before, with their dictionary ID
Q: [('husband', 456), ('deteriorating', 1371), ('before', 191), ('my', 5), ('eyes,', 923), ('doctors', 69), ('at', 49), ('a', 6), ('loss,', 617), ('no', 47), ('one', 94), ('will', 81), ('help;', 1333), ('reddit', 575), ('docs,', 829), ('i', 4), ('need', 36), ('you.', 1322)]
A: [('hey,', 1011), ("how's", 2487), ('your', 14), ('husband', 1356), ('doing', 256), ('now?', 1425), ('hope', 274), ('everything', 416), ('is', 11), ('okay.', 957)]


In [106]:
# Longest indexed question (l1) and longest indexed answer (l2), in tokens.
max_length_l1 = extract_max_length(idx_sentences_l1)
max_length_l2 = extract_max_length(idx_sentences_l2)
print("# Max sentence sizes:")
# l1 holds the questions and l2 the answers, so label them "Q"/"A" like the
# other cells do; the former "DE"/"EN" labels did not match this corpus.
print("Q:", max_length_l1)
print("A:", max_length_l2)

# Max sentence sizes:
DE: 20
EN: 20


As the final step, let's pad the sentences and add the start (GO) and end (EOS) markers:

In [107]:
# Pad every indexed pair using the maximum lengths computed above; the answer
# side additionally receives the GO and EOS markers.
data_set = prepare_sentences(idx_sentences_l1, idx_sentences_l2, max_length_l1, max_length_l2)
print("# Prepared minibatch with paddings and extra stuff")
# Each data_set entry is a [padded_question, padded_answer] pair.
print("Q:", data_set[0][0])
print("A:", data_set[0][1])
print("# The sentence pass from X to Y tokens")
# Compare the unpadded id count with the padded length for the first pair.
print("Q:", len(idx_sentences_l1[0]), "->", len(data_set[0][0]))
print("A:", len(idx_sentences_l2[0]), "->", len(data_set[0][1]))

# Prepared minibatch with paddings and extra stuff
Q: [0, 0, 456, 1371, 191, 5, 923, 69, 49, 6, 617, 47, 94, 81, 1333, 575, 829, 4, 36, 1322]
A: [1, 1011, 2487, 14, 1356, 256, 1425, 274, 416, 11, 957, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
# The sentence pass from X to Y tokens
Q: 18 -> 20
A: 10 -> 22


After we're done with the corpora, it's time to work on the model. This project again requires a sequence-to-sequence model, so we can use an RNN. Better still, we can reuse part of the code from the previous project: we just need to change how the dataset is built and the parameters of the model. We can then copy the training script built in the previous chapter and modify the `build_dataset` function to use the Cornell dataset.

Mind that the dataset used in this chapter is bigger than the one used in the previous chapter, so you may need to limit the corpora to a few tens of thousands of lines. On a 4-year-old laptop with 8 GB of RAM, we had to select only the first 30 thousand lines; otherwise, the program ran out of memory and kept swapping. As a side effect of having fewer examples, the dictionaries are smaller too, with fewer than 10 thousand words each.

In [39]:
from collections import Counter
import data_utils


def tokenize(txt):
    """Return the lower-cased tokens of ``str(txt)``, split on single spaces."""
    text = str(txt)
    return text.lower().split(' ')

def clean_sentence(txt):
    """Tokenize ``txt`` and return at most its first 20 tokens."""
    tokens = tokenize(txt)
    # Overlong comments are cut down to their leading tokens.
    del tokens[20:]
    return tokens

def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):
    """Keep the sentence pairs whose two sides both have an allowed length.

    The pair at position ``i`` survives only when the lengths of
    ``sentences_l1[i]`` and ``sentences_l2[i]`` both lie in the inclusive
    range ``[min_len, max_len]``.

    Args:
        sentences_l1: Sequence of first-side sentences.
        sentences_l2: Sequence of second-side sentences, aligned by position.
        min_len: Smallest accepted sentence length (inclusive).
        max_len: Largest accepted sentence length (inclusive).

    Returns:
        Two parallel lists holding the surviving sentences of each side.
    """
    # No project module is needed here: the function only inspects lengths,
    # so the unused function-local ``import data_utils`` has been dropped.
    filtered_sentences_l1 = []
    filtered_sentences_l2 = []
    for i in range(len(sentences_l1)):
        if min_len <= len(sentences_l1[i]) <= max_len and \
                 min_len <= len(sentences_l2[i]) <= max_len:
            filtered_sentences_l1.append(sentences_l1[i])
            filtered_sentences_l2.append(sentences_l2[i])
    return filtered_sentences_l1, filtered_sentences_l2

def create_indexed_dictionary(sentences, dict_size=10000, storage_path=None):
    """Build a word -> integer id vocabulary from tokenized sentences.

    The four special symbols of ``data_utils`` (``_PAD``, ``_GO``, ``_EOS``,
    ``_UNK``) are mapped to their ``*_ID`` constants. The ``dict_size`` most
    frequent words then receive consecutive ids starting at
    ``len(data_utils.OP_DICT_IDS)``, most frequent word first.

    Args:
        sentences: Iterable of token sequences.
        dict_size: Maximum number of corpus words to keep.
        storage_path: If truthy, the dictionary is also pickled to this path.

    Returns:
        The ``dict`` mapping each special symbol / word to its id.
    """
    count_words = Counter(word for sen in sentences for word in sen)

    # Ids below this offset are reserved for the special symbols.
    opt_dict_size = len(data_utils.OP_DICT_IDS)
    dict_words = {
        data_utils._PAD: data_utils.PAD_ID,
        data_utils._GO: data_utils.GO_ID,
        data_utils._EOS: data_utils.EOS_ID,
        data_utils._UNK: data_utils.UNK_ID,
    }

    ranked_words = count_words.most_common(dict_size)
    for idx, (word, _count) in enumerate(ranked_words, start=opt_dict_size):
        dict_words[word] = idx

    if storage_path:
        # Context manager so the file is flushed and closed deterministically.
        with open(storage_path, "wb") as handle:
            pickle.dump(dict_words, handle)
    return dict_words

def sentences_to_indexes(sentences, indexed_dictionary):
    """Encode each tokenized sentence as a list of dictionary ids.

    A word that is not a key of ``indexed_dictionary`` is encoded as
    ``data_utils.UNK_ID``. The number of such words is printed at the end.

    Returns:
        One list of ids per sentence, in the order the sentences were given.
    """
    unknown_words = 0
    all_ids = []
    for sentence in sentences:
        sentence_ids = []
        for word in sentence:
            try:
                word_id = indexed_dictionary[word]
            except KeyError:
                # Not in the vocabulary: use the unknown-word id instead.
                word_id = data_utils.UNK_ID
                unknown_words += 1
            sentence_ids.append(word_id)
        all_ids.append(sentence_ids)

    print('[sentences_to_indexes] Did not find {} words'.format(unknown_words))
    return all_ids

def extract_max_length(corpora):
    """Return the length of the longest sentence in ``corpora``.

    Args:
        corpora: Iterable of sized sentences (e.g. lists of tokens or ids).

    Returns:
        The largest ``len(sentence)``, or ``0`` when ``corpora`` is empty
        (a bare ``max`` over an empty sequence would raise ``ValueError``).
    """
    return max((len(sentence) for sentence in corpora), default=0)
        
def prepare_sentences(sentences_l1, sentences_l2, len_l1, len_l2):
    """Pad indexed sentence pairs and add the GO/EOS markers.

    Each first-side sentence gets ``len_l1 - len(sentence)`` ``PAD_ID``
    entries prepended. Each second-side sentence is wrapped as
    ``[GO_ID] + sentence + [EOS_ID]`` and then gets
    ``len_l2 - len(sentence)`` ``PAD_ID`` entries appended.

    Returns:
        A list of ``[padded_l1, padded_l2]`` pairs, one per input pair.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(sentences_l1) == len(sentences_l2)
    data_set = []
    for pos, source in enumerate(sentences_l1):
        target = sentences_l2[pos]
        left_pad = [data_utils.PAD_ID] * (len_l1 - len(source))
        right_pad = [data_utils.PAD_ID] * (len_l2 - len(target))
        padded_source = left_pad + source
        padded_target = [data_utils.GO_ID] + target + [data_utils.EOS_ID] + right_pad
        data_set.append([padded_source, padded_target])
    return data_set

def build_dataset(use_stored_dictionary=False, path_to_data='',
                  path_l1_dict="/tmp/l1_dict.p", path_l2_dict="/tmp/l2_dict.p",
                  max_lines=30000):
    """Run the whole preparation pipeline and return the training data.

    Steps: load the question/answer corpus, keep the first ``max_lines``
    pairs, tokenize/truncate them, drop pairs with a side longer than 10
    tokens, build (or load) one dictionary per side, convert tokens to ids
    and finally pad the sentences and add the GO/EOS markers.

    Args:
        use_stored_dictionary: When true, load the two dictionaries from
            ``path_l1_dict`` / ``path_l2_dict`` instead of rebuilding them.
        path_to_data: Path of the pickled corpus passed to
            ``retrieve_corpora``.
        path_l1_dict: Pickle path of the question-side dictionary; written
            when the dictionary is rebuilt, read when it is reused. The
            default matches the path used earlier in this notebook.
        path_l2_dict: Same as ``path_l1_dict`` for the answer side.
        max_lines: Number of leading pairs to keep (``None`` keeps all).
            Limits memory use on small machines.

    Returns:
        A 4-tuple ``((filtered_l1, filtered_l2), data_set,
        (max_length_l1, max_length_l2), (dict_l1_length, dict_l2_length))``.
    """
    sen_l1, sen_l2 = retrieve_corpora(path_to_data)
    # Truncate first so only the kept lines are tokenized; cleaning works
    # sentence by sentence, so the result equals clean-then-truncate.
    clean_sen_l1 = [clean_sentence(s) for s in sen_l1[:max_lines]]
    clean_sen_l2 = [clean_sentence(s) for s in sen_l2[:max_lines]]
    filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(clean_sen_l1, clean_sen_l2, max_len=10)

    if not use_stored_dictionary:
        dict_l1 = create_indexed_dictionary(filt_clean_sen_l1, dict_size=10000, storage_path=path_l1_dict)
        dict_l2 = create_indexed_dictionary(filt_clean_sen_l2, dict_size=10000, storage_path=path_l2_dict)
    else:
        # SECURITY: unpickling can execute arbitrary code; only load
        # dictionaries this notebook wrote itself.
        with open(path_l1_dict, "rb") as handle:
            dict_l1 = pickle.load(handle)
        with open(path_l2_dict, "rb") as handle:
            dict_l2 = pickle.load(handle)
    dict_l1_length = len(dict_l1)
    dict_l2_length = len(dict_l2)

    idx_sentences_l1 = sentences_to_indexes(filt_clean_sen_l1, dict_l1)
    idx_sentences_l2 = sentences_to_indexes(filt_clean_sen_l2, dict_l2)

    max_length_l1 = extract_max_length(idx_sentences_l1)
    max_length_l2 = extract_max_length(idx_sentences_l2)
    data_set = prepare_sentences(idx_sentences_l1, idx_sentences_l2, max_length_l1, max_length_l2)
    return (filt_clean_sen_l1, filt_clean_sen_l2), \
           data_set, \
           (max_length_l1, max_length_l2), \
           (dict_l1_length, dict_l2_length)

After a few iterations, you can stop the program and you'll see something similar to this output:

In [1]:
# use_stored_dictionary=True makes build_dataset load previously pickled
# dictionaries instead of rebuilding them from the corpus.
# NOTE(review): the NameError recorded below suggests this cell was last run
# before the cell that defines build_dataset; run that cell first.
build_dataset(use_stored_dictionary=True,path_to_data='../data/all_responses_equal.p')

NameError: name 'build_dataset' is not defined