In [None]:
import codecs
import collections
import os
import pickle
import re

import numpy as np

from my_utils import *
# All inputs live under <current working directory>/data.
working_dir = os.getcwd()
data_dir = os.path.join(working_dir, "data")

# Derive the remaining locations from data_dir in one pass:
# the GloVe folder, then the train and test JSON files.
glove_dir, train_data, test_data = (
    os.path.join(data_dir, leaf)
    for leaf in ("glove", "train-v1.1.json", "test-v1.1.json")
)

In [None]:
# Read the training JSON file at `train_data` and keep only the value stored
# under its top-level "data" key.
# NOTE(review): LoadJsonData is not defined in this notebook; presumably it
# comes from the `my_utils` star import — confirm.
data = LoadJsonData(train_data)["data"]

In [3]:
# Second argument handed to LoadGloveEmbedding below.
# NOTE(review): presumably the dimensionality of the GloVe vectors — confirm.
glove_dim = 100
# NOTE(review): LoadGloveEmbedding is not defined in this notebook (presumably
# from `my_utils`); the type of the returned object is not visible here.
glove_embedding = LoadGloveEmbedding(glove_dir, glove_dim)

In [36]:
# Split the loaded JSON records into three lookup tables:
#   para_dict       = {paraID: [list of words in paragraph]}
#   para_to_qa_dict = {paraID: [list of qaIDs associated with paragraph]}
#   qa_data_dict    = {qaID: (paraID, list of words in question, answer_start_index, answer_end_index)}
para_dict, para_to_qa_dict, qa_data_dict = ParseJsonData(data)

# Pass glove_dim instead of repeating the literal 100, so this call cannot
# drift from the dimension used to load glove_embedding.
# NOTE(review): assumes the third argument is the embedding dimension — confirm
# against GetGloveRepresentation's signature.
para_glove_dict = GetGloveRepresentation(para_dict, glove_embedding, glove_dim)

In [7]:
# Number of paragraph IDs grouped into each batch.
batch_size = 10

In [None]:
#for batch in all_batches:
    #max_para_length = max(map(lambda paraID: len(para_dict[paraID]), batch)) #length of longest paragraph in this batch
    #pad all shorter paragraphs to max_para_length, with special token '<pad>'

In [None]:
"""Data Pipeline
During testing:
- Load batches of data as WORDS, padded with a special token.
- For words that are in the vocab (we have a learned embedding), retrieve the associated INTEGER
  and use that INTEGER to do embedding_lookup. Then check whether the word has a GloVe embedding:
  if yes, add it; if no, add a random vector.
- For words that are not in the vocab, replace them with INTEGER 0, using 0 for embedding_lookup to
  retrieve the learned representation for general OOV words. If the word has a GloVe embedding,
  add it; if no, add zeros.
"""

In [44]:
class DataLoader():
    """Preprocesses data, creates batches, and provides the next batch of data.

    The loader works with four files inside ``data_dir``:

    * ``all_text.txt``        -- raw text the vocabulary is built from (input)
    * ``vocab.pkl``           -- pickled ``set`` of all distinct tokens
    * ``words_integers.pkl``  -- pickled ``{word: integer}`` mapping
    * ``integers_words.pkl``  -- pickled ``{integer: word}`` mapping

    The three pickle files are written by ``map_words_to_integers``.
    """
    def __init__(self, data_dir, batch_size, encoding="utf-8", training=True):
        """Set up file paths and make sure a vocabulary is available.

        Args:
            data_dir: directory holding ``all_text.txt`` and the pickle files.
            batch_size: number of paragraph IDs per batch (see ``create_batches``).
            encoding: text encoding used to read ``all_text.txt``.
            training: when True, missing vocabulary/mapping files are generated
                from ``all_text.txt``; when False, a missing vocab file is an error.

        Raises:
            RuntimeError: ``training`` is False and the vocab file does not exist.
        """
        self.train_or_test = "train" if training else "test"
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.encoding = encoding
        self.all_txt_file = os.path.join(data_dir, "all_text.txt")
        self.vocab_file = os.path.join(data_dir, "vocab.pkl")
        self.integers_words_file = os.path.join(data_dir, "integers_words.pkl")
        self.words_integers_file = os.path.join(data_dir, "words_integers.pkl")

        if training:
            # Only rebuild when one of the generated files is missing; the
            # pickles act as a cache of the (potentially slow) tokenisation.
            generated_files = (self.vocab_file,
                               self.words_integers_file,
                               self.integers_words_file)
            if not all(os.path.exists(path) for path in generated_files):
                print("generating vocab from training data...")
                self.map_words_to_integers()
        else: #testing
            # Never build a vocabulary at test time: it must come from training.
            if not os.path.exists(self.vocab_file):
                raise RuntimeError("Could not find vocab file. Train first before testing!")

        print("loading vocab file...")
        # NOTE: pickle.load can execute arbitrary code; only point data_dir at
        # files this class produced itself.
        with open(self.vocab_file, 'rb') as f:
            self.vocab = pickle.load(f)

    def map_chars_to_integers(self):
        """Placeholder for a character-level mapping; not implemented yet."""
        pass

    def map_words_to_integers(self):
        """Takes all the text in the training data and assigns an integer to each word.

        The text is lower-cased and split on every non-word character. The
        separators themselves are kept as tokens, except for the single space
        character (other whitespace such as newlines is kept). Tokens are
        numbered from 0 in order of decreasing frequency.

        Writes ``vocab.pkl``, ``words_integers.pkl`` and ``integers_words.pkl``.
        """
        with codecs.open(self.all_txt_file, "r", encoding=self.encoding) as f:
            text = f.read().lower()
        # Raw string: '\W' in a normal literal is an invalid escape sequence.
        tokens = [tok for tok in re.split(r'(\W)', text) if tok and tok != " "]
        counter = collections.Counter(tokens)
        count_pairs = sorted(counter.items(), key=lambda pair: -pair[1])
        # A comprehension (instead of zip(*...)) also works for an empty corpus.
        words = [word for word, _ in count_pairs]

        with open(self.vocab_file, 'wb') as f:
            pickle.dump(set(words), f)
        words_integers = {word: index for index, word in enumerate(words)} #{word:integer}
        integers_words = dict(enumerate(words)) #{integer:word}
        with open(self.words_integers_file, 'wb') as f:
            pickle.dump(words_integers, f)
        with open(self.integers_words_file, 'wb') as f:
            pickle.dump(integers_words, f)

    def preprocess_words(self, words=None):
        """Uses the words_integers map to transform data to integers.

        Args:
            words: iterable of word tokens. Defaults to ``self.data`` when
                omitted, which the caller must have set beforehand.

        Returns:
            A numpy array with one entry per input word. Words missing from
            the mapping become ``None`` (the result of ``dict.get``).

        Raises:
            AttributeError: ``words`` is omitted and ``self.data`` is not set.
        """
        if words is None:
            words = getattr(self, "data", None)
            if words is None:
                raise AttributeError(
                    "DataLoader has no `data` to preprocess; "
                    "pass `words` or set `self.data` first")
        with open(self.words_integers_file, 'rb') as f:
            words_integers = pickle.load(f)
        return np.array(list(map(words_integers.get, words)))

    def preprocess_chars(self):
        """Placeholder for character-level preprocessing; not implemented yet."""
        pass

    def load_preprocessed(self):
        """Loads word-integer mapping of text (not implemented yet)."""
        pass

    def create_batches(self, para_dict):
        """Groups paraIDs into groups of ``self.batch_size``.

        IDs are taken from the end of ``para_dict``'s iteration order. Every
        returned batch has exactly ``self.batch_size`` entries:

        * leftovers that do not fill a batch overwrite the first slots of a
          copy of the previous batch;
        * if there is no previous batch (fewer IDs than ``batch_size``), the
          leftovers are repeated cyclically to fill the batch.

        Args:
            para_dict: any iterable of paragraph IDs (a dict iterates its keys).

        Returns:
            A list of batches, each a list of paragraph IDs; empty when
            ``para_dict`` is empty.

        Raises:
            ValueError: ``self.batch_size`` is smaller than 1.
        """
        size = self.batch_size
        if size < 1:
            raise ValueError("batch_size must be a positive integer")

        para_deque = collections.deque(para_dict)
        all_batches = []
        for _ in range(len(para_deque) // size):
            all_batches.append([para_deque.pop() for _ in range(size)])

        leftovers = list(para_deque)
        if leftovers:
            if all_batches:
                last_batch = list(all_batches[-1]) #copy previous batch
                last_batch[:len(leftovers)] = leftovers #replace some spots with leftovers
            else:
                last_batch = [leftovers[i % len(leftovers)] for i in range(size)]
            all_batches.append(last_batch)
        return all_batches

    #TODO
    def next_batch(self):
        """Returns the next batch of data (not implemented yet)."""
        pass
        

In [45]:
"""Train.py"""
# Build the loader over data_dir in training mode (the default).
# Pass the notebook-level batch_size rather than repeating the literal 10, so
# the loader and the rest of the notebook cannot drift apart.
data_loader = DataLoader(data_dir, batch_size=batch_size)

loading vocab file...


In [46]:
# Group the paragraph IDs (the keys of para_dict) into a list of batches,
# each batch being a list of paraIDs.
all_batches = data_loader.create_batches(para_dict)

In [47]:
# Open a TensorFlow session and run the variable-initialisation op in it.
# NOTE(review): `tf` is not imported in any visible cell; presumably it arrives
# through `from my_utils import *` — confirm.
# NOTE(review): tf.initialize_all_variables appears to be deprecated in later
# TensorFlow 1.x releases in favour of tf.global_variables_initializer —
# confirm against the installed version before switching.
sess = tf.Session()
sess.run(tf.initialize_all_variables())

[['Tibet_0020',
  'Age_of_Enlightenment_0028',
  'Hellenistic_period_0041',
  'Rajasthan_0003',
  'Poultry_0004',
  'San_Diego_0012',
  'Israel_0076',
  'New_Haven,_Connecticut_0005',
  'Antenna_(radio)_0036',
  'Computer_0077'],
 ['New_York_City_0048',
  'Catalan_language_0029',
  'Madonna_(entertainer)_0013',
  'Institute_of_technology_0035',
  'United_States_Army_0025',
  'Appalachian_Mountains_0011',
  'History_of_India_0026',
  'Dwight_D._Eisenhower_0019',
  'Bird_migration_0034',
  'Central_African_Republic_0009'],
 ['The_Blitz_0022',
  'Greece_0008',
  'Alfred_North_Whitehead_0042',
  'Richard_Feynman_0013',
  'Russian_language_0003',
  'Jehovah%27s_Witnesses_0011',
  'North_Carolina_0025',
  'Kanye_West_0016',
  'Richmond,_Virginia_0038',
  'Dutch_language_0042'],
 ['Utrecht_0014',
  'Dwight_D._Eisenhower_0072',
  'Translation_0007',
  'John_Kerry_0024',
  'Estonia_0045',
  'Copyright_infringement_0006',
  'Oklahoma_City_0014',
  'Carnival_0017',
  'Montevideo_0018',
  'Univers