# Dataset

In [None]:
run ../utils/preprocess.py

In [None]:
category = "Video_Games"
C.INPUT_DATA_PATH = '../../data/90_input'
with open('%s/train-%s.pickle' % (C.INPUT_DATA_PATH, category), 'rb') as f:
    train_data = pd.read_pickle(f)

In [None]:
train_data.head()

In [None]:
vocab = Vocabulary(10000)

In [None]:
def tokenize(text):
    punctuations = string.punctuation.replace("\'", '')
    
    for ch in punctuations:
        text = text.replace(ch, " "+ch+" ")
    
    tokens = text.split()
    
    for i in range(len(tokens)):
        token = tokens[i]
        if token.isupper() == False:
            tokens[i] = token.lower()
    
    return tokens

In [None]:
tokenize("Hi, Hello! CMU. don't ")

In [None]:
for index, row in train_data.iterrows():
    questionsList = row['questionsList']
    for question in questionsList:
        if 'text' in question:
            text = question['text']
            vocab.add_sequence(tokenize(text))

            for answer in question['answers']:
                text = answer['text']
                vocab.add_sequence(tokenize(text))
    
    reviewsList = row['reviewsList']
    for review in reviewsList:
        text = review['text']
        vocab.add_sequence(tokenize(text))

In [None]:
vocab.sort_vocabulary()
vocab._token2index

In [None]:
# convert reviews row to json
def reviewsToIds(row):
    reviewIdsList = []
    for review in row:
        reviewJson = {}
        text = C.SOS + review[C.TEXT] + C.EOS
        reviewJson[C.IDS] = vocab.indices_from_sequence(tokenize(text))
        reviewIdsList.append(reviewJson)

    return reviewIdsList

In [None]:
def questionsToIds(questions_list):
    new_questions_list = []
    for question in questions_list:
        if C.TEXT in question:
            new_question = {}
            text = C.SOS + question[C.TEXT] + C.EOS
            new_question[C.IDS] = vocab.indices_from_sequence(tokenize(text))

            new_answers = []
            for answer in question[C.ANSWERS]:
                new_answer = {}
                text = C.SOS + answer[C.TEXT] + C.EOS
                new_answer[C.IDS] = vocab.indices_from_sequence(tokenize(text))
            new_answers.append(new_answer)

            new_question[C.ANSWER_IDS_LIST] = new_answers
            new_questions_list.append(new_question)

    return new_questions_list

In [None]:
train_data[C.REVIEW_IDS_LIST] = train_data[C.REVIEWS_LIST].apply(reviewsToIds)

In [None]:
train_data[C.QUESTION_IDS_LIST] = train_data[C.QUESTIONS_LIST].apply(questionsToIds)

In [None]:
train_data[C.QUESTION_IDS_LIST].iloc[0]

In [None]:
data = []
mode = "3"

for index, row in train_data.iterrows():
    questionIdsList = row[C.QUESTION_IDS_LIST]
    for question in questionIdsList:
        tup = ()
        
        if mode is not "1":
            ids = question[C.IDS]
            tup += (ids,)

        for answer in question[C.ANSWER_IDS_LIST]:
            ids = answer[C.IDS]
            data.append(tup+(ids,))
    
    if mode is "3":
        reviewsList = row[C.REVIEW_IDS_LIST]
        reviewIds = []
        
        for review in reviewsList[0:2]:
            ids = review[C.IDS]
            reviewIds.append(ids)
        
        for i in range(len(data)):
            data[i] += (reviewIds,)

In [None]:
data[0][2]

In [None]:
class Vocabulary(object):
    """
    A Vocabulary stores a set of words in the corpus mapped to unique integer IDs.

    In addition to the words in the actual language, a Vocabulary includes three
    reserved tokens (and IDs) for the start-of-sequence and end-of-sequence
    markers, and for a special 'UNK' marker used to handle rare/unknown words.

    The Vocabulary is sorted in descending order based on frequency. If the
    number of words seen is greater than the maximum size of the Vocabulary,
    the remaining least-frequent words are ignored.

    Args: size(int): maximum number of words allowed in this vocabulary
    """
    def __init__(self, max_vocab_size):
        self.PAD_token_name = "<PAD>"
        self.UNK_token_name = "<UNK>"
        self.SOS_token_name = "<SOS>"
        self.EOS_token_name = "<EOS>"
        self.PAD_token_id = 0
        self.UNK_token_id = 1
        self.SOS_token_id = 2
        self.EOS_token_id = 3

        self._reserved = set([self.PAD_token_name, self.UNK_token_name, \
                self.SOS_token_name, self.EOS_token_name])
        self._reserved_token_id = [
                (self.PAD_token_name, self.PAD_token_id),
                (self.UNK_token_name, self.UNK_token_id),
                (self.SOS_token_name, self.SOS_token_id),
                (self.EOS_token_name, self.EOS_token_id)
        ]

        self._token2index = dict([(tok, idx) for tok, idx in self._reserved_token_id])
        self._index2token = dict([(idx, tok) for tok, idx in self._reserved_token_id])

        self._token2count = {}

        self._num_tokens = 0
        self._num_reserved = 4
        self.sorted = False
        self.size = max_vocab_size

    def trim(self):
        """
        Sorts the vocabulary in descending order based on frequency
        """
        sorted_vocab_count = sorted(self._token2count.items(), \
                key=lambda x: x[1], reverse=True)[:self.size]
        self._token2index = dict( [ (w, self._num_reserved + idx) \
                for idx, (w, _) in enumerate(sorted_vocab_count) ] )
        self._index2token = dict( [ (idx, w) \
                for w, idx in self._token2index.items() ])

        for tok, idx in self._reserved_token_id:
            self._token2index[tok] = idx
            self._index2token[idx] = tok

        if self._num_tokens > self.size:
            self._num_tokens = self.size

        self.sorted = True

    def sort_vocabulary(self):
        """
        Sorts the vocabulary (if it is not already sorted).
        """
        if not self.sorted:
            self.trim()


    def get_index(self, token):
        """
        Returns: int: ID of the given token.
        """
        self.sort_vocabulary()
        return self._token2index[token]


    def get_token(self, index):
        """
        Returns: str: token with ID equal to the given index.
        """
        self.sort_vocabulary()
        return self._index2token[index]


    def get_vocab_size(self):
        """
        Returns: int: maximum number of words in the vocabulary.
        """
        self.sort_vocabulary()
        return self._num_tokens + self._num_reserved


    def add_token(self, token):
        """
        Adds an occurrence of a token to the vocabulary,
        incrementing its observed frequency if the word already exists.
        Args: token (int): word to add
        """
        if token in self._reserved:
            return
        if token not in self._token2count:
            self._token2count[token] = 1
            self._num_tokens += 1
        else:
            self._token2count[token] += 1
        self.sorted = False

    def add_sequence(self, sequence):
        """
        Adds a sequence of words to the vocabulary.
        Args: sequence(list(str)): list of words, e.g. representing a sentence.
        """
        for tok in sequence:
            self.add_token(tok)

    def indices_from_sequence(self, sequence):
        """
        Maps a list of words to their token IDs, or else <UNK>
        if the word is rare/unknown.
        Args: sequence (list(str)): list of words to map
        Returns: list(int): list of mapped IDs
        """
        self.sort_vocabulary()
        return [self._token2index[tok]
                if tok in self._token2index
                else self.UNK_token_id
                for tok in sequence]

    def sequence_from_indices(self, indices):
        """
        Recover a sentence from a list of token IDs.
        Args: indices (list(int)): list of token IDs.
        Returns: list(str): recovered sentence, represented as a list of words
        """
        seq = [self._index2token[idx] for idx in indices]
        return seq


    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        self.sort_vocabulary()
        other.sort_vocabulary()

        if self._token2count == other._token2count \
                and self._token2index == other._token2index \
                and self._index2token == other._index2token:
            return True
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self._token2index)


# DataLoader

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

import numpy as np
from operator import itemgetter, attrgetter
from torch.utils.data import Dataset, DataLoader

In [25]:
reviews1 = [[1,2], [1,2,3],[6,7,8]]
reviews2 = [[1], [1,2]]

question1 = [1,2,3]
question2 = [2,3,4,5]

answer1 = [1,3]
answer2 = [2]

data1 = [(answer1), (answer2)]
data2 = [(answer2, question1), (answer1, question2)]
data3 = [(answer1, question1, reviews2), (answer2, question2, reviews1)]

In [26]:
print(data1)
print(data2)
print(data3)

[[1, 3], [2]]
[([2], [1, 2, 3]), ([1, 3], [2, 3, 4, 5])]
[([1, 3], [1, 2, 3], [[1], [1, 2]]), ([2], [2, 3, 4, 5], [[1, 2], [1, 2, 3], [6, 7, 8]])]


In [None]:
a, q = zip(*data2)
print(a)
print(q)

In [22]:
class AmazonDataLoader(DataLoader):

    def sortByLength(self, item):
        if self.mode is "1":
            return len(item)
        
        elif self.mode is "2":
            assert(len(item) == 2)
            print(item[0])
            return len(item[0])
        
        elif self.mode is "3":
            assert(len(item) == 3)
            reviews = item[2]
            max_len = 0
            for review in reviews:
                max_len = max(max_len, len(review))
            return max_len

        
    def __init__(self, data, mode, batch_size):
        self.batch_size = batch_size
        self.mode = mode
        
        data = sorted(data, key=self.sortByLength, reverse=True) 
        self.data = data
        

    def create_packed_qa(self, batch_data):
        lengths = np.array([len(item) for item in batch_data])
        max_len = max(lengths)
        
        padded_data = np.array( [np.pad(item, (0, max_len-len(item)), 'constant') for item in batch_data] )
        padded_data = torch.from_numpy(padded_data)
        
        return (padded_data)
    
    def create_packed_reviews(self, review_data):
        max_num_reviews = 0
        for reviews in review_data:
            max_num_reviews = max(max_num_reviews, len(reviews))
        
        data = []
        for i in range(max_num_reviews):
            batch_data = []
            for j in range(self.batch_size):
                reviews = review_data[j]
                if i < len(reviews):
                    batch_data.append(reviews[i])
                else:
                    batch_data.append([0])
            data.append(batch_data)
        
        padded_data = []
        for i in range(max_num_reviews):
            batch_data = data[i]
            lengths = [len(review) for review in batch_data]
            max_len = max(lengths)
            
            padded_batch_data = np.array([np.pad(item, (0, max_len-len(item)), 'constant') for item in batch_data])
            padded_batch_data = torch.from_numpy(padded_batch_data)
            
            padded_data.append(padded_batch_data)
            
        return padded_data
        
    def __iter__(self):
        print(self.data)
        self.num_batches = len(self.data) // self.batch_size
        indices = np.arange(self.num_batches)
        np.random.shuffle(indices)

        for index in indices:
            start = index*self.batch_size
            end = (index+1)*self.batch_size
            
            batch_data = self.data[start:end]
            assert(self.batch_size == len(batch_data))
            
            if self.mode is "1":
                answers = batch_data
                print(answers)
                packed_answers = self.create_packed_qa(list(answers))
                yield (packed_answers)
                
            elif self.mode is "2":
                answers, questions = zip(*batch_data)
                packed_answers = self.create_packed_qa(list(answers))
                packed_questions = self.create_packed_qa(list(questions))
                yield (packed_answers, packed_questions)
            
            elif self.mode is "3":
                answers, questions, reviews = zip(*batch_data)
                packed_answers = self.create_packed_qa(list(answers))
                packed_questions = self.create_packed_qa(list(questions))
                packed_reviews = self.create_packed_reviews(list(reviews))
                yield (packed_answers, packed_questions, packed_reviews)
                
    def __len__(self):
        return self.num_batches

In [27]:
data_loader = AmazonDataLoader(data3, "3", 2)

In [28]:
for batch_idx, data in enumerate(data_loader):
    answers = data
    print(data)

[([2], [2, 3, 4, 5], [[1, 2], [1, 2, 3], [6, 7, 8]]), ([1, 3], [1, 2, 3], [[1], [1, 2]])]
(
 2  0
 1  3
[torch.LongTensor of size (2,2)]
, 
 2  3  4  5
 1  2  3  0
[torch.LongTensor of size (2,4)]
, [
 1  2
 1  0
[torch.LongTensor of size (2,2)]
, 
 1  2  3
 1  2  0
[torch.LongTensor of size (2,3)]
, 
 6  7  8
 0  0  0
[torch.LongTensor of size (2,3)]
])


In [None]:
def sortByMaxReviewLength(item):
    reviews = item[2]
    max_len = 0
    for review in reviews:
        max_len = max(max_len, review.shape[0])
    return max_len

for item in data3:
    assert(len(item) == 3)

sorted(data3, key=sortByMaxReviewLength, reverse=True)

In [None]:
def sortByAnswerLength(item):
    return item[0].shape[0]

sorted(data2, key=sortByAnswerLength, reverse=True)

In [None]:
answers = data1
print(answers)

answers, questions = zip(*data2)
print(data2)
print(answers)
print(questions)

answers, questions, reviews = zip(*data3)
print(data3)
print(answers)
print(questions)
print(reviews)

In [None]:
x = Variable(torch.randn(10, 10, 30))
lens = list(range(1, 11))
x = pack_padded_sequence(x, lens[::-1], batch_first=True)
y = pad_packed_sequence(x)