In [1]:
from textdataBeer import TextDataBeer
import time, sys
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import time, datetime
import math, random
import nltk
import pickle
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
# import matplotlib.pyplot as plt
import numpy as np
import copy
from LanguageModel_beer import LanguageModel

import LSTM_IB_GAN_beer



In [3]:

import numpy as np
import nltk  # For tokenize
from tqdm import tqdm  # Progress bar
import pickle  # Saving the data
import math  # For float comparison
import os  # Checking file existance
import random, gzip
import string, copy
from nltk.tokenize import word_tokenize
import jieba
import json
from Hyperparameters import args

class Batch:
    """Plain container for the per-batch lists filled by the batch builders.

    Every field starts out as its own empty list.
    """
    def __init__(self):
        # Encoder inputs and their unpadded lengths
        self.encoderSeqs, self.encoder_lens = [], []
        # One label entry per sample
        self.label = []
        # Decoder inputs, decoder targets and unpadded decoder lengths
        self.decoderSeqs, self.targetSeqs, self.decoder_lens = [], [], []
        # Per-sample rationale entries and raw (untokenised-to-id) sentences
        self.rationals, self.raw = [], []


class TextDataBeer:
    """Dataset class
    Warning: No vocabulary limit
    """


    def __init__(self, corpusname, trainLM =False):
        """Pick a tokenizer, load the corpus and print a short summary.

        Args:
            corpusname (str): 'beer' selects nltk's word_tokenize, 'cail'
                selects a jieba-based tokenizer; any other value leaves
                self.tokenizer unset.
            trainLM (bool): when true the data comes from load3aspects(),
                otherwise from loadCorpus_Beer().
        """
        # Tokenizer matching the corpus
        if corpusname == 'beer':
            self.tokenizer = word_tokenize
        elif corpusname == 'cail':
            self.tokenizer = lambda x: list(jieba.cut(x))

        # Never filled by the code visible in this file; only read by _printStats
        self.trainingSamples = []

        loader = self.load3aspects if trainLM else self.loadCorpus_Beer
        self.datasets = loader()

        print('set')
        # Plot some stats:
        self._printStats(corpusname)

        if args['playDataset']:
            self.playDataset()

        # Cache of already-built batches, keyed by split name
        self.batches = {}

    def _printStats(self, corpusname):
        print('Loaded {}: {} words, {} QA'.format(corpusname, len(self.word2index), len(self.trainingSamples)))


    def shuffle(self):
        """Shuffle the training samples
        """
        print('Shuffling the dataset...')
        random.shuffle(self.datasets['train'])

    def _createBatch(self, samples):
        """Create a single batch from the list of sample. The batch size is automatically defined by the number of
        samples given.
        The inputs should already be inverted. The target should already have <go> and <eos>
        Warning: This function should not make direct calls to args['batchSize'] !!!
        Args:
            samples (list<Obj>): a list of samples, each sample being on the form [input, target]
        Return:
            Batch: a batch object en
        """

        batch = Batch()
        batchSize = len(samples)

        # Create the batch tensor
        for i in range(batchSize):
            # Unpack the sample
            sen_ids, y, raw_sen, rational = samples[i]

            if len(sen_ids) > args['maxLengthEnco']:
                sen_ids = sen_ids[:args['maxLengthEnco']]

            batch.encoderSeqs.append(sen_ids)
            batch.encoder_lens.append(len(batch.encoderSeqs[i]))
            batch.label.append(y)
            batch.rationals.append(rational)
            batch.raw.append(raw_sen)
            # print(y)

        maxlen_enc = max(batch.encoder_lens)


        for i in range(batchSize):
            batch.encoderSeqs[i] = batch.encoderSeqs[i] + [self.word2index['PAD']] * (maxlen_enc - len(batch.encoderSeqs[i]))


        return batch

    def getBatches(self, setname = 'train'):
        """Prepare the batches for the current epoch
        Return:
            list<Batch>: Get a list of the batches for the next epoch
        """
        if setname not in self.batches:
            # self.shuffle()

            batches = []
            print(setname, 'size:', len(self.datasets[setname]))
            def genNextSamples():
                """ Generator over the mini-batch training samples
                """
                for i in range(0, self.getSampleSize(setname), args['batchSize']):
                    yield self.datasets[setname][i:min(i + args['batchSize'], self.getSampleSize(setname))]

            # TODO: Should replace that by generator (better: by tf.queue)

            for index, samples in enumerate(genNextSamples()):
                # print([self.index2word[id] for id in samples[5][0]], samples[5][2])
                batch = self._createBatch(samples)
                batches.append(batch)

            self.batches[setname] = batches

        # print([self.index2word[id] for id in batches[2].encoderSeqs[5]], batches[2].raws[5])
        return self.batches[setname]

    def _createBatch_forLM(self, samples):
        """Create a single batch from the list of sample. The batch size is automatically defined by the number of
        samples given.
        The inputs should already be inverted. The target should already have <go> and <eos>
        Warning: This function should not make direct calls to args['batchSize'] !!!
        Args:
            samples (list<Obj>): a list of samples, each sample being on the form [input, target]
        Return:
            Batch: a batch object en
        """

        batch = Batch()
        batchSize = len(samples)

        # Create the batch tensor
        for i in range(batchSize):
            # Unpack the sample
            sen_ids = samples[i]
            if len(sen_ids) > args['maxLengthEnco']:
                sen_ids = sen_ids[:args['maxLengthEnco']]
            batch.decoderSeqs.append([self.word2index['START_TOKEN']] + sen_ids)
            batch.decoder_lens.append(len(batch.decoderSeqs[i]))
            batch.targetSeqs.append(sen_ids + [self.word2index['END_TOKEN']])

        # print(batch.decoderSeqs)
        # print(batch.decoder_lens)
        maxlen_dec = max(batch.decoder_lens)
        maxlen_dec = min(maxlen_dec, args['maxLengthEnco'])

        for i in range(batchSize):
            batch.decoderSeqs[i] = batch.decoderSeqs[i] + [self.word2index['PAD']] * (maxlen_dec - len(batch.decoderSeqs[i]))
            batch.targetSeqs[i] = batch.targetSeqs[i] + [self.word2index['PAD']] * (maxlen_dec - len(batch.targetSeqs[i]))

        return batch

    def paragraph2sentence(self, doclist):
        split_tokens = [self.word2index['.']]
        sen_list = []
        for sen_ids, y, raw_sen, rational in doclist:
            start = 0
            for ind, w in enumerate(sen_ids):
                if w in split_tokens:
                    sen_list.append(sen_ids[start:ind + 1])
                    start = ind + 1

            if start < len(sen_ids) - 1:
                sen_list.append(sen_ids[start:])

        return sen_list

    def getBatches_forLM(self, setname = 'train'):
        """Prepare the batches for the current epoch
        Return:
            list<Batch>: Get a list of the batches for the next epoch
        """
        if setname not in self.batches:
            self.shuffle()

            dataset_sen = self.paragraph2sentence(self.datasets[setname])
            sennum = len(dataset_sen)
            print(sennum)

            batches = []
            print(len(self.datasets[setname]))
            def genNextSamples():
                """ Generator over the mini-batch training samples
                """
                for i in range(0, sennum, args['batchSize']):
                    yield dataset_sen[i:min(i + args['batchSize'], sennum)]

            # TODO: Should replace that by generator (better: by tf.queue)

            for index, samples in enumerate(genNextSamples()):
                # print([self.index2word[id] for id in samples[5][0]], samples[5][2])
                batch = self._createBatch_forLM(samples)
                batches.append(batch)

            self.batches[setname] = batches

        # print([self.index2word[id] for id in batches[2].encoderSeqs[5]], batches[2].raws[5])
        return self.batches[setname]

    def getSampleSize(self, setname = 'train'):
        """Return the size of the dataset
        Return:
            int: Number of training samples
        """
        return len(self.datasets[setname])

    def getVocabularySize(self):
        """Return the number of words present in the dataset
        Return:
            int: Number of word on the loader corpus
        """
        return len(self.word2index)


    def loadCorpus_Beer(self):
        """Load/create the conversations data
        """
        self.basedir = './data/beer/'
        self.corpus_file_train = self.basedir + 'reviews.aspect'+str(args['aspect'])+'.train.txt'
        self.corpus_file_dev =  self.basedir + 'reviews.aspect'+str(args['aspect'])+'.heldout.txt'
        self.corpus_file_test =  self.basedir + 'annotations.json'
        self.embfile = self.basedir + 'review+wiki.filtered.200.txt.gz'
        self.data_dump_path = args['rootDir'] + '/Beerdata'+str(args['aspect'])+'.pkl'

        print(self.data_dump_path)
        datasetExist = os.path.isfile(self.data_dump_path)

        if not datasetExist:  # First time we load the database: creating all files
            print('Training data not found. Creating dataset...')

            total_words = []
            dataset = {'train': [], 'dev':[], 'test':[]}

            self.word2index, self.index2word, self.index2vector = self.read_word2vec_from_pretrained(self.embfile,
                                                                                                     topk_word_num=-1)
            self.index2word_set = set(self.index2word)

            with open(self.corpus_file_train, 'r',encoding="utf-8") as rhandle:
                lines = rhandle.readlines()

                for line in tqdm(lines):
                    y, sep, x = line.partition("\t")
                    x, y = x.split(), y.split()
                    if len(x) == 0: continue
                    y = np.asarray([float(v) for v in y])

                    dataset['train'].append((x, y, -1))

            with open(self.corpus_file_dev, 'r') as rhandle:
                lines = rhandle.readlines()

                for line in tqdm(lines):
                    y, sep, x = line.partition("\t")
                    x, y = x.split(), y.split()
                    if len(x) == 0: continue
                    y = np.asarray([float(v) for v in y])

                    dataset['dev'].append((x, y, -1))

            with open(self.corpus_file_test, 'r') as rhandle:
                for line in tqdm(rhandle.readlines()):
                    review = json.loads(line)
                    word_seq = review['x']
                    y = np.asarray(review['y'])
                    # raw_x = eval(review['raw'])['review/text']
                    # words = self.tokenizer(raw_x.lower())
                    rational={}
                    for i in range(5):
                        intervals = review[str(i)]
                        all_rw_in_rational = []
                        for start,end in intervals:
                            rw = word_seq[start:end]
                            all_rw_in_rational.extend(rw)
                        rational[i]=set([self.word2index[w] for w in set(all_rw_in_rational) if w in self.word2index])


                    dataset['test'].append((word_seq, y, rational))

            print(len(dataset['train']), len(dataset['dev']), len(dataset['test']))


            # self.raw_sentences = copy.deepcopy(dataset)
            for setname in ['train', 'dev', 'test']:
                dataset[setname] = [(self.TurnWordID(sen), y, sen, rational) for sen, y, rational in tqdm(dataset[setname])]

            # Saving
            print('Saving dataset...')
            self.saveDataset(self.data_dump_path, dataset)  # Saving tf samples
        else:
            dataset = self.loadDataset(self.data_dump_path)
            print('loaded')

        return  dataset

    def saveDataset(self, filename, datasets):
        """Save samples to file
        Args:
            filename (str): pickle filename
        """
        with open(os.path.join(filename), 'wb') as handle:
            data = {  # Warning: If adding something here, also modifying loadDataset
                'word2index': self.word2index,
                'index2word': self.index2word,
                'index2vector': self.index2vector,
                'datasets': datasets
            }
            pickle.dump(data, handle, -1)  # Using the highest protocol available


    def loadDataset(self, filename):
        """Load samples from file
        Args:
            filename (str): pickle filename
        """
        dataset_path = os.path.join(filename)
        print('Loading dataset from {}'.format(dataset_path))
        with open(dataset_path, 'rb') as handle:
            data = pickle.load(handle)  # Warning: If adding something here, also modifying saveDataset
            self.word2index = data['word2index']
            self.index2word = data['index2word']
            self.index2vector = data['index2vector']
            datasets = data['datasets']
            dataset_all = data
        print('training: \t', len(datasets['train']))
        print('dev: \t', len(datasets['dev']))
        print('testing: \t', len(datasets['test']))
        self.index2word_set = set(self.index2word)
        print('w2i shape: ', len(self.word2index))
        print('i2w shape: ', len(self.index2word))
        print('embeding shape: ', self.index2vector.shape)
        return  datasets, data

    def load3aspects(self):
        self.data_dump_path1 = args['rootDir'] + '/Beerdata0.pkl'
        self.data_dump_path2 = args['rootDir'] + '/Beerdata1.pkl'
        self.data_dump_path3 = args['rootDir'] + '/Beerdata2.pkl'
        d1 = self.loadDataset(self.data_dump_path1)
        d2 = self.loadDataset(self.data_dump_path2)
        d3 = self.loadDataset(self.data_dump_path3)
        data={'train':[], 'dev':[], 'test':[]}
        data['train']=d1['train']+d2['train']+d3['train']
        data['dev']=d1['dev']+d2['dev']+d3['dev']
        data['test']=d1['test']
        return data

    def read_word2vec(self, vocfile ):
        word2index = dict()
        word2index['PAD'] = 0
        word2index['START_TOKEN'] = 1
        word2index['END_TOKEN'] = 2
        word2index['UNK'] = 3
        cnt = 4
        with open(vocfile, "r") as v:

            for line in v:
                word = line.strip().split()[0]
                word2index[word] = cnt
                print(word,cnt)
                cnt += 1

        print(len(word2index),cnt)
        # dic = {w:numpy.random.normal(size=[int(sys.argv[1])]).astype('float32') for w in word2index}
        print ('Dictionary Got!')
        return word2index

    def read_word2vec_from_pretrained(self, embfile, topk_word_num=-1):
        fopen = gzip.open if embfile.endswith(".gz") else open
        word2index = dict()
        word2index['PAD'] = 0
        word2index['START_TOKEN'] = 1
        word2index['END_TOKEN'] = 2
        word2index['UNK'] = 3
        # word2index['PAD'] = 1
        # word2index['UNK'] = 0

        cnt = 4
        vectordim = -1
        index2vector = []
        with fopen(embfile, "r") as v:
            lines = v.readlines()
            if topk_word_num > 0:
                lines = lines[:topk_word_num]
            for line in tqdm(lines):
                word_vec = line.strip().split()
                word = bytes.decode(word_vec[0])
                vector = np.asarray([float(value) for value in word_vec[1:]])
                if vectordim == -1:
                    vectordim = len(vector)
                index2vector.append(vector)
                word2index[word] = cnt
                print(word, cnt)
                cnt += 1
        print('before add special token:' , len(index2vector))
        index2vector = [np.random.normal(size=[vectordim]).astype('float32') for _ in range(4)] + index2vector
        print('after add special token:' ,len(index2vector))
        index2vector = np.asarray(index2vector)
        index2word = [w for w, n in word2index.items()]
        print(len(word2index), cnt)
        print('Dictionary Got!')
        return word2index, index2word, index2vector

    def TurnWordID(self, words):
        res = []
        for w in words:
            w = w.lower()
            if w in self.index2word_set:
                id = self.word2index[w]
                res.append(id)
            else:
                res.append(self.word2index['UNK'])
        return res


    def printBatch(self, batch):
        """Print a complete batch, useful for debugging
        Args:
            batch (Batch): a batch object
        """
        print('----- Print batch -----')
        for i in range(len(batch.encoderSeqs[0])):  # Batch size
            print('Encoder: {}'.format(self.batchSeq2str(batch.encoderSeqs, seqId=i)))
            print('Decoder: {}'.format(self.batchSeq2str(batch.decoderSeqs, seqId=i)))
            print('Targets: {}'.format(self.batchSeq2str(batch.targetSeqs, seqId=i)))
            print('Weights: {}'.format(' '.join([str(weight) for weight in [batchWeight[i] for batchWeight in batch.weights]])))

    def sequence2str(self, sequence, clean=False, reverse=False):
        """Convert a list of integer into a human readable string
        Args:
            sequence (list<int>): the sentence to print
            clean (Bool): if set, remove the <go>, <pad> and <eos> tokens
            reverse (Bool): for the input, option to restore the standard order
        Return:
            str: the sentence
        """

        if not sequence:
            return ''

        if not clean:
            return ' '.join([self.index2word[idx] for idx in sequence])

        sentence = []
        for wordId in sequence:
            if wordId == self.word2index['END_TOKEN']:  # End of generated sentence
                break
            elif wordId != self.word2index['PAD'] and wordId != self.word2index['START_TOKEN']:
                sentence.append(self.index2word[wordId])

        if reverse:  # Reverse means input so no <eos> (otherwise pb with previous early stop)
            sentence.reverse()

        return self.detokenize(sentence)

    def detokenize(self, tokens):
        """Slightly cleaner version of joining with spaces.
        Args:
            tokens (list<string>): the sentence to print
        Return:
            str: the sentence
        """
        return ''.join([
            ' ' + t if not t.startswith('\'') and
                       t not in string.punctuation
                    else t
            for t in tokens]).strip().capitalize()

    def batchSeq2str(self, batchSeq, seqId=0, **kwargs):
        """Convert a list of integer into a human readable string.
        The difference between the previous function is that on a batch object, the values have been reorganized as
        batch instead of sentence.
        Args:
            batchSeq (list<list<int>>): the sentence(s) to print
            seqId (int): the position of the sequence inside the batch
            kwargs: the formatting options( See sequence2str() )
        Return:
            str: the sentence
        """
        sequence = []
        for i in range(len(batchSeq)):  # Sequence length
            sequence.append(batchSeq[i][seqId])
        return self.sequence2str(sequence, **kwargs)

    def sentence2enco(self, sentence):
        """Encode a sequence and return a batch as an input for the model
        Return:
            Batch: a batch object containing the sentence, or none if something went wrong
        """

        if sentence == '':
            return None

        # First step: Divide the sentence in token
        tokens = nltk.word_tokenize(sentence)
        if len(tokens) > args['maxLength']:
            return None

        # Second step: Convert the token in word ids
        wordIds = []
        for token in tokens:
            wordIds.append(self.getWordId(token, create=False))  # Create the vocabulary and the training sentences

        # Third step: creating the batch (add padding, reverse)
        batch = self._createBatch([[wordIds, []]])  # Mono batch, no target output

        return batch

    def deco2sentence(self, decoderOutputs):
        """Decode the output of the decoder and return a human friendly sentence
        decoderOutputs (list<np.array>):
        """
        sequence = []

        # Choose the words with the highest prediction score
        for out in decoderOutputs:
            sequence.append(np.argmax(out))  # Adding each predicted word ids

        return sequence  # We return the raw sentence. Let the caller do some cleaning eventually

    def playDataset(self):
        """Print a random dialogue from the dataset
        """
        print('Randomly play samples:')
        print(len(self.datasets['train']))
        for i in range(args['playDataset']):
            idSample = random.randint(0, len(self.datasets['train']) - 1)
            print('sen: {} {}'.format(self.sequence2str(self.datasets['train'][idSample][0], clean=True), self.datasets['train'][idSample][1]))
            print()
        pass


def tqdm_wrap(iterable, *args, **kwargs):
    """Wrap an iterable in tqdm only when it is long enough to be worth a progress bar
    Args:
        iterable (list): An iterable object which define the __len__ method
        *args, **kwargs: the tqdm parameters
    Return:
        iter: the iterable itself when it has at most 100 elements, otherwise
            tqdm(iterable, *args, **kwargs)
    """
    if len(iterable) <= 100:
        return iterable
    return tqdm(iterable, *args, **kwargs)


In [None]:
# Minimal stand-in for the `args` imported from Hyperparameters: running this
# cell rebinds `args` to a dict with only 'aspect'. Code in this notebook also
# reads 'rootDir', 'batchSize', 'maxLengthEnco', 'maxLength', 'playDataset',
# 'device' and 'model_arch' from args.
args = {'aspect':0}

In [5]:
# from textdataBeer import TextDataBeer

class test_beer_data:
    """Small helper that loads a pickled dataset through TextDataBeer.loadDataset.

    loadDataset is called with this object as `self`, so the vocabulary
    attributes it assigns end up on the test_beer_data instance.
    """
    def __init__(self, filename):
        self.filename = filename
        print("in init", self.filename, sep="\n")

    def return_dataset(self):
        """Load self.filename, keep the result in self.dataset and return it."""
        loaded = TextDataBeer.loadDataset(self, self.filename)
        self.dataset = loaded
        return loaded
        
    

In [8]:
test_beer = test_beer_data("./artifacts/Beerdata0.pkl")



in init
./artifacts/Beerdata0.pkl


In [9]:
# return_dataset() forwards loadDataset's 2-tuple: the split mapping
# (data['datasets']) and the whole unpickled dict.
datasets, data_all = test_beer.return_dataset()

Loading dataset from ./artifacts/Beerdata0.pkl
training: 	 80000
dev: 	 10000
testing: 	 994
w2i shape:  147763
i2w shape:  147763
embeding shape:  (147763, 200)


In [47]:
datasets.keys()

dict_keys(['train', 'dev', 'test'])

In [48]:
data_all.keys()

dict_keys(['word2index', 'index2word', 'index2vector', 'datasets'])

In [8]:
data_all['word2index']

{'PAD': 0,
 'START_TOKEN': 1,
 'END_TOKEN': 2,
 'UNK': 3,
 'the': 4,
 '.': 5,
 'a': 6,
 ',': 7,
 'of': 8,
 'and': 9,
 'is': 10,
 'to': 11,
 'in': 12,
 'one': 13,
 'with': 14,
 'it': 15,
 'i': 16,
 'zero': 17,
 'this': 18,
 'two': 19,
 'that': 20,
 'but': 21,
 'for': 22,
 'as': 23,
 'nine': 24,
 'was': 25,
 'beer': 26,
 'on': 27,
 'not': 28,
 'very': 29,
 ':': 30,
 'head': 31,
 'three': 32,
 'eight': 33,
 'four': 34,
 'five': 35,
 'from': 36,
 'some': 37,
 'are': 38,
 'six': 39,
 'at': 40,
 'seven': 41,
 'an': 42,
 'taste': 43,
 'by': 44,
 's': 45,
 'nice': 46,
 'good': 47,
 "'s": 48,
 'there': 49,
 'light': 50,
 'like': 51,
 'be': 52,
 'malt': 53,
 'more': 54,
 'hops': 55,
 'or': 56,
 '-': 57,
 'sweet': 58,
 'have': 59,
 'bit': 60,
 'has': 61,
 'carbonation': 62,
 'flavor': 63,
 'dark': 64,
 'well': 65,
 "n't": 66,
 'aroma': 67,
 'my': 68,
 'little': 69,
 'had': 70,
 'all': 71,
 'out': 72,
 'up': 73,
 'which': 74,
 'into': 75,
 'color': 76,
 'glass': 77,
 'smell': 78,
 'finish': 79,
 '

In [9]:
data_all['index2word']

['PAD',
 'START_TOKEN',
 'END_TOKEN',
 'UNK',
 'the',
 '.',
 'a',
 ',',
 'of',
 'and',
 'is',
 'to',
 'in',
 'one',
 'with',
 'it',
 'i',
 'zero',
 'this',
 'two',
 'that',
 'but',
 'for',
 'as',
 'nine',
 'was',
 'beer',
 'on',
 'not',
 'very',
 ':',
 'head',
 'three',
 'eight',
 'four',
 'five',
 'from',
 'some',
 'are',
 'six',
 'at',
 'seven',
 'an',
 'taste',
 'by',
 's',
 'nice',
 'good',
 "'s",
 'there',
 'light',
 'like',
 'be',
 'malt',
 'more',
 'hops',
 'or',
 '-',
 'sweet',
 'have',
 'bit',
 'has',
 'carbonation',
 'flavor',
 'dark',
 'well',
 "n't",
 'aroma',
 'my',
 'little',
 'had',
 'all',
 'out',
 'up',
 'which',
 'into',
 'color',
 'glass',
 'smell',
 'finish',
 'bottle',
 'just',
 'you',
 'mouthfeel',
 'would',
 'pours',
 'hop',
 'than',
 'no',
 'lacing',
 'body',
 'white',
 'also',
 '...',
 'much',
 'brown',
 'really',
 'alcohol',
 ')',
 'medium',
 'caramel',
 'great',
 '!',
 'its',
 'chocolate',
 'me',
 'poured',
 '(',
 'first',
 'too',
 'so',
 'his',
 'other',
 'n

In [11]:
data_all['word2index'][',']

7

In [8]:
data_all["index2vector"][0]

array([-4.38027054e-01,  2.39060545e+00, -1.01396251e+00,  1.69249162e-01,
        1.96114197e-01, -1.15109585e-01,  8.24877739e-01, -1.55759358e+00,
       -2.29276195e-01, -9.80664860e-04,  4.45391178e-01,  5.16192794e-01,
       -9.95504916e-01,  2.45408490e-01, -8.56612682e-01,  1.44205511e+00,
       -1.22433051e-01, -2.16272026e-01, -2.04418764e-01,  4.06475335e-01,
        2.81324887e+00, -9.10368919e-01, -4.36727554e-01,  3.83445323e-01,
        7.90370628e-02, -5.34986377e-01,  9.95387256e-01,  1.85926068e+00,
        6.98307276e-01,  9.07113433e-01,  1.51528716e+00,  8.76923621e-01,
        1.13870457e-01, -8.01007986e-01,  9.25668359e-01,  3.51027548e-01,
       -6.76974356e-01, -1.82705462e+00, -9.52161074e-01,  6.80048048e-01,
       -5.65241992e-01,  1.15620680e-01,  6.47632629e-02,  1.42970896e+00,
        1.26917517e+00, -2.49857068e+00,  2.81687140e-01,  1.51361096e+00,
       -5.62527597e-01, -4.73937094e-01,  1.51258504e+00,  7.72027254e-01,
        1.40174854e+00, -

In [30]:
class Runner:
    """Entry point that loads the beer data and starts LSTM_IB_GAN_beer training."""
    def __init__(self):
        # Reads args['model_arch'] at construction time, i.e. before main()
        # assigns it below - the key must already be present in args.
        self.model_path = args['rootDir'] + '/chargemodel_' + args['model_arch'] + '.mdl'

    def main(self):
        """Load the dataset, fill the model-related entries of args and train."""

        self.textData = TextDataBeer('beer')
        # self.start_token = self.textData.word2index['START_TOKEN']
        # self.end_token = self.textData.word2index['END_TOKEN']
        # Model settings derived from the loaded data
        args['vocabularySize'] = self.textData.getVocabularySize()
        args['chargenum'] = 5
        args['embeddingSize'] = self.textData.index2vector.shape[1]
        print(self.textData.getVocabularySize())
        args['model_arch'] = 'lstmibgan'
        # args['aspect'] = 0
        args['hiddenSize'] = 200

        print(args)
        if args['model_arch'] == 'lstmibgan':
            print('Using LSTM information bottleneck GAN model for Beer.')
            # NOTE: torch.load unpickles the file - only load trusted checkpoints.
            LM = torch.load(args['rootDir']+'/LMbeer3.pkl', map_location=args['device'])
            # Freeze the loaded language model
            for param in LM.parameters():
                param.requires_grad = False

            # NOTE(review): CalPPL is not defined in the part of this class
            # visible in this cell - confirm where it comes from.
            ppl = self.CalPPL(LM)
            print('PPL=',ppl)
            # LM=0
            LSTM_IB_GAN_beer.train(self.textData, LM, self.textData.index2vector)

AttributeError: 'dict' object has no attribute 'word2index'

In [60]:
len(datasets['train'])

80000

In [10]:
import pandas as pd
test_df = pd.DataFrame(datasets['train'][0:10])
test_df.head()
                 

Unnamed: 0,0,1,2,3
0,"[4, 689, 1009, 14, 18, 26, 10, 20, 15, 61, 88,...","[0.3, 0.3, 0.5, 0.3, 0.3]","[the, main, problem, with, this, beer, is, tha...",-1
1,"[15, 10, 29, 6747, 18, 2852, 466, 59, 187, 5, ...","[0.4, 0.4, 0.6, 0.4, 0.6]","[it, is, very, unfortunate, this, situation, w...",-1
2,"[202, 10, 6, 50, 199, 311, 14, 6, 157, 235, 31...","[0.8, 0.5, 0.6, 0.3, 0.4]","[appearance, is, a, light, golden, yellow, wit...",-1
3,"[15, 61, 6, 101, 76, 11, 4, 90, 5, 18, 26, 106...","[0.9, 0.7, 0.4, 0.7, 0.6]","[it, has, a, great, color, to, the, body, ., t...",-1
4,"[181, 18, 26, 10, 7, 56, 913, 7, 28, 398, 7, 1...","[0.8, 0.9, 0.2, 0.3, 0.2]","[though, this, beer, is, ,, or, course, ,, not...",-1


In [32]:
test_df[0]

0    [4, 689, 1009, 14, 18, 26, 10, 20, 15, 61, 88,...
1    [15, 10, 29, 6747, 18, 2852, 466, 59, 187, 5, ...
2    [202, 10, 6, 50, 199, 311, 14, 6, 157, 235, 31...
3    [15, 61, 6, 101, 76, 11, 4, 90, 5, 18, 26, 106...
4    [181, 18, 26, 10, 7, 56, 913, 7, 28, 398, 7, 1...
5    [215, 44, 100766, 5, 585, 80, 93, 6, 30, 218, ...
6    [1192, 7911, 1082, 953, 106, 75, 2622, 1569, 5...
7    [67, 30, 46, 3700, 243, 67, 7, 14, 81, 6, 69, ...
8    [106, 36, 8820, 40, 4949, 75, 3929, 387, 3526,...
9    [344, 6, 425, 779, 8, 18, 22, 630, 15338, 18, ...
Name: 0, dtype: object

In [30]:
pd.DataFrame(np.asarray(test_df))

Unnamed: 0,0,1,2,3
0,"[4, 689, 1009, 14, 18, 26, 10, 20, 15, 61, 88,...","[0.3, 0.3, 0.5, 0.3, 0.3]","[the, main, problem, with, this, beer, is, tha...",-1
1,"[15, 10, 29, 6747, 18, 2852, 466, 59, 187, 5, ...","[0.4, 0.4, 0.6, 0.4, 0.6]","[it, is, very, unfortunate, this, situation, w...",-1
2,"[202, 10, 6, 50, 199, 311, 14, 6, 157, 235, 31...","[0.8, 0.5, 0.6, 0.3, 0.4]","[appearance, is, a, light, golden, yellow, wit...",-1
3,"[15, 61, 6, 101, 76, 11, 4, 90, 5, 18, 26, 106...","[0.9, 0.7, 0.4, 0.7, 0.6]","[it, has, a, great, color, to, the, body, ., t...",-1
4,"[181, 18, 26, 10, 7, 56, 913, 7, 28, 398, 7, 1...","[0.8, 0.9, 0.2, 0.3, 0.2]","[though, this, beer, is, ,, or, course, ,, not...",-1
5,"[215, 44, 100766, 5, 585, 80, 93, 6, 30, 218, ...","[0.8, 0.5, 0.5, 0.4, 0.8]","[best, by, 10/31/10, ., 12oz, bottle, ..., a, ...",-1
6,"[1192, 7911, 1082, 953, 106, 75, 2622, 1569, 5...","[0.6, 1.0, 0.7, 1.0, 0.8]","[vintage, 07, batch, 3, poured, into, chimay, ...",-1
7,"[67, 30, 46, 3700, 243, 67, 7, 14, 81, 6, 69, ...","[1.0, 0.8, 0.6, 0.7, 0.7]","[aroma, :, nice, grapefruity, hoppy, aroma, ,,...",-1
8,"[106, 36, 8820, 40, 4949, 75, 3929, 387, 3526,...","[1.0, 0.6, 1.0, 0.8, 0.9]","[poured, from, taps, at, abc, into, authentic,...",-1
9,"[344, 6, 425, 779, 8, 18, 22, 630, 15338, 18, ...","[0.4, 0.6, 0.4, 0.8, 0.8]","[got, a, 12, pack, of, this, for, $, 10.99, th...",-1


In [27]:
datasets['dev'][0][0]

[202,
 57,
 296,
 148,
 27,
 4,
 294,
 5,
 6,
 2143,
 600,
 1104,
 31,
 8,
 32,
 645,
 158,
 12,
 6,
 1449,
 1455,
 5,
 47,
 1164,
 27,
 4,
 31,
 7,
 6,
 510,
 8,
 224,
 1523,
 280,
 492,
 23,
 466,
 357,
 5,
 67,
 57,
 134,
 4,
 299,
 381,
 155,
 443,
 67,
 23,
 4,
 1348,
 1990,
 7,
 21,
 14,
 6,
 50,
 49629,
 97,
 893,
 5,
 43,
 57,
 47,
 237,
 597,
 63,
 7,
 37,
 47,
 133,
 123,
 12,
 977,
 7,
 343,
 12811,
 44,
 37,
 3,
 266,
 5,
 113,
 8,
 990,
 27,
 4,
 1304,
 7,
 9,
 6,
 79,
 8,
 218,
 670,
 5,
 83,
 57,
 191,
 1732,
 7,
 293,
 6,
 60,
 54,
 5,
 325,
 62,
 7,
 126,
 118,
 5,
 183,
 57,
 4,
 215,
 1990,
 1432,
 44,
 445,
 2093,
 5,
 2462,
 11,
 4,
 310,
 5,
 6,
 46,
 1065,
 1230,
 128,
 5655,
 468,
 1962,
 72,
 12,
 4,
 1621,
 3498,
 2337,
 5]

In [None]:
# Each entry of datasets['train'] is a 4-tuple:
#   (word ids, label scores, raw tokens, rationale)
# where the rationale is the placeholder -1 for train and dev samples.