In [116]:
import numpy as np
import pandas as pd
import re
import tqdm
import random
import torch
random.seed(20)

In [117]:
import torch.nn.functional as F
import torch.nn as nn
import torch.utils.data as data
import torch.autograd as autograd
import os
import pickle
import sklearn.metrics
from tqdm import tqdm_notebook as tn

In [118]:
from argparse import Namespace

#try cont_lamb=2*(3e-4) and selec_lamb=3e-4, working - sl = 0.001, cl = 2*0.001
args = Namespace(aspect='overall', batch_size=256, class_balance=False,
                 continuity_lambda=0.005, cuda=True,
                 debug_mode=False, dropout=0.2, embedding='glove', embedding_dim=300,
                 epochs=20, filter_num=100, filters=[3, 4, 5], get_rationales=True,
                 gumbel_decay=1e-05, gumbel_temprature=1, hidden_dim=100,
                 init_lr=0.002, model_form='cnn', num_class=5, num_gpus=1,
                 num_layers=1, num_workers=0, objective='cross_entropy', patience=5,
                 results_path='logs/demo_run.results', save_dir='snapshot',
                 selection_lambda=0.0005, snapshot=None, test=True,
                 train=True, tuning_metric='loss', use_as_tagger=False, weight_decay=5e-6)

In [119]:
args_dict = vars(args)

In [120]:
embeddings = np.load('embed.npy')
word_to_indx = pickle.load(open('stoi.pkl', 'rb'))

In [121]:
def get_indices_tensor(text_arr, word_to_indx, max_length):
    '''
    -text_arr: array of word tokens
    -word_to_indx: mapping of word -> index
    -max length of return tokens

    returns tensor of same size as text with each words corresponding index
    '''
    nil_indx = 0
    text_indx = [ word_to_indx[x] if x in word_to_indx else nil_indx for x in text_arr][:max_length]
    if len(text_indx) < max_length:
        text_indx.extend( [nil_indx for _ in range(max_length - len(text_indx))])

    x =  torch.LongTensor([text_indx])

    return x

In [122]:
from abc import ABCMeta

TRAIN_ONLY_ERR_MSG = "{} only supported for train dataset! Instead saw {}"

class AbstractDataset(data.Dataset):
    __metaclass__ = ABCMeta

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self,index):
        sample = self.dataset[index]
        return sample

In [123]:
class YelpDataset(AbstractDataset):

    def __init__(self, args, word_to_indx, name, df, max_length=100):
        self.args = args
        self.name = name
        self.dataset = []
        self.word_to_indx  = word_to_indx
        self.max_length = max_length
        self.class_balance = {}
        self.df = df

        if name in ['train', 'dev']:
            data = self.preprocess_data(self.df)
            random.shuffle(data)
            num_train = int(len(data)*.8)
            if name == 'train':
                data = data[:num_train]
            else:
                data = data[num_train:]
        else:
            data = self.preprocess_data(self.df)

        for indx, _sample in tn(enumerate(data)):
            sample = self.processLine(_sample)

            if not sample['y'] in self.class_balance:
                self.class_balance[ sample['y'] ] = 0
            self.class_balance[ sample['y'] ] += 1
            self.dataset.append(sample)

        print ("Class balance", self.class_balance)

    ## Convert one line from yelp dataset to {Text, Tensor, Labels}
    def processLine(self, row):
        text, label = row
        text = " ".join(text.split()[:self.max_length])
        x =  get_indices_tensor(text.split(), self.word_to_indx, self.max_length)
        sample = {'text':text,'x':x, 'y':label}
        return sample
    
    def preprocess_data(self, df):
        processed_data = []
        for indx, sample in enumerate(df.text.values):
            text, label = sample, df['y'][indx]
#             label_name = data['target_names'][label]
            text = re.sub('\W+', ' ', text).lower().strip()
            processed_data.append( (text, label) )
        return processed_data

In [124]:
trn_df, tst_df = pd.read_csv('yelp_review_full_csv/yelp_train.csv'), pd.read_csv('yelp_review_full_csv/yelp_val.csv')

In [125]:
trn_df['y'] = trn_df['y'] - 1
tst_df['y'] = tst_df['y'] - 1

In [126]:
def get_dataset(args, word_to_indx, trn_df, tst_df):
    train = YelpDataset(args, word_to_indx, 'train', trn_df)
    dev = YelpDataset(args, word_to_indx, 'dev', trn_df)
    test = YelpDataset(args, word_to_indx, 'test', tst_df)
    return train, dev, test

In [127]:
train_data, dev_data, test_data = get_dataset(args_dict, word_to_indx, trn_df, tst_df)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Class balance {1: 7963, 3: 8014, 0: 7975, 2: 8013, 4: 8035}


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Class balance {4: 2019, 0: 2014, 1: 1995, 3: 1970, 2: 2002}


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Class balance {2: 2000, 4: 2000, 1: 2000, 0: 2000, 3: 2000}


In [128]:
results_path_stem = args_dict['results_path'].split('/')[-1].split('.')[0]
args_dict['model_path'] = '{}.pt'.format(os.path.join(args_dict['save_dir'], results_path_stem))

In [129]:
args_dict['model_path']

'snapshot/demo_run.pt'

In [130]:
class CNN(nn.Module):

    def __init__(self, args, max_pool_over_time=False):
        super(CNN, self).__init__()

        self.args = args
        self.layers = []
        for layer in range(args['num_layers']):
            convs = []
            for filt in args['filters']:
                in_channels =  args['embedding_dim'] if layer == 0 else args['filter_num'] * len( args['filters'])
                kernel_size = filt
                new_conv = nn.Conv1d(in_channels=in_channels, out_channels=args['filter_num'], kernel_size=kernel_size)
                self.add_module( 'layer_'+str(layer)+'_conv_'+str(filt), new_conv)
                convs.append(new_conv)

            self.layers.append(convs)

        self.max_pool = max_pool_over_time



    def _conv(self, x):
        layer_activ = x
        for layer in self.layers:
            next_activ = []
            for conv in layer:
                left_pad = conv.kernel_size[0] - 1
                pad_tensor_size = [d for d in layer_activ.size()]
                pad_tensor_size[2] = left_pad
                left_pad_tensor = torch.zeros(pad_tensor_size)
                if self.args['cuda']:
                    left_pad_tensor = left_pad_tensor.cuda()
                padded_activ = torch.cat( (left_pad_tensor, layer_activ), dim=2)
                next_activ.append( conv(padded_activ) )

            # concat across channels
            layer_activ = F.relu( torch.cat(next_activ, 1) )

        return layer_activ


    def _pool(self, relu):
        pool = F.max_pool1d(relu, relu.size(2)).squeeze(-1)
        return pool


    def forward(self, x):
        activ = self._conv(x)
        if self.max_pool:
            activ =  self._pool(activ)
        return activ

In [131]:
def get_train_loader(train_data, args):
    train_loader = data.DataLoader(
        train_data,
        batch_size=args['batch_size'],
        shuffle=True,
        num_workers=args['num_workers'],
        drop_last=False)

    return train_loader

In [132]:
def get_dev_loader(dev_data, args):
    dev_loader = data.DataLoader(
        dev_data,
        batch_size=args['batch_size'],
        shuffle=False,
        num_workers=args['num_workers'],
        drop_last=False)
    return dev_loader

In [133]:
def get_rationales(mask, text):
    if mask is None:
        return text
    masked_text = []
    for i, t in enumerate(text):
        sample_mask = list(mask.data[i])
        original_words = t.split()
        words = [ w if m  > .5 else "_" for w,m in zip(original_words, sample_mask) ]
        masked_sample = " ".join(words)
        masked_text.append(masked_sample)
    return masked_text

In [134]:
def get_optimizer(models, args):
    '''
        -models: List of models (such as Generator, classif, memory, etc)
        -args: experiment level config

        returns: torch optimizer over models
    '''
    params = []
    for model in models:
        params.extend([param for param in model.parameters() if param.requires_grad])
    return torch.optim.Adam(params, lr=args['lr'],  weight_decay=args['weight_decay'], betas=(0.7, 0.99))

In [135]:
def get_hard_mask(z):
    '''
        -z: torch Tensor where each element probablity of element
        being selected
        -args: experiment level config

        returns: A torch variable that is binary mask of z >= .5
    '''
    max_z, ind = torch.max(z, dim=-1)
    masked = torch.ge(z, max_z.unsqueeze(-1)).float()
    del z
    return masked

In [136]:
def get_gen_path(model_path):
    '''
        -model_path: path of encoder model

        returns: path of generator
    '''
    return '{}.gen'.format(model_path)

In [137]:
def gumbel_softmax(input, temperature, cuda):
    noise = torch.rand(input.size())
    noise.add_(1e-9).log_().neg_()
    noise.add_(1e-9).log_().neg_()
    if cuda:
        noise = noise.cuda()
    x = (input + noise) / temperature
    x = F.softmax(x.view(-1,  x.size()[-1]), dim=-1)
    return x.view_as(input)

In [138]:
class Generator(nn.Module):

    def __init__(self, embeddings, args):
        super(Generator, self).__init__()
        vocab_size, hidden_dim = embeddings.shape
        self.embedding_layer = nn.Embedding( vocab_size, hidden_dim)
        self.embedding_layer.weight.data = torch.from_numpy( embeddings )
        self.embedding_layer.weight.requires_grad = False
        self.args = args
        self.cnn = CNN(args, max_pool_over_time = False)    

        self.z_dim = 2

        self.hidden = nn.Linear((len(args['filters'])* args['filter_num']), self.z_dim)
        self.dropout = nn.Dropout(args['dropout'])



    def  __z_forward(self, activ):
        '''
            Returns prob of each token being selected
        '''
        activ = activ.transpose(1,2)
        logits = self.hidden(activ)
        probs = gumbel_softmax(logits, self.args['gumbel_temprature'], self.args['cuda'])
        z = probs[:,:,1]
        return z


    def forward(self, x_indx):
        '''
            Given input x_indx of dim (batch, length), return z (batch, length) such that z
            can act as element-wise mask on x
        '''
        x = self.embedding_layer(x_indx.squeeze(1))
        if self.args['cuda']:
            x = x.cuda()
        x = torch.transpose(x, 1, 2) # Switch X to (Batch, Embed, Length)
        activ = self.cnn(x)
        
        z = self.__z_forward(F.relu(activ))
        mask = self.sample(z)
        return mask, z


    def sample(self, z):
        '''
            Get mask from probablites at each token. Use gumbel
            softmax at train time, hard mask at test time
        '''
        mask = z
        if self.training:
            mask = z
        else:
            ## pointwise set <.5 to 0 >=.5 to 1
            mask = get_hard_mask(z)
        return mask


    def loss(self, mask, x_indx):
        '''
            Compute the generator specific costs, i.e selection cost, continuity cost, and global vocab cost
        '''
#         print(mask)
#         print(mask.shape)
        selection_cost = torch.mean( torch.sum(mask, dim=1) )
        l_padded_mask =  torch.cat( [mask[:,0].unsqueeze(1), mask] , dim=1)
        r_padded_mask =  torch.cat( [mask, mask[:,-1].unsqueeze(1)] , dim=1)
        continuity_cost = torch.mean( torch.sum( torch.abs( l_padded_mask - r_padded_mask ) , dim=1) )
        return selection_cost, continuity_cost

In [139]:
class Encoder(nn.Module):

    def __init__(self, embeddings, args):
        super(Encoder, self).__init__()
        ### Encoder
        self.args = args
        vocab_size, hidden_dim = embeddings.shape
        self.embedding_dim = hidden_dim
        self.embedding_layer = nn.Embedding( vocab_size, hidden_dim)
        self.embedding_layer.weight.data = torch.from_numpy( embeddings )
        self.embedding_layer.weight.requires_grad = True
        self.embedding_fc = nn.Linear( hidden_dim, hidden_dim )
        self.embedding_bn = nn.BatchNorm1d( hidden_dim)

        self.cnn = CNN(args, max_pool_over_time=True)
        self.fc = nn.Linear( len(args['filters'])*args['filter_num'],  args['hidden_dim'])

        self.dropout = nn.Dropout(args['dropout'])
        self.hidden = nn.Linear(args['hidden_dim'], args['num_class'])

    def forward(self, x_indx, mask=None):
        '''
            x_indx:  batch of word indices
            mask: Mask to apply over embeddings for tao ratioanles
        '''
        x = self.embedding_layer(x_indx.squeeze(1))
        if self.args['cuda']:
            x = x.cuda()
        if not mask is None:
            x = x * mask.unsqueeze(-1)
        x = F.relu( self.embedding_fc(x))
        x = self.dropout(x)

        x = torch.transpose(x, 1, 2) # Switch X to (Batch, Embed, Length)
        hidden = self.cnn(x)
        hidden = F.relu( self.fc(hidden) )

        hidden = self.dropout(hidden)
        logit = self.hidden(hidden)
        return logit, hidden

In [140]:
def get_model(args, embeddings, train_data):
    gen   = Generator(embeddings, args)
    model = Encoder(embeddings, args)
    return gen, model
#     else :
#         print('\nLoading model from [%s]...' % args.snapshot)
#         try:
#             gen_path = learn.get_gen_path(args.snapshot)
#             if os.path.exists(gen_path):
#                 gen   = torch.load(gen_path)
#             model = torch.load(args.snapshot)
#         except :
#             print("Sorry, This snapshot doesn't exist."); exit()

#     if args['num_gpus'] > 1:
#         model = nn.DataParallel(model, device_ids=range(args['num_gpus']))

#         if not gen is None:
#             gen = nn.DataParallel(gen,
#                                     device_ids=range(args['num_gpus']))
#     return gen, model

In [141]:
# model
gen, model = get_model(args_dict, embeddings, train_data)

In [142]:
gen

Generator(
  (embedding_layer): Embedding(400001, 300)
  (cnn): CNN(
    (layer_0_conv_3): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (layer_0_conv_4): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (layer_0_conv_5): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (hidden): Linear(in_features=300, out_features=2, bias=True)
  (dropout): Dropout(p=0.2)
)

In [143]:
model

Encoder(
  (embedding_layer): Embedding(400001, 300)
  (embedding_fc): Linear(in_features=300, out_features=300, bias=True)
  (embedding_bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (cnn): CNN(
    (layer_0_conv_3): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (layer_0_conv_4): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (layer_0_conv_5): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (fc): Linear(in_features=300, out_features=100, bias=True)
  (dropout): Dropout(p=0.2)
  (hidden): Linear(in_features=100, out_features=5, bias=True)
)

In [144]:
def collate_epoch_stat(stat_dict, epoch_details, mode, args):
    '''
        Update stat_dict with details from epoch_details and create
        log statement

        - stat_dict: a dictionary of statistics lists to update
        - epoch_details: list of statistics for a given epoch
        - mode: train, dev or test
        - args: model run configuration

        returns:
        -stat_dict: updated stat_dict with epoch details
        -log_statement: log statement sumarizing new epoch

    '''
    log_statement_details = ''
    for metric in epoch_details:
        loss = epoch_details[metric]
        stat_dict['{}_{}'.format(mode, metric)].append(loss)

        log_statement_details += ' -{}: {}'.format(metric, loss)

    log_statement = '\n {} - {}\n--'.format(
        args['objective'], log_statement_details )

    return stat_dict, log_statement

In [145]:
def get_metrics(preds, golds):
    metrics = {}

    metrics['accuracy'] = sklearn.metrics.accuracy_score(y_true=golds, y_pred=preds)
    metrics['precision'] = sklearn.metrics.precision_score(y_true=golds, y_pred=preds, average="weighted")
    metrics['recall'] = sklearn.metrics.recall_score(y_true=golds,y_pred=preds, average="weighted")
    metrics['f1'] = sklearn.metrics.f1_score(y_true=golds,y_pred=preds, average="weighted")

    metrics['mse'] = "NA"

    return metrics

In [146]:
def init_metrics_dictionary(modes):
    '''
    Create dictionary with empty array for each metric in each mode
    '''
    epoch_stats = {}
    metrics = [
        'loss', 'obj_loss', 'k_selection_loss',
        'k_continuity_loss', 'accuracy', 'precision', 'recall', 'f1', 'mse']
    for metric in metrics:
        for mode in modes:
            key = "{}_{}".format(mode, metric)
            epoch_stats[key] = []

    return epoch_stats

In [147]:
def train_model(train_data, dev_data, model, gen, args):
    '''
    Train model and tune on dev set. If model doesn't improve dev performance within args.patience
    epochs, then halve the learning rate, restore the model to best and continue training.

    At the end of training, the function will restore the model to best dev version.

    returns epoch_stats: a dictionary of epoch level metrics for train and test
    returns model : best model from this call to train
    '''

    if args['cuda']:
        model = model.cuda()
        gen = gen.cuda()

    args['lr'] = args['init_lr']
    optimizer = get_optimizer([model, gen], args)

    num_epoch_sans_improvement = 0
    epoch_stats = init_metrics_dictionary(modes=['train', 'dev'])
    step = 0
    tuning_key = "dev_{}".format(args['tuning_metric'])

    train_loader = get_train_loader(train_data, args)
    dev_loader = get_dev_loader(dev_data, args)

    for epoch in range(1, args['epochs'] + 1):

        print("-------------\nEpoch {}:\n".format(epoch))
        for mode, dataset, loader in [('Train', train_data, train_loader), ('Dev', dev_data, dev_loader)]:
            train_model = mode == 'Train'
            print('{}'.format(mode))
            key_prefix = mode.lower()
            epoch_details, step, _, _, _, _ = run_epoch(
                data_loader=loader,
                train_model=train_model,
                model=model,
                gen=gen,
                optimizer=optimizer,
                step=step,
                args=args)

            epoch_stats, log_statement = collate_epoch_stat(epoch_stats, epoch_details, key_prefix, args)

            # Log  performance
            print(log_statement)


        # Save model if beats best dev
        best_func = min if args['tuning_metric'] == 'loss' else max
        if best_func(epoch_stats[tuning_key]) == epoch_stats[tuning_key][-1]:
            num_epoch_sans_improvement = 0
            if not os.path.isdir(args['save_dir']):
                os.makedirs(args['save_dir'])
            # Subtract one because epoch is 1-indexed and arr is 0-indexed
            epoch_stats['best_epoch'] = epoch - 1
            torch.save(model, args['model_path'])
            torch.save(gen, get_gen_path(args['model_path']))
        else:
            num_epoch_sans_improvement += 1

        if not train_model:
            print('---- Best Dev {} is {:.4f} at epoch {}'.format(
                args['tuning_metric'],
                epoch_stats[tuning_key][epoch_stats['best_epoch']],
                epoch_stats['best_epoch'] + 1))

        if num_epoch_sans_improvement >= args['patience']:
            print("Reducing learning rate")
            num_epoch_sans_improvement = 0
            model.cpu()
            gen.cpu()
            model = torch.load(args['model_path'])
            gen = torch.load(get_gen_path(args['model_path']))

            if args['cuda']:
                model = model.cuda()
                gen   = gen.cuda()
            args['lr'] *= .5
            optimizer = get_optimizer([model, gen], args)

    return epoch_stats, model, gen

In [148]:
def test_model(test_data, model, gen, args):
    '''
    Run model on test data, and return loss, accuracy.
    '''
    if args['cuda']:
        model = model.cuda()
        gen = gen.cuda()

    test_loader = torch.utils.data.DataLoader(
        test_data,
        batch_size=args['batch_size'],
        shuffle=False,
        num_workers=args['num_workers'],
        drop_last=False)

    test_stats = init_metrics_dictionary(modes=['test'])

    mode = 'Test'
    train_model = False
    key_prefix = mode.lower()
    print("-------------\nTest")
    epoch_details, _, losses, preds, golds, rationales = run_epoch(
        data_loader=test_loader,
        train_model=train_model,
        model=model,
        gen=gen,
        optimizer=None,
        step=None,
        args=args)

    test_stats, log_statement = collate_epoch_stat(test_stats, epoch_details, 'test', args)
    test_stats['losses'] = losses
    test_stats['preds'] = preds
    test_stats['golds'] = golds
    test_stats['rationales'] = rationales

    print(log_statement)

    return test_stats

In [149]:
def run_epoch(data_loader, train_model, model, gen, optimizer, step, args):
    '''
    Train model for one pass of train data, and return loss, acccuracy
    '''
    eval_model = not train_model
    data_iter = data_loader.__iter__()

    losses = []
    obj_losses = []
    k_selection_losses = []
    k_continuity_losses = []
    preds = []
    golds = []
    losses = []
    texts = []
    rationales = []

    if train_model:
        model.train()
        gen.train()
    else:
        gen.eval()
        model.eval()

    num_batches_per_epoch = len(data_iter)
    if train_model:
        num_batches_per_epoch = min(len(data_iter), 10000)

    for _ in tn(range(num_batches_per_epoch)):
        batch = data_iter.next()
        if train_model:
            step += 1
            if  step % 100 == 0 or args['debug_mode']:
                args['gumbel_temprature'] = max( np.exp((step+1) *-1* args['gumbel_decay']), .05)

        x_indx = batch['x']
        text = batch['text']
        y = batch['y']

        if args['cuda']:
            x_indx, y = x_indx.cuda(), y.cuda()

        if train_model:
            optimizer.zero_grad()

        mask, z = gen(x_indx)

        logit, _ = model(x_indx, mask=mask)

        loss = get_loss(logit, y)
        obj_loss = loss.item()
        
#         print('mask in run_epoch', mask)

        selection_cost, continuity_cost = gen.loss(mask, x_indx)
    
    
#         print('check loss')
#         print(loss)
        loss += args['selection_lambda'] * selection_cost
        loss += args['continuity_lambda'] * continuity_cost
#         print(loss)
#         print('is it different')
#         print(train_model)
#         print(loss)
#         print(loss.item())
#         print(obj_loss)

        if train_model:
            loss.backward()
            optimizer.step()
            
#         print(obj_loss,'\n',selection_cost.item(),'\n',args['selection_lambda']*selection_cost.item(),'\n',
#              loss.item())

        k_selection_losses.append( selection_cost.item() )
        k_continuity_losses.append( continuity_cost.item() )

        obj_losses.append(obj_loss)
        losses.append( loss.item() )
        batch_softmax = F.softmax(logit, dim=-1).cpu()
        preds.extend(torch.max(batch_softmax, 1)[1].view(y.size()).data.numpy())

        texts.extend(text)
        rationales.extend(get_rationales(mask, text))

        golds.extend(batch['y'].numpy())


    epoch_metrics = get_metrics(preds, golds)

    epoch_stat = {
        'loss' : np.mean(losses),
        'obj_loss': np.mean(obj_losses)
    }

    for metric_k in epoch_metrics.keys():
        epoch_stat[metric_k] = epoch_metrics[metric_k]

    epoch_stat['k_selection_loss'] = np.mean(k_selection_losses)
    epoch_stat['k_continuity_loss'] = np.mean(k_continuity_losses)

    return epoch_stat, step, losses, preds, golds, rationales

In [150]:
def get_loss(logit,y):
    loss = F.cross_entropy(logit, y)
    return loss

In [151]:
epoch_stats, model, gen = train_model(train_data, dev_data, model, gen, args_dict)

-------------
Epoch 1:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 1.4733342053783927 -obj_loss: 1.4338079850385144 -accuracy: 0.337525 -precision: 0.3368757121459717 -recall: 0.337525 -f1: 0.330761923920703 -mse: NA -k_selection_loss: 6.541971950773981 -k_continuity_loss: 7.251047477600681
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.4102115839719773 -obj_loss: 1.399069482088089 -accuracy: 0.4 -precision: 0.38262315103378175 -recall: 0.4 -f1: 0.3738498887971107 -mse: NA -k_selection_loss: 1.1123046875 -k_continuity_loss: 2.1171875
--


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


---- Best Dev loss is 1.4102 at epoch 1
-------------
Epoch 2:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 1.2515585301028695 -obj_loss: 1.2052943114262478 -accuracy: 0.475625 -precision: 0.4658971576909247 -recall: 0.475625 -f1: 0.4674891629788807 -mse: NA -k_selection_loss: 6.889938545834487 -k_continuity_loss: 8.563850293493575
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.427430459856987 -obj_loss: 1.4165437370538712 -accuracy: 0.3958 -precision: 0.37877256134352266 -recall: 0.3958 -f1: 0.3645322227875685 -mse: NA -k_selection_loss: 1.0869140625 -k_continuity_loss: 2.06865234375
--
---- Best Dev loss is 1.4102 at epoch 1
-------------
Epoch 3:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 1.182549819824802 -obj_loss: 1.129928806025511 -accuracy: 0.5122 -precision: 0.5038241688171345 -recall: 0.5122 -f1: 0.5058378612088927 -mse: NA -k_selection_loss: 8.469334735991849 -k_continuity_loss: 9.67726825301055
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3603055328130722 -obj_loss: 1.3471346825361252 -accuracy: 0.4223 -precision: 0.41066485239820094 -recall: 0.4223 -f1: 0.38033915111817274 -mse: NA -k_selection_loss: 1.45009765625 -k_continuity_loss: 2.48916015625
--
---- Best Dev loss is 1.3603 at epoch 3
-------------
Epoch 4:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 1.1190289243771012 -obj_loss: 1.065672202474752 -accuracy: 0.538625 -precision: 0.5315447621888731 -recall: 0.538625 -f1: 0.5336000819293313 -mse: NA -k_selection_loss: 8.840664274373632 -k_continuity_loss: 9.787278788864233
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.343937376141548 -obj_loss: 1.3295686185359954 -accuracy: 0.4297 -precision: 0.41801134679387253 -recall: 0.4297 -f1: 0.4091396675407832 -mse: NA -k_selection_loss: 1.6701171875 -k_continuity_loss: 2.70673828125
--
---- Best Dev loss is 1.3439 at epoch 4
-------------
Epoch 5:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 1.0573324557322605 -obj_loss: 1.0049554238653486 -accuracy: 0.5715 -precision: 0.5660816855444054 -recall: 0.5715 -f1: 0.567850771043494 -mse: NA -k_selection_loss: 8.800179584770445 -k_continuity_loss: 9.595388108757652
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3431922376155854 -obj_loss: 1.3271587431430816 -accuracy: 0.4383 -precision: 0.42997886024660475 -recall: 0.4383 -f1: 0.40813170085941364 -mse: NA -k_selection_loss: 1.956640625 -k_continuity_loss: 3.01103515625
--
---- Best Dev loss is 1.3432 at epoch 5
-------------
Epoch 6:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.9954919587275025 -obj_loss: 0.9440665890456764 -accuracy: 0.6028 -precision: 0.5987861538289272 -recall: 0.6028 -f1: 0.600281270147063 -mse: NA -k_selection_loss: 8.59940879967562 -k_continuity_loss: 9.42513253886229
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3035925954580307 -obj_loss: 1.287042850255966 -accuracy: 0.4553 -precision: 0.44174598444777846 -recall: 0.4553 -f1: 0.4376743300905228 -mse: NA -k_selection_loss: 2.02431640625 -k_continuity_loss: 3.10751953125
--
---- Best Dev loss is 1.3036 at epoch 6
-------------
Epoch 7:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.9325506979492819 -obj_loss: 0.881525541187092 -accuracy: 0.633675 -precision: 0.6311275960022406 -recall: 0.633675 -f1: 0.6321438155225793 -mse: NA -k_selection_loss: 8.617305710057543 -k_continuity_loss: 9.34330167284437
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3146589428186417 -obj_loss: 1.2965563982725143 -accuracy: 0.4623 -precision: 0.45860090648088936 -recall: 0.4623 -f1: 0.45034075575490207 -mse: NA -k_selection_loss: 2.3759765625 -k_continuity_loss: 3.38291015625
--
---- Best Dev loss is 1.3036 at epoch 6
-------------
Epoch 8:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.8780248806734753 -obj_loss: 0.8265505201497655 -accuracy: 0.659175 -precision: 0.6570368215429787 -recall: 0.659175 -f1: 0.65788275351558 -mse: NA -k_selection_loss: 8.806262074002795 -k_continuity_loss: 9.414246152161033
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3647727757692336 -obj_loss: 1.3473452270030974 -accuracy: 0.4577 -precision: 0.4479420294962227 -recall: 0.4577 -f1: 0.4378129617823733 -mse: NA -k_selection_loss: 2.2779296875 -k_continuity_loss: 3.25771484375
--
---- Best Dev loss is 1.3036 at epoch 6
-------------
Epoch 9:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.8187422619503775 -obj_loss: 0.7671925281263461 -accuracy: 0.68415 -precision: 0.6827777889396012 -recall: 0.68415 -f1: 0.6833347392794085 -mse: NA -k_selection_loss: 8.854461672959054 -k_continuity_loss: 9.42450105460586
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3022212237119675 -obj_loss: 1.2817735224962234 -accuracy: 0.4894 -precision: 0.48241222234753134 -recall: 0.4894 -f1: 0.4749909443931976 -mse: NA -k_selection_loss: 2.85341796875 -k_continuity_loss: 3.80419921875
--
---- Best Dev loss is 1.3022 at epoch 9
-------------
Epoch 10:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.770709046133005 -obj_loss: 0.7184112580718508 -accuracy: 0.706 -precision: 0.7046191073960522 -recall: 0.706 -f1: 0.7051488874740459 -mse: NA -k_selection_loss: 9.021989682677445 -k_continuity_loss: 9.557358674942309
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.320826095342636 -obj_loss: 1.3011505603790283 -accuracy: 0.4852 -precision: 0.4767757145839103 -recall: 0.4852 -f1: 0.47203448575351775 -mse: NA -k_selection_loss: 2.72412109375 -k_continuity_loss: 3.6626953125
--
---- Best Dev loss is 1.3022 at epoch 9
-------------
Epoch 11:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.7268949693934933 -obj_loss: 0.6747489404526485 -accuracy: 0.724025 -precision: 0.7229161158594881 -recall: 0.724025 -f1: 0.7233149362178742 -mse: NA -k_selection_loss: 8.978279280814396 -k_continuity_loss: 9.531377853101985
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3716538906097413 -obj_loss: 1.352220633625984 -accuracy: 0.4837 -precision: 0.4773133811806598 -recall: 0.4837 -f1: 0.4625689630060535 -mse: NA -k_selection_loss: 2.67509765625 -k_continuity_loss: 3.619140625
--
---- Best Dev loss is 1.3022 at epoch 9
-------------
Epoch 12:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.6889667419870947 -obj_loss: 0.6372079775211917 -accuracy: 0.739425 -precision: 0.7382837988810009 -recall: 0.739425 -f1: 0.738688144417694 -mse: NA -k_selection_loss: 9.019929803860416 -k_continuity_loss: 9.449760485606589
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3017937004566194 -obj_loss: 1.2794080674648285 -accuracy: 0.5121 -precision: 0.5049631357339824 -recall: 0.5121 -f1: 0.5041218472379583 -mse: NA -k_selection_loss: 3.3220703125 -k_continuity_loss: 4.144921875
--
---- Best Dev loss is 1.3018 at epoch 12
-------------
Epoch 13:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.6537174077550317 -obj_loss: 0.6019744983144627 -accuracy: 0.75565 -precision: 0.7550125709328027 -recall: 0.75565 -f1: 0.7552458074881585 -mse: NA -k_selection_loss: 9.042353447835158 -k_continuity_loss: 9.444346822750797
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.336856210231781 -obj_loss: 1.3126942962408066 -accuracy: 0.5249 -precision: 0.5239784753953408 -recall: 0.5249 -f1: 0.5072094049114627 -mse: NA -k_selection_loss: 3.5533203125 -k_continuity_loss: 4.47705078125
--
---- Best Dev loss is 1.3018 at epoch 12
-------------
Epoch 14:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.6121916431150619 -obj_loss: 0.5603353569082393 -accuracy: 0.771375 -precision: 0.7707039188344071 -recall: 0.771375 -f1: 0.7708887573332898 -mse: NA -k_selection_loss: 9.065742146437335 -k_continuity_loss: 9.464683022468712
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.2930368021130563 -obj_loss: 1.2682506799697877 -accuracy: 0.5375 -precision: 0.530828182915489 -recall: 0.5375 -f1: 0.528781080720964 -mse: NA -k_selection_loss: 3.7509765625 -k_continuity_loss: 4.58212890625
--
---- Best Dev loss is 1.2930 at epoch 14
-------------
Epoch 15:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.5813796258276436 -obj_loss: 0.5288666646192028 -accuracy: 0.7854 -precision: 0.78477819849882 -recall: 0.7854 -f1: 0.7849771869972656 -mse: NA -k_selection_loss: 9.188118558021108 -k_continuity_loss: 9.583780792868062
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3360157579183578 -obj_loss: 1.3118525624275208 -accuracy: 0.5351 -precision: 0.5304313186246798 -recall: 0.5351 -f1: 0.5290534473580111 -mse: NA -k_selection_loss: 3.5353515625 -k_continuity_loss: 4.4791015625
--
---- Best Dev loss is 1.2930 at epoch 14
-------------
Epoch 16:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.5583837846661829 -obj_loss: 0.505941416427588 -accuracy: 0.796775 -precision: 0.7963834559995233 -recall: 0.796775 -f1: 0.7964850457040819 -mse: NA -k_selection_loss: 9.147358122904588 -k_continuity_loss: 9.57373795843428
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3046487987041473 -obj_loss: 1.2794938296079637 -accuracy: 0.5384 -precision: 0.533728976039814 -recall: 0.5384 -f1: 0.5320842095025089 -mse: NA -k_selection_loss: 3.8431640625 -k_continuity_loss: 4.6466796875
--
---- Best Dev loss is 1.2930 at epoch 14
-------------
Epoch 17:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.5276646156599567 -obj_loss: 0.4755971179266644 -accuracy: 0.80905 -precision: 0.8086851777086328 -recall: 0.80905 -f1: 0.8087581615008012 -mse: NA -k_selection_loss: 9.180311002549093 -k_continuity_loss: 9.495468242912535
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.2886425256729126 -obj_loss: 1.2604636013507844 -accuracy: 0.5736 -precision: 0.5718158848577225 -recall: 0.5736 -f1: 0.5647967036235694 -mse: NA -k_selection_loss: 4.2865234375 -k_continuity_loss: 5.20712890625
--
---- Best Dev loss is 1.2886 at epoch 17
-------------
Epoch 18:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.5001913174322457 -obj_loss: 0.44764456532563374 -accuracy: 0.820075 -precision: 0.8198530944552743 -recall: 0.820075 -f1: 0.8198399598547816 -mse: NA -k_selection_loss: 9.26678012434844 -k_continuity_loss: 9.582672781245723
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.4067144215106964 -obj_loss: 1.380179750919342 -accuracy: 0.5551 -precision: 0.5499673077724964 -recall: 0.5551 -f1: 0.5474458976363168 -mse: NA -k_selection_loss: 4.0419921875 -k_continuity_loss: 4.902734375
--
---- Best Dev loss is 1.2886 at epoch 17
-------------
Epoch 19:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.4867874702830223 -obj_loss: 0.4338845863084125 -accuracy: 0.830425 -precision: 0.830176141628456 -recall: 0.830425 -f1: 0.830193111061886 -mse: NA -k_selection_loss: 9.297609213810818 -k_continuity_loss: 9.650815696473334
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3979893565177917 -obj_loss: 1.3712874472141265 -accuracy: 0.5554 -precision: 0.5544352143746394 -recall: 0.5554 -f1: 0.543731403833777 -mse: NA -k_selection_loss: 4.08837890625 -k_continuity_loss: 4.93154296875
--
---- Best Dev loss is 1.2886 at epoch 17
-------------
Epoch 20:

Train


HBox(children=(IntProgress(value=0, max=157), HTML(value='')))



 cross_entropy -  -loss: 0.46808143273280683 -obj_loss: 0.4150926058839081 -accuracy: 0.836675 -precision: 0.8364134764379801 -recall: 0.836675 -f1: 0.8364744974413115 -mse: NA -k_selection_loss: 9.30441295720969 -k_continuity_loss: 9.667324126905696
--
Dev


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))



 cross_entropy -  -loss: 1.3142448127269746 -obj_loss: 1.2844706356525422 -accuracy: 0.5909 -precision: 0.5945970391032714 -recall: 0.5909 -f1: 0.5831586922369371 -mse: NA -k_selection_loss: 4.67041015625 -k_continuity_loss: 5.48779296875
--
---- Best Dev loss is 1.2886 at epoch 17


In [None]:
args_dict['selection_lambda']=0.04
args_dict['continuity_lambda']=3 * args_dict['selection_lambda']
args_dict['batch_size'] = 256

In [None]:
epoch_stats, model, gen = train_model(train_data, dev_data, model, gen, args_dict)

In [94]:
# Restore model to best dev performance
if os.path.exists(args_dict['model_path']):
    model.cpu()
    model = torch.load(args_dict['model_path'])
    gen.cpu()
    gen = torch.load(get_gen_path(args_dict['model_path']))

In [152]:
args_dict['epoch_stats'] = epoch_stats
save_path = args_dict['results_path']
print("Save train/dev results to", save_path)

Save train/dev results to logs/demo_run.results


In [153]:
pickle.dump(args_dict, open(save_path,'wb') )

In [154]:
model = model.cuda()
gen = gen.cuda()

In [155]:
args_dict['batch_size'] = 2

In [156]:
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=args_dict['batch_size'],
    shuffle=False,
    num_workers=args_dict['num_workers'],
    drop_last=False)

In [157]:
eval_model = True
# train_model = False
key_prefix = 'test'
data_iter = test_loader.__iter__()

In [158]:
gen.eval()
model.eval()

Encoder(
  (embedding_layer): Embedding(400001, 300)
  (embedding_fc): Linear(in_features=300, out_features=300, bias=True)
  (embedding_bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (cnn): CNN(
    (layer_0_conv_3): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (layer_0_conv_4): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (layer_0_conv_5): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (fc): Linear(in_features=300, out_features=100, bias=True)
  (dropout): Dropout(p=0.2)
  (hidden): Linear(in_features=100, out_features=5, bias=True)
)

In [249]:
b = data_iter.next()

In [250]:
x_indx, y, text = b['x'].cuda(), b['y'].cuda(), b['text']

In [251]:
mask, z = gen(x_indx)

In [252]:
get_rationales(mask, text)

['_ _ incredibly impressed with _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ excellent service was great and _ _ _ awesome even _ _ _ _ _ _ somewhat questionable _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ awesome you _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ awesome very knowledgable _ _ _ _ _ _ _',
 '_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ rude gestures _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _']

In [253]:
text

['we were incredibly impressed with our first embarrassed to admit experience at binks we have been talking about going for a long time and finally tried them for sunday brunch the food was excellent service was great and the setting is awesome even though binks is located in a somewhat questionable neighborhood and this is coming from a central phoenix resident on a semi busy street the patio is awesome you don t really notice the surroundings even while playing a game of croquet which we did the server was awesome very knowledgable and even got a chef to open',
 'i came to place an order for a birthday cake the open sign was on so i proceed to open the door the few people that were inside started making some gestures i didn t understand then a man came closer and with very rude gestures pointed a some small later that said closed on wednesday nit would only take him a second to open the door and say we are close today i apologize for the inconvenience ni simply drove two blocks down 

In [254]:
logit, _ = model(x_indx, mask=mask)

In [255]:
batch_softmax = F.softmax(logit, dim=-1).cpu()

In [256]:
torch.max(batch_softmax, 1)[1].view(y.size()).data.numpy()

array([4, 1])

In [257]:
b['y']

tensor([4, 0])

In [72]:
get_rationales(mask, text)

['_ _ is amazing me _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ unbelievably horrible _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _',
 '_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ always glad _ _ _ _ _ _ excellent instructor _ upbeat _ _ _ _ _ _ _ encouragement _ _ _ _ _ _ _ _ pretty neat and super easy _ _ _ _ _ _ _ _ _ _ _ _ _ _ _']

In [73]:
text

['the food is amazing me and my wife have been clients since we where children recently the service has been unbelievably horrible last night we visited the restaurant and there was no one at the front door to sit us down we waited with another 3 parties for like 8 minutes after going into the back of the restaurant all the employees where sitting behind the bar counting their tips it only went down hill from there i had to',
 'as a novice to pilates i was a bit intimidated by the reformer accordingly my first few classes were brutal but in the weeks since i ve become much stronger and am always glad when i attend christina is an excellent instructor friendly upbeat and quick to make corrections and provide encouragement n nthe online scheduling system at imx is pretty neat and super easy to use class sizes are small however i almost always get into the class i']