In [1]:
def n_grams(tokens, n=1):
    r"""Returns an iterator over the `n`-grams of `tokens`.

    Each n-gram is the space-joined string of `n` consecutive tokens, so the
    tokens themselves must be strings.

    Args:
        tokens (iterable of str): Tokens to combine. Any iterable is accepted;
            it is materialised once so that one-shot iterators/generators
            give the same result as a list would.
        n (int,optional): N in n-grams. Values below 1 yield nothing.

    Returns:
        Iterator over the n-grams (empty when there are fewer than `n` tokens).
    """
    # Materialise once: the shifted views below each need to walk the tokens
    # from their own start offset, which a shared one-shot iterator cannot do.
    tokens = list(tokens)
    # One view per offset 0..n-1; zipping them lines up n consecutive tokens
    # and stops at the shortest view, i.e. after the last complete n-gram.
    shiftedTokens = (tokens[i:] for i in range(n))
    return (" ".join(gram) for gram in zip(*shiftedTokens))

In [2]:
# Smoke test: with the default n=1 every token comes back as its own 1-gram.
list(n_grams("test qui ok yann".split()))

['test', 'qui', 'ok', 'yann']

In [1]:
import sys
sys.path.append('..')

import torch
import numpy as np
from torch.utils.data import DataLoader

from evaluate.load.helpers import *
from evaluate.load.dataset import *
from evaluate.pipeline.model import *
from evaluate.pipeline.trainer import *
from evaluate.pipeline.helpers import *
#pip install git+https://github.com/ncullen93/torchsample.git

from torchsample.modules import ModuleTrainer

In [2]:
import argparse

In [12]:
def check_pair(parser,arg,name,types=(int,int)):
    """Validates a two-valued command line option and converts its values.

    Args:
        parser (argparse.ArgumentParser): Parser used to report invalid input
            through `parser.error` (which prints the usage and exits).
        arg (list or None): Raw values of the option. `None`, or a list whose
            first element is `None` / the string "None", disables the option.
        name (str): Option name used in the error messages.
        types (tuple,optional): Two callables converting the first and the
            second value respectively.

    Returns:
        None if the option is disabled, else `arg` itself with its two values
        converted in place (callers may rely on the list being updated).
    """
    if arg is None:
        return None
    # A leading None / "None" disables the option, whatever follows it.
    if len(arg) > 0 and (arg[0] is None or arg[0] == "None"):
        return None
    if len(arg) != 2:
        # Also covers the empty list produced by a flag given without values.
        # parser.error exits by itself, so no `raise` is needed.
        parser.error("{} has to be None or of length 2.".format(name))
    try:
        # Convert both values before touching `arg`, so that a failure never
        # leaves the list half converted.
        converted = [convert(value) for convert, value in zip(types, arg)]
    except (TypeError, ValueError):
        parser.error("{} should be of type {}".format(name,types))
    else:
        arg[:] = converted
    return arg
    

def parse_arguments(l):
    """Parses the arguments from the command line.

    Args:
        l (list of str): Command line tokens, e.g. `"-e 17 -b 32".split()`.

    Returns:
        argparse.Namespace holding every option. The pair options
        (`plateau_reduce_lr`, `ngrams_range`, `num_features_range`) are
        validated by `check_pair`: each is either None or a list of two
        converted values.
    """
    parser = argparse.ArgumentParser(description="PyTorch implementation and evaluation of HashEmbeddings, which uses multiple hashes to efficiently approximate an Embedding layer.")
    
    # Dataset options
    data = parser.add_argument_group('Dataset options')
    datasets = ['ag','amazon','dbpedia','sogou','yahoo','yelp','yelp-polarity']
    data.add_argument('-d','--dataset', help='path to training data csv', default='ag', choices=datasets)

    # Learning options
    learn = parser.add_argument_group('Learning options')
    learn.add_argument('--no-shuffle', action='store_true', default=False, help='Disables shuffling batches when training.')
    learn.add_argument('--no-checkpoint', action='store_true', default=False, help='Disables model checkpoint. I.e saving best model based on validation loss.')
    learn.add_argument('--val-loss-callback', action='store_true', default=False, help='Whether should monitor the callbacks (early stopping ? decrease LR on plateau/ ... on the loss rather than accuracy on validation set.')
    learn.add_argument('-e','--epochs', type=int, default=300, help='Maximum number of epochs to run for.')
    learn.add_argument('-b','--batch-size', type=int, default=64, help='Batch size for training.')
    learn.add_argument('-v','--validation-size', type=float, default=0.05, help='Percentage of training set to use as validation.')
    learn.add_argument('-s','--seed', type=int, default=123, help='Random seed.')
    # NOTE(review): with type=int a literal "None" cannot be passed here, although the help text mentions it.
    learn.add_argument('-p','--patience', type=int, default=10, help='Patience if early stopping. None means no early stopping.')
    learn.add_argument('-V','--verbose', type=int, default=3, help='Verbosity in [0,3].')
    # Pair options use nargs='*' and are validated/converted by check_pair after parsing.
    learn.add_argument('-P','--plateau-reduce-lr', metavar=('PATIENCE','FACTOR'), nargs='*', default=[5,0.5], help='If specified, if loss did not improve since PATIENCE epochs then multiply lr by FACTOR. [None,None] means no reducing of lr on plateau.')
    
    # Device options
    device = parser.add_argument_group('Device options')
    device.add_argument('--no-cuda', action='store_true', default=False, help='Disables CUDA training, even when have one.')
    device.add_argument('-w','--num-workers', type=int, default=0, help='Number of subprocesses used for data loading.')

    # Featurizing options
    feature = parser.add_argument_group('Featurizing options')
    feature.add_argument('--dictionnary', action='store_true', default=False, help='Uses a dictionnary.')
    feature.add_argument('-g','--ngrams-range', metavar=('MIN_NGRAM','MAX_NGRAM'), nargs='*', default=[1,9], help='Range of ngrams to generate. ngrams in [minNgram,maxNgram[.')
    feature.add_argument('-f','--num-features-range', metavar=('MIN_FATURES','MAX_FATURES'), nargs='*', default=[4,100], help='If specified, during training each phrase will have a random number of features in range [minFeatures,maxFeatures[. None if take all.')

    # Embedding options
    embedding = parser.add_argument_group('Embedding options')
    embedding.add_argument('--no-hashembed', action='store_true', default=False, help='Uses the default embedding.')
    embedding.add_argument('--append-weight', action='store_true', default=False, help='Whether to append the importance parameters.')
    embedding.add_argument('-D','--dim', type=int, default=20, help='Dimension of word vectors. Higher improves downstream task for fixed vocabulary size.')
    embedding.add_argument('-B','--num-buckets', type=int, default=10**6, help='Number of buckets in the shared embedding table. Higher improves approximation quality.')
    embedding.add_argument('-N','--num-embeding', type=int, default=10**7, help='Number of rows in the importance matrix. Approximate the number of rows in a usual embedding. Higher will increase possible vocabulary size.')
    embedding.add_argument('-H','--num-hash', type=int, default=2, help='Number of different hashes to use. Higher improves approximation quality.')

    args = parser.parse_args(l)
    args.plateau_reduce_lr = check_pair(parser,args.plateau_reduce_lr,"plateau-reduce-lr",types=(int,float))
    args.ngrams_range = check_pair(parser,args.ngrams_range,"ngrams-range")
    # The result must be stored on the namespace (not on the `feature`
    # argument group), otherwise "-f None" never reaches callers as None.
    args.num_features_range = check_pair(parser,args.num_features_range,"num-features-range")

    return args

def main(args):
    """Resolves the CUDA flag, prints the options and hands them back.

    Args:
        args (argparse.Namespace): Parsed options; must provide `no_cuda`.

    Returns:
        The same `args` object, with `args.cuda` set.
    """
    if args.no_cuda:
        # CUDA explicitly disabled: do not even query the device.
        args.cuda = False
    else:
        args.cuda = torch.cuda.is_available()
    print(vars(args))
    return args
    
def run(s=""):
    """Parses the whitespace-separated option string `s` and passes the result to `main`.

    Args:
        s (str,optional): Command line written as a single string.

    Returns:
        Whatever `main` returns for the parsed options.
    """
    argv = s.split()
    return main(parse_arguments(argv))

In [13]:
# Parse a sample command line and keep the resulting namespace: the cells
# below read their configuration from `args`.
args = run('-N 100000 -B 10000 -f 4 100 -g 1 4 -b 32 -e 17 -p 5 -P 0 0.5 -s 1')

{'dataset': 'ag', 'no_shuffle': False, 'no_checkpoint': False, 'val_loss_callback': False, 'epochs': 17, 'batch_size': 32, 'validation_size': 0.05, 'seed': 1, 'patience': 5, 'verbose': 3, 'plateau_reduce_lr': [0, 0.5], 'no_cuda': False, 'num_workers': 0, 'dictionnary': False, 'ngrams_range': [1, 4], 'num_features_range': [4, 100], 'no_hashembed': False, 'append_weight': False, 'dim': 20, 'num_buckets': 10000, 'num_embeding': 100000, 'num_hash': 2, 'cuda': False}


In [14]:
import time

In [15]:
%%time
# Seed NumPy and PyTorch from the parsed --seed option.
np.random.seed(args.seed)
torch.manual_seed(args.seed)

print("-------------------------------------------------")
print("Ran on {}".format(time.strftime("%Y-%m-%d %H:%M")))
print()

print('Parameters: {}'.format(vars(args)))
print()

# PREPARES DATA
print('Prepares data ...')
# NOTE(review): train_valid_test_datasets is a project helper (star import);
# the meaning of `specificArgs` is not visible here - confirm in its module.
train, valid, test = train_valid_test_datasets(args.dataset,
                                              validSize=args.validation_size,
                                              isHashingTrick = not args.dictionnary,
                                              nFeaturesRange = args.num_features_range,
                                              ngramRange = args.ngrams_range,
                                              seed = args.seed,
                                              num_words = args.num_embeding,
                                              specificArgs = {'dictionnary': ['num_words']})

# Must be read before `train` is rebound to a DataLoader just below.
num_classes = len(train.classes)
# From here on train/valid/test are DataLoaders, not datasets.
# --num-workers is forwarded so that the option actually takes effect.
# NOTE(review): the validation and test loaders are shuffled too, although the
# --no-shuffle help text only mentions training - confirm this is intended.
train = DataLoader(dataset=train, batch_size=args.batch_size, shuffle=not args.no_shuffle, num_workers=args.num_workers)
valid = DataLoader(dataset=valid, batch_size=args.batch_size, shuffle=not args.no_shuffle, num_workers=args.num_workers)
test = DataLoader(dataset=test, batch_size=args.batch_size, shuffle=not args.no_shuffle, num_workers=args.num_workers)

# PREPARES MODEL
print('Prepares model ...')
model = ModelNoDict(args.num_embeding,
                    args.dim,
                    num_classes,
                    isHash=not args.no_hashembed,
                    num_buckets=args.num_buckets,
                    append_weight=args.append_weight)

-------------------------------------------------
Ran on 2018-01-22 19:16

Parameters: {'dataset': 'ag', 'no_shuffle': False, 'no_checkpoint': False, 'val_loss_callback': False, 'epochs': 17, 'batch_size': 32, 'validation_size': 0.05, 'seed': 1, 'patience': 5, 'verbose': 3, 'plateau_reduce_lr': [0, 0.5], 'no_cuda': False, 'num_workers': 0, 'dictionnary': False, 'ngrams_range': [1, 4], 'num_features_range': [4, 100], 'no_hashembed': False, 'append_weight': False, 'dim': 20, 'num_buckets': 10000, 'num_embeding': 100000, 'num_hash': 2, 'cuda': False}

Prepares data ...
Prepares model ...
CPU times: user 2.19 s, sys: 31.9 ms, total: 2.23 s
Wall time: 2.23 s


In [16]:
# Wrap the model in torchsample's ModuleTrainer (imported from torchsample.modules).
trainer = ModuleTrainer(model)

In [17]:
from torchsample.modules import ModuleTrainer
from torchsample.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from torchsample.initializers import XavierUniform
from torchsample.metrics import CategoricalAccuracy

In [18]:
# Overrides the parsed --val-loss-callback flag: the callbacks built in the
# next cell will monitor "val_acc_metric" instead of "val_loss".
args.val_loss_callback = False

In [19]:
# `torch.nn` is spelled out because `nn` is never imported explicitly in this
# notebook; it would only resolve if one of the star imports happened to leak it.
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Every callback watches the same validation metric, selected by --val-loss-callback.
callbacks = []
callbackMetric = "val_loss" if args.val_loss_callback else "val_acc_metric"
if args.patience is not None:
    callbacks.append(EarlyStopping(patience=args.patience,monitor=callbackMetric))
if args.plateau_reduce_lr is not None:
    # plateau_reduce_lr is the [PATIENCE, FACTOR] pair from the command line.
    callbacks.append(ReduceLROnPlateau(factor=args.plateau_reduce_lr[1], patience=args.plateau_reduce_lr[0],monitor=callbackMetric))
if not args.no_checkpoint:
    callbacks.append(ModelCheckpoint('./', save_best_only=True, max_save=1,monitor=callbackMetric,verbose=3))

#initializers = [XavierUniform(bias=False, module_filter='fc*')]
metrics = [CategoricalAccuracy()]

trainer.compile(loss=loss,
                optimizer=optimizer,
                callbacks=callbacks,
                metrics=metrics)

In [20]:
%%time
# Train on the `train` loader for at most args.epochs epochs, passing `valid`
# as the validation loader.
# cuda_device: 0 when CUDA is enabled, otherwise -1 (presumably meaning CPU -
# confirm against torchsample).
trainer.fit_loader(train,
                   val_loader=valid,
                   num_epoch=args.epochs,
                   verbose=args.verbose,
                   cuda_device=0 if args.cuda else -1)

Epoch 1/17: : 298 batches [00:07, 28.58 batches/s, loss=0.697, val_loss=0.53, val_acc=82.40, acc=56.18, lr=[0.001]]                 
Epoch 2/17:   2%|▏         | 7/297 [00:00<00:06, 43.14 batches/s, loss=0.8274, acc=74.11]

82.4

Epoch 1: improved from -inf to 82.4000 saving model to ./ckpt.pth.tar


Epoch 2/17: : 298 batches [00:08, 36.33 batches/s, loss=0.563, val_loss=0.39, val_acc=85.60, acc=80.62, lr=[0.001]]                 
Epoch 3/17:   2%|▏         | 6/297 [00:00<00:06, 43.83 batches/s, loss=0.5405, acc=85.94]

85.6

Epoch 2: improved from 82.4000 to 85.6000 saving model to ./ckpt.pth.tar


Epoch 3/17:  52%|█████▏    | 155/297 [00:04<00:03, 36.79 batches/s, loss=0.4637, acc=87.48]


KeyboardInterrupt: 

In [12]:
# Debugging: show which metric the second registered callback monitors
# (ReduceLROnPlateau when early stopping is enabled as well).
callbacks[1].monitor

'val_loss'

In [13]:
from torchsample.callbacks import _mode_dependent_param

In [14]:
# Debugging: see what torchsample's private helper returns for 'auto' with
# 'val_loss' (presumably the mode and the monitored metric - confirm in torchsample).
_mode_dependent_param('auto','val_loss')

(<ufunc 'greater'>, -inf, 0)

In [None]:
# Sketch of how an 'auto' mode could be resolved from the monitored metric
# name: maximise metrics whose name contains 'acc', minimise everything else.
# The values mirror the _mode_dependent_param('auto','val_loss') call above.
monitor = 'val_loss'
mode = 'auto'
if mode == 'auto':
    # Assignment, not comparison: `mode == ...` would silently do nothing.
    mode = 'max' if 'acc' in monitor else 'min'
mode

In [30]:
# Debugging: apply the second callback's comparison operator to (1, 2).
callbacks[1].monitor_op(1,2)

False

In [48]:
# Evaluate the current model on the test and validation loaders; the returned
# mappings are read through their 'val_loss' and 'val_acc_metric' entries.
evalTest = trainer.evaluate_loader(test)
evalValid = trainer.evaluate_loader(valid)
print("Validation - Loss: {}, Accuracy: {}".format(evalValid['val_loss'],evalValid['val_acc_metric']))
print("Test - Loss: {}, Accuracy: {}".format(evalTest['val_loss'],evalTest['val_acc_metric']))

Validation - Loss: 0.4417393133044243, Accuracy: 86.4
Test - Loss: 0.6335211604147776, Accuracy: 84.71052631578948


In [49]:
# Reload the checkpoint file reported in the training log (./ckpt.pth.tar)
# and restore its "state_dict" entry into the model.
checkpoint = torch.load('ckpt.pth.tar')
model.load_state_dict(checkpoint["state_dict"])

In [50]:
# Re-evaluate on both loaders now that the checkpoint weights are loaded.
print("valid",trainer.evaluate_loader(valid))
print("test",trainer.evaluate_loader(test))

valid {'val_loss': 0.37180668553885293, 'val_acc_metric': 87.2}
test {'val_loss': 0.5179101362606224, 'val_acc_metric': 85.34210526315789}


In [51]:
# Rebind `trainer` to a `Trainer` around the same model; `Trainer` presumably
# comes from the project's evaluate.pipeline.trainer star import - confirm.
trainer = Trainer(model)

In [52]:
# Report the accuracy of the restored model on the test loader.
trainer.evaluate(test)

Test accuracy: 0.853421052631579


In [12]:
%%time

# Re-seed and rebuild the raw datasets: train/valid/test were rebound to
# DataLoaders earlier in the notebook.
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# Same arguments as the first dataset preparation cell.
train,valid,test = train_valid_test_datasets(args.dataset,
                                              validSize=args.validation_size,
                                              isHashingTrick = not args.dictionnary,
                                              nFeaturesRange = args.num_features_range,
                                              ngramRange = args.ngrams_range,
                                              seed = args.seed,
                                              num_words = args.num_embeding,
                                              specificArgs = {'dictionnary': ['num_words']})

CPU times: user 1.91 s, sys: 20.7 ms, total: 1.93 s
Wall time: 1.93 s


In [13]:
#train, valid = train_valid_load(train,validSize=0.1,isShuffle=True,seed=123,batch_size=batchSize)
# Only the test set is wrapped in a DataLoader here; train and valid are
# handed to the Trainer below as they are.
# --num-workers is forwarded so that the option actually takes effect.
test = DataLoader(dataset=test,batch_size=args.batch_size,shuffle=not args.no_shuffle,num_workers=args.num_workers)

In [14]:
num_classes = len(train.classes)
# Built from the same options as the first ModelNoDict in this notebook,
# including --append-weight, which this cell used to drop.
model = ModelNoDict(args.num_embeding,
                    args.dim,
                    num_classes,
                    isHash=not args.no_hashembed,
                    num_buckets=args.num_buckets,
                    append_weight=args.append_weight)
trainer = Trainer(model)
# NOTE(review): without patience the list holds a single None rather than
# being empty - confirm that Trainer accepts a None callback.
callbacks = [EarlyStopping(patience=args.patience)] if args.patience is not None else [None]

In [15]:
%%time
# Train by calling the Trainer directly with the training set, the validation
# set, the callbacks built above and the batch size / epoch count from `args`.
trainer(train,
        validDataset = valid,
        callbacks = callbacks,
        batch_size = args.batch_size,
        epochs = args.epochs)

Num parameters in model: 400092
Train on 297 samples, validate on 16 samples
Time since start: 0.1. Epoch: 0. Loss: 0.8240941166877747. Acc: 0.8.
Time since start: 0.2. Epoch: 1. Loss: 0.47239628434181213. Acc: 0.856.
Time since start: 0.4. Epoch: 2. Loss: 0.33095020055770874. Acc: 0.856.
Time since start: 0.5. Epoch: 3. Loss: 0.2885614335536957. Acc: 0.872.
Time since start: 0.6. Epoch: 4. Loss: 0.23835055530071259. Acc: 0.868.
Time since start: 0.7. Epoch: 5. Loss: 0.20833487808704376. Acc: 0.866.
Time since start: 0.9. Epoch: 6. Loss: 0.23387226462364197. Acc: 0.876.
CPU times: user 48.4 s, sys: 3.03 s, total: 51.4 s
Wall time: 51.4 s


In [16]:
# Report the accuracy on the test loader after training.
trainer.evaluate(test)

Test accuracy: 0.8522368421052632


In [None]:
%%time
# Same training call as the previous training cell.
# NOTE(review): the recorded output reports a different parameter count and
# sample count, so it presumably comes from a run with other settings.
trainer(train,
        validDataset = valid,
        callbacks = callbacks,
        batch_size = args.batch_size,
        epochs = args.epochs)

Num parameters in model: 1200412
Train on 891 samples, validate on 47 samples
Epoch: 0. Loss: 0.6332538723945618. Acc: 0.904.
Epoch: 1. Loss: 0.5259655714035034. Acc: 0.906.


KeyboardInterrupt: 

In [12]:
# Report the accuracy on the test loader for the run above.
trainer.evaluate(test)

Test accurate: 0.9014473684210527


In [17]:
from torchsample.datasets import CSVDataset

In [18]:
ls

Results.ipynb


In [19]:
# Probe torchsample's CSVDataset.
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'sys' is not defined".
CSVDataset('test.csv')

NameError: name 'sys' is not defined