From b5190a5dc227273f55e0a62d52ded4d82e16f9d9 Mon Sep 17 00:00:00 2001 From: Achyudh Ram Date: Tue, 19 Mar 2019 16:25:33 -0400 Subject: [PATCH 1/7] Fix project title in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1623169..e7048f6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Castor +# Hedwig This repo contains PyTorch deep learning models for document classification, implemented by the Data Systems Group at the University of Waterloo. From 5629414bd7ed8101b45461d3fd0e3394c1c85962 Mon Sep 17 00:00:00 2001 From: Achyudh Ram Date: Tue, 19 Mar 2019 23:08:39 -0400 Subject: [PATCH 2/7] Remove conv_rnn model --- models/conv_rnn/README.md | 23 ----- models/conv_rnn/__init__.py | 0 models/conv_rnn/data.py | 59 ------------- models/conv_rnn/getData.sh | 7 -- models/conv_rnn/model.py | 133 ----------------------------- models/conv_rnn/test.py | 36 -------- models/conv_rnn/train.py | 166 ------------------------------------ 7 files changed, 424 deletions(-) delete mode 100644 models/conv_rnn/README.md delete mode 100644 models/conv_rnn/__init__.py delete mode 100644 models/conv_rnn/data.py delete mode 100755 models/conv_rnn/getData.sh delete mode 100644 models/conv_rnn/model.py delete mode 100644 models/conv_rnn/test.py delete mode 100644 models/conv_rnn/train.py diff --git a/models/conv_rnn/README.md b/models/conv_rnn/README.md deleted file mode 100644 index b86a92a..0000000 --- a/models/conv_rnn/README.md +++ /dev/null @@ -1,23 +0,0 @@ -## Convolutional RNN - -Implementation based on [[1]](http://dl.acm.org/citation.cfm?id=3098140). - -### Usage - -Run `./getData.sh` to fetch the data. The project structure should now look like this: - -``` -├── conv_rnn/ -│ ├── data/ -│ ├── saves/ -│ └── *.* -``` -You may then run `python train.py` and `python test.py` for training and testing, respectively. For more options, add the `-h` switch. - -### Empirical results -Best dev | Test --- | -- -48.1 | 48.9 - -### References -[1] Chenglong Wang, Feijun Jiang, and Hongxia Yang. 2017. A Hybrid Framework for Text Modeling with Convolutional RNN. In Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD '17). diff --git a/models/conv_rnn/__init__.py b/models/conv_rnn/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/models/conv_rnn/data.py b/models/conv_rnn/data.py deleted file mode 100644 index ef33ddd..0000000 --- a/models/conv_rnn/data.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import re - -import numpy as np -import torch.utils.data as data - - -def sst_tokenize(sentence): - return sentence.split() - - -class SSTEmbeddingLoader(object): - - def __init__(self, dirname, fmt="stsa.fine.{}", word2vec_file="word2vec.sst-1"): - self.dirname = dirname - self.fmt = fmt - self.word2vec_file = word2vec_file - - def load_embed_data(self): - weights = [] - id_dict = {} - unk_vocab_set = set() - with open(os.path.join(self.dirname, self.word2vec_file)) as f: - for i, line in enumerate(f.readlines()): - word, vec = line.replace("\n", "").split(" ", 1) - vec = np.array([float(v) for v in vec.split(" ")]) - weights.append(vec) - id_dict[word] = i - with open(os.path.join(self.dirname, self.fmt.format("phrases.train"))) as f: - for line in f.readlines(): - for word in sst_tokenize(line): - if word not in id_dict and word not in unk_vocab_set: - unk_vocab_set.add(word) - return (id_dict, np.array(weights), list(unk_vocab_set)) - - -class SSTDataset(data.Dataset): - - def __init__(self, sentences): - super().__init__() - self.sentences = sentences - - def __len__(self): - return len(self.sentences) - - def __getitem__(self, index): - return self.sentences[index] - - @classmethod - def load_sst_sets(cls, dirname, fmt="stsa.fine.{}"): - set_names = ["phrases.train", "dev", "test"] - def read_set(name): - data_set = [] - with open(os.path.join(dirname, fmt.format(name))) as f: - for line in f.readlines(): - sentiment, sentence = line.replace("\n", "").split(" ", 1) - data_set.append((sentiment, sentence)) - return np.array(data_set) - return [cls(read_set(name)) for name in set_names] diff --git a/models/conv_rnn/getData.sh b/models/conv_rnn/getData.sh deleted file mode 100755 index 021afc1..0000000 --- a/models/conv_rnn/getData.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh -mkdir -p data -mkdir -p saves -wget http://ocp59jkku.bkt.clouddn.com/sst-1.zip -P data/ -wget http://ocp59jkku.bkt.clouddn.com/sst-2.zip -P data/ -unzip data/sst-1.zip -d data/ -unzip data/sst-2.zip -d data/ diff --git a/models/conv_rnn/model.py b/models/conv_rnn/model.py deleted file mode 100644 index c4bccea..0000000 --- a/models/conv_rnn/model.py +++ /dev/null @@ -1,133 +0,0 @@ -import random - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.nn.utils.rnn as rnn_utils - -import data - - -class ConvRNNModel(nn.Module): - - def __init__(self, word_model, **config): - super().__init__() - embedding_dim = word_model.dim - self.word_model = word_model - self.hidden_size = config["hidden_size"] - fc_size = config["fc_size"] - self.batch_size = config["mbatch_size"] - n_fmaps = config["n_feature_maps"] - self.rnn_type = config["rnn_type"] - self.no_cuda = config["no_cuda"] - - if self.rnn_type.upper() == "LSTM": - self.bi_rnn = nn.LSTM(embedding_dim, self.hidden_size, 1, batch_first=True, bidirectional=True) - elif self.rnn_type.upper() == "GRU": - self.bi_rnn = nn.GRU(embedding_dim, self.hidden_size, 1, batch_first=True, bidirectional=True) - else: - raise ValueError("RNN type must be one of LSTM or GRU") - self.conv = nn.Conv2d(1, n_fmaps, (1, self.hidden_size * 2)) - self.fc1 = nn.Linear(n_fmaps + 2 * self.hidden_size, fc_size) - self.fc2 = nn.Linear(fc_size, config["n_labels"]) - - def convert_dataset(self, dataset): - dataset = np.stack(dataset) - model_in = dataset[:, 1].reshape(-1) - model_out = dataset[:, 0].flatten().astype(np.int) - model_out = torch.from_numpy(model_out) - indices, lengths = self.preprocess(model_in) - if not self.no_cuda: - model_out = model_out.cuda() - indices = indices.cuda() - lengths = lengths.cuda() - lengths, sort_idx = torch.sort(lengths, descending=True) - indices = indices[sort_idx] - model_out = model_out[sort_idx] - return ((indices, lengths), model_out) - - def preprocess(self, sentences): - indices, lengths = self.word_model.lookup(sentences) - return torch.LongTensor(indices), torch.LongTensor(lengths) - - def forward(self, x, lengths): - x = self.word_model(x) - x = rnn_utils.pack_padded_sequence(x, lengths, batch_first=True) - rnn_seq, rnn_out = self.bi_rnn(x) - if self.rnn_type.upper() == "LSTM": - rnn_out = rnn_out[0] - - rnn_seq, _ = rnn_utils.pad_packed_sequence(rnn_seq, batch_first=True) - rnn_out.data = rnn_out.data.permute(1, 0, 2) - x = self.conv(rnn_seq.unsqueeze(1)).squeeze(3) - x = F.relu(x) - x = F.max_pool1d(x, x.size(2)) - out = [t.squeeze(1) for t in rnn_out.chunk(2, 1)] - out.append(x.squeeze(-1)) - x = torch.cat(out, 1) - x = F.relu(self.fc1(x)) - return self.fc2(x) - - -class WordEmbeddingModel(nn.Module): - def __init__(self, id_dict, weights, unknown_vocab=[], static=True, padding_idx=0): - super().__init__() - vocab_size = len(id_dict) + len(unknown_vocab) - self.lookup_table = id_dict - last_id = max(id_dict.values()) - for word in unknown_vocab: - last_id += 1 - self.lookup_table[word] = last_id - self.dim = weights.shape[1] - self.weights = np.concatenate((weights, np.random.rand(len(unknown_vocab), self.dim) / 2 - 0.25)) - self.padding_idx = padding_idx - self.embedding = nn.Embedding(vocab_size, self.dim, padding_idx=padding_idx) - self.embedding.weight.data.copy_(torch.from_numpy(self.weights)) - if static: - self.embedding.weight.requires_grad = False - - @classmethod - def make_random_model(cls, id_dict, unknown_vocab=[], dim=300): - weights = np.random.rand(len(id_dict), dim) - 0.5 - return cls(id_dict, weights, unknown_vocab, static=False) - - def forward(self, x): - return self.embedding(x) - - def lookup(self, sentences): - raise NotImplementedError - - -class SSTWordEmbeddingModel(WordEmbeddingModel): - - def __init__(self, id_dict, weights, unknown_vocab=[]): - super().__init__(id_dict, weights, unknown_vocab, padding_idx=16259) - - def lookup(self, sentences): - indices_list = [] - max_len = 0 - for sentence in sentences: - indices = [] - for word in data.sst_tokenize(sentence): - try: - index = self.lookup_table[word] - indices.append(index) - except KeyError: - continue - indices_list.append(indices) - if len(indices) > max_len: - max_len = len(indices) - lengths = [len(x) for x in indices_list] - for indices in indices_list: - indices.extend([self.padding_idx] * (max_len - len(indices))) - return indices_list, lengths - - -def set_seed(seed=0, no_cuda=False): - np.random.seed(seed) - if not no_cuda: - torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - torch.manual_seed(seed) - random.seed(seed) diff --git a/models/conv_rnn/test.py b/models/conv_rnn/test.py deleted file mode 100644 index 1fbb7a7..0000000 --- a/models/conv_rnn/test.py +++ /dev/null @@ -1,36 +0,0 @@ -import argparse -import os -import random - -import numpy as np -import torch -import torch.nn as nn -import torch.utils as utils - -import data -import model - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--no_cuda", action="store_true", default=False) - parser.add_argument("--input_file", default="saves/model.pt", type=str) - parser.add_argument("--data_dir", default="data", type=str) - parser.add_argument("--gpu_number", default=0, type=int) - args = parser.parse_args() - - conv_rnn = torch.load(args.input_file) - if not args.no_cuda: - torch.cuda.set_device(args.gpu_number) - conv_rnn.cuda() - _, _, test_set = data.SSTDataset.load_sst_sets("data") - test_loader = utils.data.DataLoader(test_set, batch_size=len(test_set), collate_fn=conv_rnn.convert_dataset) - - conv_rnn.eval() - for test_in, test_out in test_loader: - scores = conv_rnn(*test_in) - n_correct = (torch.max(scores, 1)[1].view(-1).data == test_out.data).float().sum() - accuracy = n_correct / len(test_set) - print("Test set accuracy: {}".format(accuracy)) - -if __name__ == "__main__": - main() diff --git a/models/conv_rnn/train.py b/models/conv_rnn/train.py deleted file mode 100644 index 06a820d..0000000 --- a/models/conv_rnn/train.py +++ /dev/null @@ -1,166 +0,0 @@ -import argparse -import os -import random - -from torch import utils -from tqdm import tqdm -import numpy as np -import torch -import torch.nn as nn - -import data -import model - -class RandomSearch(object): - def __init__(self, params): - self.params = params - - def __iter__(self): - param_space = list(GridSearch(self.params)) - random.shuffle(param_space) - for param in param_space: - yield param - -class GridSearch(object): - def __init__(self, params): - self.params = params - self.param_lengths = [len(param) for param in self.params] - self.indices = [1] * len(params) - - def _update(self, carry_idx): - if carry_idx >= len(self.params): - return True - if self.indices[carry_idx] < self.param_lengths[carry_idx]: - self.indices[carry_idx] += 1 - return False - else: - self.indices[carry_idx] = 1 - return False or self._update(carry_idx + 1) - - def __iter__(self): - self.stop_next = False - self.indices = [1] * len(self.params) - return self - - def __next__(self): - if self.stop_next: - raise StopIteration - result = [param[idx - 1] for param, idx in zip(self.params, self.indices)] - self.indices[0] += 1 - if self.indices[0] == self.param_lengths[0] + 1: - self.indices[0] = 1 - self.stop_next = self._update(1) - return result - -def train(**kwargs): - mbatch_size = kwargs["mbatch_size"] - n_epochs = kwargs["n_epochs"] - restore = kwargs["restore"] - verbose = not kwargs["quiet"] - lr = kwargs["lr"] - weight_decay = kwargs["weight_decay"] - seed = kwargs["seed"] - - if not kwargs["no_cuda"]: - torch.cuda.set_device(kwargs["gpu_number"]) - model.set_seed(seed) - embed_loader = data.SSTEmbeddingLoader("data") - if restore: - conv_rnn = torch.load(kwargs["input_file"]) - else: - id_dict, weights, unk_vocab_list = embed_loader.load_embed_data() - word_model = model.SSTWordEmbeddingModel(id_dict, weights, unk_vocab_list) - if not kwargs["no_cuda"]: - word_model.cuda() - conv_rnn = model.ConvRNNModel(word_model, **kwargs) - if not kwargs["no_cuda"]: - conv_rnn.cuda() - - conv_rnn.train() - criterion = nn.CrossEntropyLoss() - parameters = list(filter(lambda p: p.requires_grad, conv_rnn.parameters())) - optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=weight_decay) - train_set, dev_set, test_set = data.SSTDataset.load_sst_sets("data") - - collate_fn = conv_rnn.convert_dataset - train_loader = utils.data.DataLoader(train_set, shuffle=True, batch_size=mbatch_size, collate_fn=collate_fn) - dev_loader = utils.data.DataLoader(dev_set, batch_size=len(dev_set), collate_fn=collate_fn) - test_loader = utils.data.DataLoader(test_set, batch_size=len(test_set), collate_fn=collate_fn) - - def evaluate(loader, dev=True): - conv_rnn.eval() - for m_in, m_out in loader: - scores = conv_rnn(*m_in) - loss = criterion(scores, m_out).item() - n_correct = (torch.max(scores, 1)[1].view(m_in[0].size(0)).data == m_out.data).float().sum().item() - accuracy = n_correct / m_in[0].size(0) - if dev and accuracy >= evaluate.best_dev: - evaluate.best_dev = accuracy - print("Saving best model ({})...".format(accuracy)) - torch.save(conv_rnn, kwargs["output_file"]) - if verbose: - print("{} set accuracy: {}, loss: {}".format("dev" if dev else "test", accuracy, loss)) - conv_rnn.train() - evaluate.best_dev = 0 - - for epoch in range(n_epochs): - print("Epoch number: {}".format(epoch), end="\r") - if verbose: - print() - i = 0 - for (j, (train_in, train_out)), _ in zip(enumerate(train_loader), tqdm(range(len(train_loader)))): - optimizer.zero_grad() - scores = conv_rnn(*train_in) - loss = criterion(scores, train_out) - loss.backward() - optimizer.step() - evaluate(dev_loader) - evaluate(test_loader, dev=False) - return evaluate.best_dev - -def do_random_search(given_params): - test_grid = [[0.15, 0.2], [4, 5, 6], [150, 200], [3, 4, 5], [200, 300], [200, 250]] - max_params = None - max_acc = 0. - for args in RandomSearch(test_grid): - sf, gc, hid, seed, fc_size, fmaps = args - print("Testing {}".format(args)) - given_params.update(dict(n_epochs=7, quiet=True, gradient_clip=gc, hidden_Size=hid, seed=seed, - n_feature_maps=fmaps, fc_size=fc_size)) - dev_acc = train(**given_params) - print("Dev accuracy: {}".format(dev_acc)) - if dev_acc > max_acc: - print("Found current max") - max_acc = dev_acc - max_params = args - print("Best params: {}".format(max_params)) - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--dev_per_epoch", default=9, type=int) - parser.add_argument("--fc_size", default=200, type=int) - parser.add_argument("--gpu_number", default=0, type=int) - parser.add_argument("--hidden_size", default=150, type=int) - parser.add_argument("--input_file", default="saves/model.pt", type=str) - parser.add_argument("--lr", default=5E-4, type=float) - parser.add_argument("--mbatch_size", default=16, type=int) - parser.add_argument("--n_epochs", default=30, type=int) - parser.add_argument("--n_feature_maps", default=200, type=float) - parser.add_argument("--n_labels", default=5, type=int) - parser.add_argument("--no_cuda", action="store_true", default=False) - parser.add_argument("--output_file", default="saves/model.pt", type=str) - parser.add_argument("--random_search", action="store_true", default=False) - parser.add_argument("--restore", action="store_true", default=False) - parser.add_argument("--rnn_type", choices=["lstm", "gru"], default="lstm", type=str) - parser.add_argument("--seed", default=3, type=int) - parser.add_argument("--quiet", action="store_true", default=False) - parser.add_argument("--weight_decay", default=1E-3, type=float) - args = parser.parse_args() - if args.random_search: - do_random_search(vars(args)) - return - train(**vars(args)) - -if __name__ == "__main__": - main() - From 04e1138f1954ef2ea4372011e1e05d2678335eda Mon Sep 17 00:00:00 2001 From: Achyudh Ram Date: Tue, 19 Mar 2019 23:08:56 -0400 Subject: [PATCH 3/7] Fix default dataset in KimCNN --- models/kim_cnn/args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/kim_cnn/args.py b/models/kim_cnn/args.py index 81094cb..a5c5ac1 100644 --- a/models/kim_cnn/args.py +++ b/models/kim_cnn/args.py @@ -6,7 +6,7 @@ def get_args(): parser = models.args.get_args() - parser.add_argument('--dataset', type=str, default='SST-1', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014']) + parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014']) parser.add_argument('--mode', type=str, default='multichannel', choices=['rand', 'static', 'non-static', 'multichannel']) parser.add_argument('--output-channel', type=int, default=100) parser.add_argument('--words-dim', type=int, default=300) From 256bb69aa3c675b2349a7329a8d5e30bb64bef1c Mon Sep 17 00:00:00 2001 From: Achyudh Ram Date: Tue, 19 Mar 2019 23:10:09 -0400 Subject: [PATCH 4/7] Add attribute check for is_multilabel --- models/reg_lstm/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/models/reg_lstm/__main__.py b/models/reg_lstm/__main__.py index c2c5b0e..264906f 100644 --- a/models/reg_lstm/__main__.py +++ b/models/reg_lstm/__main__.py @@ -45,7 +45,9 @@ def get_logger(): def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_size, device, is_multilabel): saved_model_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, loader, batch_size, device) - saved_model_evaluator.is_multilabel = is_multilabel + if hasattr(saved_model_evaluator, 'is_multilabel'): + saved_model_evaluator.is_multilabel = dataset_class.IS_MULTILABEL + scores, metric_names = saved_model_evaluator.get_scores() print('Evaluation metrics for', split_name) print(metric_names) @@ -53,7 +55,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si if __name__ == '__main__': - # Set default configuration in : args.py + # Set default configuration in args.py args = get_args() logger = get_logger() From 533510b5a79139174d24cacc6d382ec04882f41d Mon Sep 17 00:00:00 2001 From: Achyudh Ram Date: Tue, 19 Mar 2019 23:10:58 -0400 Subject: [PATCH 5/7] Refactor char_cnn driver method --- models/char_cnn/__main__.py | 79 +++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/models/char_cnn/__main__.py b/models/char_cnn/__main__.py index 2fbf171..3010cc5 100644 --- a/models/char_cnn/__main__.py +++ b/models/char_cnn/__main__.py @@ -1,18 +1,18 @@ -from copy import deepcopy import logging import random +from copy import deepcopy import numpy as np import torch -from models.char_cnn import get_args -from models.char_cnn import CharCNN from common.evaluate import EvaluatorFactory from common.train import TrainerFactory from datasets.aapd import AAPDCharQuantized as AAPD from datasets.imdb import IMDBCharQuantized as IMDB from datasets.reuters import ReutersCharQuantized as Reuters from datasets.yelp2014 import Yelp2014CharQuantized as Yelp2014 +from models.char_cnn.args import get_args +from models.char_cnn.model import CharCNN class UnknownWordVecCache(object): @@ -26,8 +26,6 @@ def unk(cls, tensor): size_tup = tuple(tensor.size()) if size_tup not in cls.cache: cls.cache[size_tup] = torch.Tensor(tensor.size()) - # choose 0.25 so unknown vectors have approximately same variance as pre-trained ones - # same as original implementation: https://github.com/yoonkim/CNN_sentence/blob/0a626a048757d5272a7e8ccede256a434a6529be/process_data.py#L95 cls.cache[size_tup].uniform_(-0.25, 0.25) return cls.cache[size_tup] @@ -47,7 +45,11 @@ def get_logger(): def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_size, device, single_label): saved_model_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, loader, batch_size, device) - saved_model_evaluator.ignore_lengths = True + if hasattr(saved_model_evaluator, 'is_multilabel'): + saved_model_evaluator.is_multilabel = dataset_class.IS_MULTILABEL + if hasattr(saved_model_evaluator, 'ignore_lengths'): + saved_model_evaluator.ignore_lengths = True + saved_model_evaluator.single_label = single_label scores, metric_names = saved_model_evaluator.get_scores() print('Evaluation metrics for', split_name) @@ -56,12 +58,16 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si if __name__ == '__main__': - # Set default configuration in : args.py + # Set default configuration in args.py args = get_args() + logger = get_logger() # Set random seed for reproducibility torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = True + np.random.seed(args.seed) + random.seed(args.seed) + if not args.cuda: args.gpu = -1 if torch.cuda.is_available() and args.cuda: @@ -69,10 +75,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si torch.cuda.set_device(args.gpu) torch.cuda.manual_seed(args.seed) if torch.cuda.is_available() and not args.cuda: - print('Warning: You have Cuda but not use it. You are using CPU for training.') - np.random.seed(args.seed) - random.seed(args.seed) - logger = get_logger() + print('Warning: Using CPU for training') dataset_map = { 'Reuters': Reuters, @@ -83,20 +86,25 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si if args.dataset not in dataset_map: raise ValueError('Unrecognized dataset') + else: - train_iter, dev_iter, test_iter = dataset_map[args.dataset].iters(args.data_dir, args.word_vectors_file, - args.word_vectors_dir, - batch_size=args.batch_size, device=args.gpu, - unk_init=UnknownWordVecCache.unk) + dataset_class = dataset_map[args.dataset] + train_iter, dev_iter, test_iter = dataset_class.iters(args.data_dir, + args.word_vectors_file, + args.word_vectors_dir, + batch_size=args.batch_size, + device=args.gpu, + unk_init=UnknownWordVecCache.unk) config = deepcopy(args) config.dataset = train_iter.dataset config.target_class = train_iter.dataset.NUM_CLASSES - print('LABEL.target_class:', train_iter.dataset.NUM_CLASSES) - print('Train instance', len(train_iter.dataset)) - print('Dev instance', len(dev_iter.dataset)) - print('Test instance', len(test_iter.dataset)) + print('Dataset:', args.dataset) + print('No. of target classes:', train_iter.dataset.NUM_CLASSES) + print('No. of train instances', len(train_iter.dataset)) + print('No. of dev instances', len(dev_iter.dataset)) + print('No. of test instances', len(test_iter.dataset)) if args.resume_snapshot: if args.cuda: @@ -107,34 +115,36 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si model = CharCNN(config) if args.cuda: model.cuda() - print('Shift model to GPU') parameter = filter(lambda p: p.requires_grad, model.parameters()) optimizer = torch.optim.Adam(parameter, lr=args.lr, weight_decay=args.weight_decay) - if args.dataset not in dataset_map: - raise ValueError('Unrecognized dataset') - else: - train_evaluator = EvaluatorFactory.get_evaluator(dataset_map[args.dataset], model, None, train_iter, args.batch_size, args.gpu) - test_evaluator = EvaluatorFactory.get_evaluator(dataset_map[args.dataset], model, None, test_iter, args.batch_size, args.gpu) - dev_evaluator = EvaluatorFactory.get_evaluator(dataset_map[args.dataset], model, None, dev_iter, args.batch_size, args.gpu) - train_evaluator.single_label = args.single_label - test_evaluator.single_label = args.single_label - dev_evaluator.single_label = args.single_label + train_evaluator = EvaluatorFactory.get_evaluator(dataset_class, model, None, train_iter, args.batch_size, args.gpu) + test_evaluator = EvaluatorFactory.get_evaluator(dataset_class, model, None, test_iter, args.batch_size, args.gpu) + dev_evaluator = EvaluatorFactory.get_evaluator(dataset_class, model, None, dev_iter, args.batch_size, args.gpu) + + if hasattr(train_evaluator, 'is_multilabel'): + train_evaluator.is_multilabel = dataset_class.IS_MULTILABEL + if hasattr(dev_evaluator, 'is_multilabel'): + dev_evaluator.is_multilabel = dataset_class.IS_MULTILABEL + if hasattr(dev_evaluator, 'ignore_lengths'): dev_evaluator.ignore_lengths = True + if hasattr(test_evaluator, 'is_multilabel'): + test_evaluator.is_multilabel = dataset_class.IS_MULTILABEL + if hasattr(test_evaluator, 'ignore_lengths'): test_evaluator.ignore_lengths = True trainer_config = { 'optimizer': optimizer, 'batch_size': args.batch_size, 'log_interval': args.log_every, - 'dev_log_interval': args.dev_every, 'patience': args.patience, - 'model_outfile': args.save_path, # actually a directory, using model_outfile to conform to Trainer naming convention + 'model_outfile': args.save_path, 'logger': logger, - 'ignore_lengths': True, - 'single_label': args.single_label + 'is_multilabel': dataset_class.IS_MULTILABEL, + 'ignore_lengths': True } + trainer = TrainerFactory.get_trainer(args.dataset, model, None, train_iter, trainer_config, train_evaluator, test_evaluator, dev_evaluator) if not args.trained_model: @@ -145,8 +155,9 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si else: model = torch.load(args.trained_model, map_location=lambda storage, location: storage) - # Calculate dev and test metrics model = torch.load(trainer.snapshot_path) + + # Calculate dev and test metrics if args.dataset not in dataset_map: raise ValueError('Unrecognized dataset') else: From ce6c5e99232fc547725180f2bd518a8d1ba47d65 Mon Sep 17 00:00:00 2001 From: Achyudh Ram Date: Tue, 19 Mar 2019 23:41:34 -0400 Subject: [PATCH 6/7] Fix spacing issues --- models/char_cnn/model.py | 4 ++++ models/reg_lstm/model.py | 1 + 2 files changed, 5 insertions(+) diff --git a/models/char_cnn/model.py b/models/char_cnn/model.py index 1873a58..4f7bdc2 100644 --- a/models/char_cnn/model.py +++ b/models/char_cnn/model.py @@ -9,6 +9,7 @@ class CharCNN(nn.Module): def __init__(self, config): super().__init__() self.is_cuda_enabled = config.cuda + num_conv_filters = config.num_conv_filters output_channel = config.output_channel num_affine_neurons = config.num_affine_neurons @@ -21,6 +22,7 @@ def __init__(self, config): self.conv4 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=3) self.conv5 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=3) self.conv6 = nn.Conv1d(num_conv_filters, output_channel, kernel_size=3) + self.dropout = nn.Dropout(config.dropout) self.fc1 = nn.Linear(output_channel, num_affine_neurons) self.fc2 = nn.Linear(num_affine_neurons, num_affine_neurons) @@ -31,12 +33,14 @@ def forward(self, x, **kwargs): x = x.transpose(1, 2).type(torch.cuda.FloatTensor) else: x = x.transpose(1, 2).type(torch.FloatTensor) + x = F.max_pool1d(F.relu(self.conv1(x)), 3) x = F.max_pool1d(F.relu(self.conv2(x)), 3) x = F.relu(self.conv3(x)) x = F.relu(self.conv4(x)) x = F.relu(self.conv5(x)) x = F.relu(self.conv6(x)) + x = F.max_pool1d(x, x.size(2)).squeeze(2) x = F.relu(self.fc1(x.view(x.size(0), -1))) x = self.dropout(x) diff --git a/models/reg_lstm/model.py b/models/reg_lstm/model.py index b1d7fa2..ac955f6 100644 --- a/models/reg_lstm/model.py +++ b/models/reg_lstm/model.py @@ -40,6 +40,7 @@ def __init__(self, config): if self.wdrop: self.lstm = WeightDrop(self.lstm, ['weight_hh_l0'], dropout=self.wdrop) self.dropout = nn.Dropout(config.dropout) + if self.has_bottleneck_layer: if self.is_bidirectional: self.fc1 = nn.Linear(2 * config.hidden_dim, config.hidden_dim) # Hidden Bottleneck Layer From 987fb2475565efdf97555e5701681f841242a18a Mon Sep 17 00:00:00 2001 From: Achyudh Ram Date: Wed, 20 Mar 2019 00:18:46 -0400 Subject: [PATCH 7/7] Squash minor issues with driver methods --- datasets/imdb.py | 3 +-- models/char_cnn/__main__.py | 13 ++++++++----- models/reg_lstm/__main__.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/datasets/imdb.py b/datasets/imdb.py index b50410a..5cd60db 100644 --- a/datasets/imdb.py +++ b/datasets/imdb.py @@ -1,5 +1,4 @@ import os -import re import numpy as np import torch @@ -7,7 +6,7 @@ from torchtext.data.iterator import BucketIterator from torchtext.vocab import Vectors -from datasets.reuters import clean_string, clean_string_fl, split_sents +from datasets.reuters import clean_string, split_sents def char_quantize(string, max_length=500): diff --git a/models/char_cnn/__main__.py b/models/char_cnn/__main__.py index 3010cc5..0ff1fe2 100644 --- a/models/char_cnn/__main__.py +++ b/models/char_cnn/__main__.py @@ -43,14 +43,13 @@ def get_logger(): return logger -def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_size, device, single_label): +def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_size, device, is_multilabel): saved_model_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, loader, batch_size, device) if hasattr(saved_model_evaluator, 'is_multilabel'): - saved_model_evaluator.is_multilabel = dataset_class.IS_MULTILABEL + saved_model_evaluator.is_multilabel = is_multilabel if hasattr(saved_model_evaluator, 'ignore_lengths'): saved_model_evaluator.ignore_lengths = True - saved_model_evaluator.single_label = single_label scores, metric_names = saved_model_evaluator.get_scores() print('Evaluation metrics for', split_name) print(metric_names) @@ -161,5 +160,9 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si if args.dataset not in dataset_map: raise ValueError('Unrecognized dataset') else: - evaluate_dataset('dev', dataset_map[args.dataset], model, None, dev_iter, args.batch_size, args.gpu, args.single_label) - evaluate_dataset('test', dataset_map[args.dataset], model, None, test_iter, args.batch_size, args.gpu, args.single_label) + evaluate_dataset('dev', dataset_map[args.dataset], model, None, dev_iter, args.batch_size, + is_multilabel=dataset_class.IS_MULTILABEL, + device=args.gpu) + evaluate_dataset('test', dataset_map[args.dataset], model, None, test_iter, args.batch_size, + is_multilabel=dataset_class.IS_MULTILABEL, + device=args.gpu) diff --git a/models/reg_lstm/__main__.py b/models/reg_lstm/__main__.py index 264906f..ecc37f6 100644 --- a/models/reg_lstm/__main__.py +++ b/models/reg_lstm/__main__.py @@ -46,7 +46,7 @@ def get_logger(): def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_size, device, is_multilabel): saved_model_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, loader, batch_size, device) if hasattr(saved_model_evaluator, 'is_multilabel'): - saved_model_evaluator.is_multilabel = dataset_class.IS_MULTILABEL + saved_model_evaluator.is_multilabel = is_multilabel scores, metric_names = saved_model_evaluator.get_scores() print('Evaluation metrics for', split_name)