Commit
Merge pull request #9 from achyudh/master
Refactor CharCNN model
achyudh committed Mar 20, 2019
2 parents 9f50fa9 + 9677258 commit 90594b9
Showing 14 changed files with 64 additions and 468 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -1,4 +1,4 @@
-# Castor
+# Hedwig

This repo contains PyTorch deep learning models for document classification, implemented by the Data Systems Group at the University of Waterloo.

3 changes: 1 addition & 2 deletions datasets/imdb.py
@@ -1,13 +1,12 @@
import os
-import re

import numpy as np
import torch
from torchtext.data import NestedField, Field, TabularDataset
from torchtext.data.iterator import BucketIterator
from torchtext.vocab import Vectors

-from datasets.reuters import clean_string, clean_string_fl, split_sents
+from datasets.reuters import clean_string, split_sents


def char_quantize(string, max_length=500):
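The body of `char_quantize` is collapsed in this view. As a rough sketch only (the alphabet below is illustrative, not copied from the repo, which keeps its own alphabet in the dataset modules), character quantization one-hot encodes each character over a fixed alphabet and truncates or zero-pads the document to `max_length`:

```python
# Illustrative sketch, not the repo's implementation: one-hot encode each
# character over a fixed alphabet, truncating/padding to max_length rows.
import numpy as np

ALPHABET = {c: i for i, c in enumerate(
    "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}")}

def char_quantize_sketch(string, max_length=500):
    quantized = np.zeros((max_length, len(ALPHABET)), dtype=np.float32)
    for i, char in enumerate(string.lower()[:max_length]):
        if char in ALPHABET:
            quantized[i][ALPHABET[char]] = 1.0
    return quantized

print(char_quantize_sketch("Hello, world!").shape)  # (500, 68)
```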
90 changes: 52 additions & 38 deletions models/char_cnn/__main__.py
@@ -1,18 +1,18 @@
-from copy import deepcopy
import logging
import random
+from copy import deepcopy

import numpy as np
import torch

-from models.char_cnn import get_args
-from models.char_cnn import CharCNN
from common.evaluate import EvaluatorFactory
from common.train import TrainerFactory
from datasets.aapd import AAPDCharQuantized as AAPD
from datasets.imdb import IMDBCharQuantized as IMDB
from datasets.reuters import ReutersCharQuantized as Reuters
from datasets.yelp2014 import Yelp2014CharQuantized as Yelp2014
+from models.char_cnn.args import get_args
+from models.char_cnn.model import CharCNN


class UnknownWordVecCache(object):
@@ -26,8 +26,6 @@ def unk(cls, tensor):
        size_tup = tuple(tensor.size())
        if size_tup not in cls.cache:
            cls.cache[size_tup] = torch.Tensor(tensor.size())
-            # choose 0.25 so unknown vectors have approximately same variance as pre-trained ones
-            # same as original implementation: https://github.com/yoonkim/CNN_sentence/blob/0a626a048757d5272a7e8ccede256a434a6529be/process_data.py#L95
            cls.cache[size_tup].uniform_(-0.25, 0.25)
        return cls.cache[size_tup]

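Aside (not part of the diff): `UnknownWordVecCache` memoizes one uniform random vector per tensor shape, so every out-of-vocabulary token of a given dimensionality shares the same embedding; the ±0.25 range, per the comment the diff removes, follows Kim's CNN implementation so unknown vectors have roughly the same variance as pre-trained ones. A minimal standalone illustration of the pattern:

```python
import torch

cache = {}

def unk(tensor):
    # One cached random vector per shape: all unknown tokens of that
    # dimensionality map to the same embedding.
    size_tup = tuple(tensor.size())
    if size_tup not in cache:
        cache[size_tup] = torch.Tensor(*size_tup).uniform_(-0.25, 0.25)
    return cache[size_tup]

a = unk(torch.empty(1, 300))
b = unk(torch.empty(1, 300))
print(torch.equal(a, b))  # True: same shape, same cached vector
```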
@@ -45,34 +43,38 @@ def get_logger():
    return logger


-def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_size, device, single_label):
+def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_size, device, is_multilabel):
    saved_model_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, loader, batch_size, device)
-    saved_model_evaluator.ignore_lengths = True
-    saved_model_evaluator.single_label = single_label
+    if hasattr(saved_model_evaluator, 'is_multilabel'):
+        saved_model_evaluator.is_multilabel = is_multilabel
+    if hasattr(saved_model_evaluator, 'ignore_lengths'):
+        saved_model_evaluator.ignore_lengths = True

    scores, metric_names = saved_model_evaluator.get_scores()
    print('Evaluation metrics for', split_name)
    print(metric_names)
    print(scores)


if __name__ == '__main__':
-    # Set default configuration in : args.py
+    # Set default configuration in args.py
    args = get_args()
+    logger = get_logger()

    # Set random seed for reproducibility
    torch.manual_seed(args.seed)
+    torch.backends.cudnn.deterministic = True
+    np.random.seed(args.seed)
+    random.seed(args.seed)

    if not args.cuda:
        args.gpu = -1
    if torch.cuda.is_available() and args.cuda:
        print('Note: You are using GPU for training')
        torch.cuda.set_device(args.gpu)
        torch.cuda.manual_seed(args.seed)
    if torch.cuda.is_available() and not args.cuda:
-        print('Warning: You have Cuda but not use it. You are using CPU for training.')
-    np.random.seed(args.seed)
-    random.seed(args.seed)
-    logger = get_logger()
+        print('Warning: Using CPU for training')

    dataset_map = {
        'Reuters': Reuters,
@@ -83,20 +85,25 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si

    if args.dataset not in dataset_map:
        raise ValueError('Unrecognized dataset')
    else:
-        train_iter, dev_iter, test_iter = dataset_map[args.dataset].iters(args.data_dir, args.word_vectors_file,
-                                                                          args.word_vectors_dir,
-                                                                          batch_size=args.batch_size, device=args.gpu,
-                                                                          unk_init=UnknownWordVecCache.unk)
+        dataset_class = dataset_map[args.dataset]
+        train_iter, dev_iter, test_iter = dataset_class.iters(args.data_dir,
+                                                              args.word_vectors_file,
+                                                              args.word_vectors_dir,
+                                                              batch_size=args.batch_size,
+                                                              device=args.gpu,
+                                                              unk_init=UnknownWordVecCache.unk)

    config = deepcopy(args)
    config.dataset = train_iter.dataset
    config.target_class = train_iter.dataset.NUM_CLASSES

-    print('LABEL.target_class:', train_iter.dataset.NUM_CLASSES)
-    print('Train instance', len(train_iter.dataset))
-    print('Dev instance', len(dev_iter.dataset))
-    print('Test instance', len(test_iter.dataset))
+    print('Dataset:', args.dataset)
+    print('No. of target classes:', train_iter.dataset.NUM_CLASSES)
+    print('No. of train instances', len(train_iter.dataset))
+    print('No. of dev instances', len(dev_iter.dataset))
+    print('No. of test instances', len(test_iter.dataset))

    if args.resume_snapshot:
        if args.cuda:
@@ -107,34 +114,36 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
        model = CharCNN(config)
        if args.cuda:
            model.cuda()
-            print('Shift model to GPU')

    parameter = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameter, lr=args.lr, weight_decay=args.weight_decay)

-    if args.dataset not in dataset_map:
-        raise ValueError('Unrecognized dataset')
-    else:
-        train_evaluator = EvaluatorFactory.get_evaluator(dataset_map[args.dataset], model, None, train_iter, args.batch_size, args.gpu)
-        test_evaluator = EvaluatorFactory.get_evaluator(dataset_map[args.dataset], model, None, test_iter, args.batch_size, args.gpu)
-        dev_evaluator = EvaluatorFactory.get_evaluator(dataset_map[args.dataset], model, None, dev_iter, args.batch_size, args.gpu)
-    train_evaluator.single_label = args.single_label
-    test_evaluator.single_label = args.single_label
-    dev_evaluator.single_label = args.single_label
+    train_evaluator = EvaluatorFactory.get_evaluator(dataset_class, model, None, train_iter, args.batch_size, args.gpu)
+    test_evaluator = EvaluatorFactory.get_evaluator(dataset_class, model, None, test_iter, args.batch_size, args.gpu)
+    dev_evaluator = EvaluatorFactory.get_evaluator(dataset_class, model, None, dev_iter, args.batch_size, args.gpu)

+    if hasattr(train_evaluator, 'is_multilabel'):
+        train_evaluator.is_multilabel = dataset_class.IS_MULTILABEL
+    if hasattr(dev_evaluator, 'is_multilabel'):
+        dev_evaluator.is_multilabel = dataset_class.IS_MULTILABEL
+    if hasattr(dev_evaluator, 'ignore_lengths'):
+        dev_evaluator.ignore_lengths = True
+    if hasattr(test_evaluator, 'is_multilabel'):
+        test_evaluator.is_multilabel = dataset_class.IS_MULTILABEL
+    if hasattr(test_evaluator, 'ignore_lengths'):
+        test_evaluator.ignore_lengths = True

    trainer_config = {
        'optimizer': optimizer,
        'batch_size': args.batch_size,
        'log_interval': args.log_every,
        'dev_log_interval': args.dev_every,
        'patience': args.patience,
-        'model_outfile': args.save_path,  # actually a directory, using model_outfile to conform to Trainer naming convention
+        'model_outfile': args.save_path,
        'logger': logger,
-        'ignore_lengths': True,
-        'single_label': args.single_label
+        'is_multilabel': dataset_class.IS_MULTILABEL,
+        'ignore_lengths': True
    }

    trainer = TrainerFactory.get_trainer(args.dataset, model, None, train_iter, trainer_config, train_evaluator, test_evaluator, dev_evaluator)

    if not args.trained_model:
@@ -145,10 +154,15 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
        else:
            model = torch.load(args.trained_model, map_location=lambda storage, location: storage)

-    # Calculate dev and test metrics
    model = torch.load(trainer.snapshot_path)

+    # Calculate dev and test metrics
    if args.dataset not in dataset_map:
        raise ValueError('Unrecognized dataset')
    else:
-        evaluate_dataset('dev', dataset_map[args.dataset], model, None, dev_iter, args.batch_size, args.gpu, args.single_label)
-        evaluate_dataset('test', dataset_map[args.dataset], model, None, test_iter, args.batch_size, args.gpu, args.single_label)
+        evaluate_dataset('dev', dataset_map[args.dataset], model, None, dev_iter, args.batch_size,
+                         is_multilabel=dataset_class.IS_MULTILABEL,
+                         device=args.gpu)
+        evaluate_dataset('test', dataset_map[args.dataset], model, None, test_iter, args.batch_size,
+                         is_multilabel=dataset_class.IS_MULTILABEL,
+                         device=args.gpu)
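A note on the recurring `hasattr` guards introduced above: the refactor only sets `is_multilabel` and `ignore_lengths` on evaluators that actually expose those attributes, so one driver script works across evaluator classes with different capabilities. A self-contained sketch of the pattern (the classes here are illustrative stand-ins, not Hedwig's):

```python
class SingleLabelEvaluator:
    def __init__(self):
        self.is_multilabel = False
        self.ignore_lengths = False

class PairwiseEvaluator:
    pass  # exposes neither flag

def configure(evaluator, is_multilabel):
    # Set only the flags this evaluator supports; silently skip the rest.
    if hasattr(evaluator, 'is_multilabel'):
        evaluator.is_multilabel = is_multilabel
    if hasattr(evaluator, 'ignore_lengths'):
        evaluator.ignore_lengths = True
    return evaluator

print(configure(SingleLabelEvaluator(), True).is_multilabel)  # True
configure(PairwiseEvaluator(), True)  # no-op: no such attributes, no error
```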
4 changes: 4 additions & 0 deletions models/char_cnn/model.py
@@ -9,6 +9,7 @@ class CharCNN(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.is_cuda_enabled = config.cuda
+
        num_conv_filters = config.num_conv_filters
        output_channel = config.output_channel
        num_affine_neurons = config.num_affine_neurons
@@ -21,6 +22,7 @@ def __init__(self, config):
        self.conv4 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=3)
        self.conv5 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=3)
        self.conv6 = nn.Conv1d(num_conv_filters, output_channel, kernel_size=3)
+
        self.dropout = nn.Dropout(config.dropout)
        self.fc1 = nn.Linear(output_channel, num_affine_neurons)
        self.fc2 = nn.Linear(num_affine_neurons, num_affine_neurons)
@@ -31,12 +33,14 @@ def forward(self, x, **kwargs):
            x = x.transpose(1, 2).type(torch.cuda.FloatTensor)
        else:
            x = x.transpose(1, 2).type(torch.FloatTensor)
+
        x = F.max_pool1d(F.relu(self.conv1(x)), 3)
        x = F.max_pool1d(F.relu(self.conv2(x)), 3)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))
+
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        x = F.relu(self.fc1(x.view(x.size(0), -1)))
        x = self.dropout(x)
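For orientation, here is a self-contained approximation of the CharCNN architecture the hunks above touch: six stacked 1D convolutions over one-hot character channels, pooling after conv1, conv2, and a global max pool before the affine layers. The hyperparameter values (alphabet size 68, 256 filters, 1024 affine units, dropout 0.5, 4 classes) are assumptions for this sketch; the real defaults live in `models/char_cnn/args.py`:

```python
# Sketch of the CharCNN forward pass shown above, with assumed hyperparameters.
import torch
import torch.nn as nn
import torch.nn.functional as F

class CharCNNSketch(nn.Module):
    def __init__(self, num_chars=68, num_conv_filters=256,
                 output_channel=256, num_affine_neurons=1024, num_classes=4):
        super().__init__()
        self.conv1 = nn.Conv1d(num_chars, num_conv_filters, kernel_size=7)
        self.conv2 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=7)
        self.conv3 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=3)
        self.conv4 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=3)
        self.conv5 = nn.Conv1d(num_conv_filters, num_conv_filters, kernel_size=3)
        self.conv6 = nn.Conv1d(num_conv_filters, output_channel, kernel_size=3)
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(output_channel, num_affine_neurons)
        self.fc2 = nn.Linear(num_affine_neurons, num_affine_neurons)
        self.fc3 = nn.Linear(num_affine_neurons, num_classes)

    def forward(self, x):
        # x: (batch, seq_len, num_chars) quantized characters
        # -> (batch, num_chars, seq_len) for Conv1d
        x = x.transpose(1, 2).float()
        x = F.max_pool1d(F.relu(self.conv1(x)), 3)
        x = F.max_pool1d(F.relu(self.conv2(x)), 3)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))
        x = F.max_pool1d(x, x.size(2)).squeeze(2)  # global max pool over time
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        return self.fc3(x)

model = CharCNNSketch()
# Toy batch of 2 documents of 500 characters (random input stands in
# for one-hot character quantization here).
out = model(torch.randn(2, 500, 68))
print(out.shape)  # torch.Size([2, 4])
```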
23 changes: 0 additions & 23 deletions models/conv_rnn/README.md

This file was deleted.

Empty file removed models/conv_rnn/__init__.py
Empty file.
59 changes: 0 additions & 59 deletions models/conv_rnn/data.py

This file was deleted.

7 changes: 0 additions & 7 deletions models/conv_rnn/getData.sh

This file was deleted.
