Commit bb5cf71

Merge remote-tracking branch 'vdpwi/master'
daemon committed May 23, 2018
2 parents cc53e60 + 73823fc commit bb5cf71
Showing 10 changed files with 583 additions and 0 deletions.
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 Ralph Tang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
8 changes: 8 additions & 0 deletions vdpwi.sublime-project
@@ -0,0 +1,8 @@
{
"folders":
[
{
"path": "vdpwi"
}
]
}
139 changes: 139 additions & 0 deletions vdpwi/__main__.py
@@ -0,0 +1,139 @@
from collections import namedtuple

from tqdm import tqdm
import numpy as np
import scipy.stats as stats
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.utils as utils

from utils.log import LogWriter
import data
import model as mod

Context = namedtuple("Context", "model, train_loader, dev_loader, test_loader, optimizer, criterion, params, log_writer")
EvaluateResult = namedtuple("EvaluateResult", "pearsonr, spearmanr")

def create_context(config):
def collate_fn(batch):
emb1 = []
emb2 = []
labels = []
cmp_labels = []
pad_cube = []
max_len1 = 0; max_len2 = 0

for s1, s2, l, cl in batch:
            emb1.append(list(s1))  # copy, so the in-place padding below does not mutate the dataset's stored token lists
            emb2.append(list(s2))
max_len1 = max(max_len1, len(s1))
max_len2 = max(max_len2, len(s2))
labels.append(l)
cmp_labels.append(cl)

for s1, s2 in zip(emb1, emb2):
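            # Pad both token lists to the batch-wide max lengths using the last embedding row (the all-zero
            # padding vector appended in data.load_sick); pad_cube is 1 over padded cells so the model can mask them out.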
pad1 = (max_len1 - len(s1))
pad2 = (max_len2 - len(s2))
pad_mask = np.ones((max_len1, max_len2))
pad_mask[:len(s1), :len(s2)] = 0
pad_cube.append(pad_mask)
s1.extend([embedding.weight.size(0) - 1] * pad1)
s2.extend([embedding.weight.size(0) - 1] * pad2)

pad_cube = np.array(pad_cube)
emb1 = torch.LongTensor(emb1)
emb2 = torch.LongTensor(emb2)
labels = torch.Tensor(labels)
emb1 = torch.autograd.Variable(emb1, requires_grad=False)
emb2 = torch.autograd.Variable(emb2, requires_grad=False)
labels = torch.autograd.Variable(labels, requires_grad=False)
pad_cube = torch.autograd.Variable(torch.from_numpy(pad_cube).float(), requires_grad=False)
if not config.cpu:
emb1 = emb1.cuda()
emb2 = emb2.cuda()
labels = labels.cuda()
pad_cube = pad_cube.cuda()
return emb1, emb2, labels, pad_cube, cmp_labels

embedding, (train_set, dev_set, test_set) = data.load_dataset(config.dataset)
model = mod.VDPWIModel(embedding, config)
if config.restore:
model.load(config.input_file)
if not config.cpu:
model = model.cuda()

train_loader = utils.data.DataLoader(train_set, shuffle=True, batch_size=config.mbatch_size, collate_fn=collate_fn)
dev_loader = utils.data.DataLoader(dev_set, batch_size=1, collate_fn=collate_fn)
test_loader = utils.data.DataLoader(test_set, batch_size=1, collate_fn=collate_fn)

params = list(filter(lambda x: x.requires_grad, model.parameters()))
if config.optimizer == "adam":
optimizer = optim.Adam(params, lr=config.lr, weight_decay=config.weight_decay)
elif config.optimizer == "sgd":
optimizer = optim.SGD(params, lr=config.lr, momentum=config.momentum, weight_decay=config.weight_decay)
elif config.optimizer == "rmsprop":
optimizer = optim.RMSprop(params, lr=config.lr, alpha=config.decay, momentum=config.momentum, weight_decay=config.weight_decay)
criterion = nn.KLDivLoss()
log_writer = LogWriter()
return Context(model, train_loader, dev_loader, test_loader, optimizer, criterion, params, log_writer)

def test(config):
context = create_context(config)
result = evaluate(context, context.test_loader)
print("Final test result: {}".format(result))

def evaluate(context, data_loader):
model = context.model
model.eval()
predictions = []
true_labels = []
for sent1, sent2, _, pad_cube, truth in data_loader:
scores = model(sent1, sent2, pad_cube)
scores = F.softmax(scores).cpu().data.numpy()[0]
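        # Collapse the predicted class distribution into a scalar similarity score by taking its
        # expected value over the label values 1..n_labels.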
prediction = np.dot(np.arange(1, len(scores) + 1), scores)
predictions.append(prediction); true_labels.append(truth[0][0])

pearsonr = stats.pearsonr(predictions, true_labels)[0]
spearmanr = stats.spearmanr(predictions, true_labels)[0]
context.log_writer.log_dev_metrics(pearsonr, spearmanr)
return EvaluateResult(pearsonr, spearmanr)

def train(config):
context = create_context(config)
context.log_writer.log_hyperparams()
best_dev_pr = 0
for epoch_no in range(config.n_epochs):
print("Epoch number: {}".format(epoch_no + 1))
loader_wrapper = tqdm(context.train_loader, total=len(context.train_loader), desc="Loss")
context.model.train()
loss = 0
for sent1, sent2, label_pmf, pad_cube, _ in loader_wrapper:
context.optimizer.zero_grad()
scores = F.log_softmax(context.model(sent1, sent2, pad_cube))
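            # KLDivLoss expects log-probabilities as input; the target label_pmf is the sparse
            # similarity distribution read from sim_sparse.txt in data.load_sick.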

loss = context.criterion(scores, label_pmf)
loss.backward()
nn.utils.clip_grad_norm(context.params, config.clip_norm)
context.optimizer.step()

loss = loss.cpu().data[0]
loader_wrapper.set_description("Loss: {:<8}".format(round(loss, 5)))
context.log_writer.log_train_loss(loss)
result = evaluate(context, context.dev_loader)
print("Dev result: {}".format(result))
if best_dev_pr < result.pearsonr:
best_dev_pr = result.pearsonr
print("Saving best model...")
context.model.save(config.output_file)

def main():
config = data.Configs.base_config()
if config.mode == "train":
train(config)
elif config.mode == "test":
test(config)

if __name__ == "__main__":
main()
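
For reference, a hypothetical invocation of this entry point, assuming it is run from the repository root, that the SICK data and cached word vectors sit under the default local_data paths, and that the remaining files of this merge (e.g. model.py and utils/log.py, not shown above) are present:

    python vdpwi/__main__.py --mode train --optimizer rmsprop --n_epochs 35
    python vdpwi/__main__.py --mode test --restore --input_file local_saves/model.pt

All flags shown here are defined in vdpwi/data.py below.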
104 changes: 104 additions & 0 deletions vdpwi/data.py
@@ -0,0 +1,104 @@
import argparse
import os

import torch
import torch.nn as nn
import torch.utils.data as data

class Configs(object):
@staticmethod
def base_config():
parser = argparse.ArgumentParser()
parser.add_argument("--classifier", type=str, default="vdpwi", choices=["vdpwi", "resnet"])
parser.add_argument("--clip_norm", type=float, default=50)
parser.add_argument("--cpu", action="store_true", default=False)
parser.add_argument("--dataset", type=str, default="sick", choices=["sick"])
parser.add_argument("--decay", type=float, default=0.95)
parser.add_argument("--input_file", type=str, default="local_saves/model.pt")
parser.add_argument("--lr", type=float, default=5E-4)
parser.add_argument("--mbatch_size", type=int, default=16)
parser.add_argument("--mode", type=str, default="train", choices=["train", "test"])
parser.add_argument("--momentum", type=float, default=0.1)
parser.add_argument("--n_epochs", type=int, default=35)
parser.add_argument("--n_labels", type=int, default=5)
parser.add_argument("--optimizer", type=str, default="rmsprop", choices=["adam", "sgd", "rmsprop"])
parser.add_argument("--output_file", type=str, default="local_saves/model.pt")
parser.add_argument("--res_fmaps", type=int, default=32)
parser.add_argument("--res_layers", type=int, default=16)
parser.add_argument("--restore", action="store_true", default=False)
parser.add_argument("--rnn_hidden_dim", type=int, default=250)
parser.add_argument("--weight_decay", type=float, default=1E-5)
parser.add_argument("--wordvecs_file", type=str, default="local_data/glove/glove.840B.300d.txt")
return parser.parse_known_args()[0]

@staticmethod
def sick_config():
parser = argparse.ArgumentParser()
parser.add_argument("--n_labels", type=int, default=5)
parser.add_argument("--sick_cache", type=str, default="local_data/sick/.vec-cache")
parser.add_argument("--sick_data", type=str, default="local_data/sick")
return parser.parse_known_args()[0]

class LabeledEmbeddedDataset(data.Dataset):
def __init__(self, sentence_indices1, sentence_indices2, labels, compare_labels=None):
assert len(sentence_indices1) == len(labels) == len(sentence_indices2)
self.sentence_indices1 = sentence_indices1
self.sentence_indices2 = sentence_indices2
self.labels = labels
self.compare_labels = compare_labels

def __getitem__(self, idx):
cmp_lbl = None if self.compare_labels is None else self.compare_labels[idx]
return self.sentence_indices1[idx], self.sentence_indices2[idx], self.labels[idx], cmp_lbl

def __len__(self):
return len(self.labels)

def load_sick():
config = Configs.sick_config()
def fetch_indices(name):
sentence_indices = []
filename = os.path.join(config.sick_data, dataset, name)
with open(filename) as f:
for line in f:
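                # Tokens missing from the pretrained vocabulary map to -1 and are dropped below.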
indices = [embed_ids.get(word, -1) for word in line.strip().split()]
indices = list(filter(lambda x: x >= 0, indices))
sentence_indices.append(indices)
return sentence_indices

def read_labels(filename):
labels = []
with open(filename) as f:
for line in f:
labels.append([float(val) for val in line.split()])
return labels

sets = []
embeddings = []
embed_ids = {}
with open(os.path.join(config.sick_cache)) as f:
for i, line in enumerate(f):
word, vec = line.split(" ", 1)
vec = list(map(float, vec.strip().split()))
embed_ids[word] = i
embeddings.append(vec)
padding_idx = len(embeddings)
embeddings.append([0.0] * 300)
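    # The final all-zero row is the padding vector (index padding_idx); collate_fn in __main__.py pads with it.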

for dataset in ("train", "dev", "test"):
sparse_filename = os.path.join(config.sick_data, dataset, "sim_sparse.txt")
truth_filename = os.path.join(config.sick_data, dataset, "sim.txt")
sparse_labels = read_labels(sparse_filename)
cmp_labels = read_labels(truth_filename)
indices1 = fetch_indices("a.toks")
indices2 = fetch_indices("b.toks")
sets.append(LabeledEmbeddedDataset(indices1, indices2, sparse_labels, cmp_labels))
embedding = nn.Embedding(len(embeddings), 300)
embedding.weight.data.copy_(torch.Tensor(embeddings))
embedding.weight.requires_grad = False
return embedding, sets

def load_dataset(dataset):
return _loaders[dataset]()

_loaders = dict(sick=load_sick)
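
For reference, a minimal sketch of the consumer side, mirroring create_context in vdpwi/__main__.py above:

    import data
    # embedding is a frozen nn.Embedding built from the cached vectors plus one zero padding row;
    # each split is a LabeledEmbeddedDataset yielding (indices1, indices2, sparse_pmf, raw similarity label(s) from sim.txt).
    embedding, (train_set, dev_set, test_set) = data.load_dataset("sick")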