Commit bb5cf71

Merge remote-tracking branch 'vdpwi/master'
daemon committed May 23, 2018
2 parents cc53e60 + 73823fc commit bb5cf71
Showing 10 changed files with 583 additions and 0 deletions.
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 Ralph Tang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
8 changes: 8 additions & 0 deletions vdpwi.sublime-project
@@ -0,0 +1,8 @@
{
"folders":
[
{
"path": "vdpwi"
}
]
}
139 changes: 139 additions & 0 deletions vdpwi/__main__.py
@@ -0,0 +1,139 @@
from collections import namedtuple

from tqdm import tqdm
import numpy as np
import scipy.stats as stats
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.utils as utils

from utils.log import LogWriter
import data
import model as mod

Context = namedtuple("Context", "model, train_loader, dev_loader, test_loader, optimizer, criterion, params, log_writer")
EvaluateResult = namedtuple("EvaluateResult", "pearsonr, spearmanr")

def create_context(config):
def collate_fn(batch):
emb1 = []
emb2 = []
labels = []
cmp_labels = []
pad_cube = []
max_len1 = 0; max_len2 = 0

for s1, s2, l, cl in batch:
            emb1.append(list(s1))  # copy, so the in-place padding below does not mutate the dataset's stored token lists
            emb2.append(list(s2))
max_len1 = max(max_len1, len(s1))
max_len2 = max(max_len2, len(s2))
labels.append(l)
cmp_labels.append(cl)

for s1, s2 in zip(emb1, emb2):
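            # Pad both token lists to the batch-wide max lengths using the last embedding row (the all-zero
            # padding vector appended in data.load_sick); pad_cube is 1 over padded cells so the model can mask them out.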
pad1 = (max_len1 - len(s1))
pad2 = (max_len2 - len(s2))
pad_mask = np.ones((max_len1, max_len2))
pad_mask[:len(s1), :len(s2)] = 0
pad_cube.append(pad_mask)
s1.extend([embedding.weight.size(0) - 1] * pad1)
s2.extend([embedding.weight.size(0) - 1] * pad2)

pad_cube = np.array(pad_cube)
emb1 = torch.LongTensor(emb1)
emb2 = torch.LongTensor(emb2)
labels = torch.Tensor(labels)
emb1 = torch.autograd.Variable(emb1, requires_grad=False)
emb2 = torch.autograd.Variable(emb2, requires_grad=False)
labels = torch.autograd.Variable(labels, requires_grad=False)
pad_cube = torch.autograd.Variable(torch.from_numpy(pad_cube).float(), requires_grad=False)
if not config.cpu:
emb1 = emb1.cuda()
emb2 = emb2.cuda()
labels = labels.cuda()
pad_cube = pad_cube.cuda()
return emb1, emb2, labels, pad_cube, cmp_labels

embedding, (train_set, dev_set, test_set) = data.load_dataset(config.dataset)
model = mod.VDPWIModel(embedding, config)
if config.restore:
model.load(config.input_file)
if not config.cpu:
model = model.cuda()

train_loader = utils.data.DataLoader(train_set, shuffle=True, batch_size=config.mbatch_size, collate_fn=collate_fn)
dev_loader = utils.data.DataLoader(dev_set, batch_size=1, collate_fn=collate_fn)
test_loader = utils.data.DataLoader(test_set, batch_size=1, collate_fn=collate_fn)

params = list(filter(lambda x: x.requires_grad, model.parameters()))
if config.optimizer == "adam":
optimizer = optim.Adam(params, lr=config.lr, weight_decay=config.weight_decay)
elif config.optimizer == "sgd":
optimizer = optim.SGD(params, lr=config.lr, momentum=config.momentum, weight_decay=config.weight_decay)
elif config.optimizer == "rmsprop":
optimizer = optim.RMSprop(params, lr=config.lr, alpha=config.decay, momentum=config.momentum, weight_decay=config.weight_decay)
criterion = nn.KLDivLoss()
log_writer = LogWriter()
return Context(model, train_loader, dev_loader, test_loader, optimizer, criterion, params, log_writer)

def test(config):
context = create_context(config)
result = evaluate(context, context.test_loader)
print("Final test result: {}".format(result))

def evaluate(context, data_loader):
model = context.model
model.eval()
predictions = []
true_labels = []
for sent1, sent2, _, pad_cube, truth in data_loader:
scores = model(sent1, sent2, pad_cube)
scores = F.softmax(scores).cpu().data.numpy()[0]
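        # Collapse the predicted class distribution into a scalar similarity score by taking its
        # expected value over the label values 1..n_labels.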
prediction = np.dot(np.arange(1, len(scores) + 1), scores)
predictions.append(prediction); true_labels.append(truth[0][0])

pearsonr = stats.pearsonr(predictions, true_labels)[0]
spearmanr = stats.spearmanr(predictions, true_labels)[0]
context.log_writer.log_dev_metrics(pearsonr, spearmanr)
return EvaluateResult(pearsonr, spearmanr)

def train(config):
context = create_context(config)
context.log_writer.log_hyperparams()
best_dev_pr = 0
for epoch_no in range(config.n_epochs):
print("Epoch number: {}".format(epoch_no + 1))
loader_wrapper = tqdm(context.train_loader, total=len(context.train_loader), desc="Loss")
context.model.train()
loss = 0
for sent1, sent2, label_pmf, pad_cube, _ in loader_wrapper:
context.optimizer.zero_grad()
scores = F.log_softmax(context.model(sent1, sent2, pad_cube))
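            # KLDivLoss expects log-probabilities as input; the target label_pmf is the sparse
            # similarity distribution read from sim_sparse.txt in data.load_sick.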

loss = context.criterion(scores, label_pmf)
loss.backward()
nn.utils.clip_grad_norm(context.params, config.clip_norm)
context.optimizer.step()

loss = loss.cpu().data[0]
loader_wrapper.set_description("Loss: {:<8}".format(round(loss, 5)))
context.log_writer.log_train_loss(loss)
result = evaluate(context, context.dev_loader)
print("Dev result: {}".format(result))
if best_dev_pr < result.pearsonr:
best_dev_pr = result.pearsonr
print("Saving best model...")
context.model.save(config.output_file)

def main():
config = data.Configs.base_config()
if config.mode == "train":
train(config)
elif config.mode == "test":
test(config)

if __name__ == "__main__":
main()
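
For reference, a hypothetical invocation of this entry point, assuming it is run from the repository root, that the SICK data and cached word vectors sit under the default local_data paths, and that the remaining files of this merge (e.g. model.py and utils/log.py, not shown above) are present:

    python vdpwi/__main__.py --mode train --optimizer rmsprop --n_epochs 35
    python vdpwi/__main__.py --mode test --restore --input_file local_saves/model.pt

All flags shown here are defined in vdpwi/data.py below.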
104 changes: 104 additions & 0 deletions vdpwi/data.py
@@ -0,0 +1,104 @@
import argparse
import os

import torch
import torch.nn as nn
import torch.utils.data as data

class Configs(object):
@staticmethod
def base_config():
parser = argparse.ArgumentParser()
parser.add_argument("--classifier", type=str, default="vdpwi", choices=["vdpwi", "resnet"])
parser.add_argument("--clip_norm", type=float, default=50)
parser.add_argument("--cpu", action="store_true", default=False)
parser.add_argument("--dataset", type=str, default="sick", choices=["sick"])
parser.add_argument("--decay", type=float, default=0.95)
parser.add_argument("--input_file", type=str, default="local_saves/model.pt")
parser.add_argument("--lr", type=float, default=5E-4)
parser.add_argument("--mbatch_size", type=int, default=16)
parser.add_argument("--mode", type=str, default="train", choices=["train", "test"])
parser.add_argument("--momentum", type=float, default=0.1)
parser.add_argument("--n_epochs", type=int, default=35)
parser.add_argument("--n_labels", type=int, default=5)
parser.add_argument("--optimizer", type=str, default="rmsprop", choices=["adam", "sgd", "rmsprop"])
parser.add_argument("--output_file", type=str, default="local_saves/model.pt")
parser.add_argument("--res_fmaps", type=int, default=32)
parser.add_argument("--res_layers", type=int, default=16)
parser.add_argument("--restore", action="store_true", default=False)
parser.add_argument("--rnn_hidden_dim", type=int, default=250)
parser.add_argument("--weight_decay", type=float, default=1E-5)
parser.add_argument("--wordvecs_file", type=str, default="local_data/glove/glove.840B.300d.txt")
return parser.parse_known_args()[0]

@staticmethod
def sick_config():
parser = argparse.ArgumentParser()
parser.add_argument("--n_labels", type=int, default=5)
parser.add_argument("--sick_cache", type=str, default="local_data/sick/.vec-cache")
parser.add_argument("--sick_data", type=str, default="local_data/sick")
return parser.parse_known_args()[0]

class LabeledEmbeddedDataset(data.Dataset):
def __init__(self, sentence_indices1, sentence_indices2, labels, compare_labels=None):
assert len(sentence_indices1) == len(labels) == len(sentence_indices2)
self.sentence_indices1 = sentence_indices1
self.sentence_indices2 = sentence_indices2
self.labels = labels
self.compare_labels = compare_labels

def __getitem__(self, idx):
cmp_lbl = None if self.compare_labels is None else self.compare_labels[idx]
return self.sentence_indices1[idx], self.sentence_indices2[idx], self.labels[idx], cmp_lbl

def __len__(self):
return len(self.labels)

def load_sick():
config = Configs.sick_config()
def fetch_indices(name):
sentence_indices = []
filename = os.path.join(config.sick_data, dataset, name)
with open(filename) as f:
for line in f:
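                # Tokens missing from the pretrained vocabulary map to -1 and are dropped below.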
indices = [embed_ids.get(word, -1) for word in line.strip().split()]
indices = list(filter(lambda x: x >= 0, indices))
sentence_indices.append(indices)
return sentence_indices

def read_labels(filename):
labels = []
with open(filename) as f:
for line in f:
labels.append([float(val) for val in line.split()])
return labels

sets = []
embeddings = []
embed_ids = {}
with open(os.path.join(config.sick_cache)) as f:
for i, line in enumerate(f):
word, vec = line.split(" ", 1)
vec = list(map(float, vec.strip().split()))
embed_ids[word] = i
embeddings.append(vec)
padding_idx = len(embeddings)
embeddings.append([0.0] * 300)
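    # The final all-zero row is the padding vector (index padding_idx); collate_fn in __main__.py pads with it.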

for dataset in ("train", "dev", "test"):
sparse_filename = os.path.join(config.sick_data, dataset, "sim_sparse.txt")
truth_filename = os.path.join(config.sick_data, dataset, "sim.txt")
sparse_labels = read_labels(sparse_filename)
cmp_labels = read_labels(truth_filename)
indices1 = fetch_indices("a.toks")
indices2 = fetch_indices("b.toks")
sets.append(LabeledEmbeddedDataset(indices1, indices2, sparse_labels, cmp_labels))
embedding = nn.Embedding(len(embeddings), 300)
embedding.weight.data.copy_(torch.Tensor(embeddings))
embedding.weight.requires_grad = False
return embedding, sets

def load_dataset(dataset):
return _loaders[dataset]()

_loaders = dict(sick=load_sick)
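
For reference, a minimal sketch of the consumer side, mirroring create_context in vdpwi/__main__.py above:

    import data
    # embedding is a frozen nn.Embedding built from the cached vectors plus one zero padding row;
    # each split is a LabeledEmbeddedDataset yielding (indices1, indices2, sparse_pmf, raw similarity label(s) from sim.txt).
    embedding, (train_set, dev_set, test_set) = data.load_dataset("sick")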