In [1]:
import argparse
import time
import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim.lr_scheduler as lr_scheduler

import data
from model import RNNModel

from utils import batchify, get_batch, repackage_hidden


device = 'cuda' if torch.cuda.is_available else 'cpu'

In [2]:
from argparse import Namespace

In [3]:
args = Namespace(alpha=2, batch_size=20, beta=1, bptt=70, 
                 chunk_size=10, clip=0.25, cuda=True, 
                 data='data/penn/', dropout=0.45, dropoute=0.1, 
                 dropouth=0.3, dropouti=0.5, emsize=400, epochs=3, 
                 finetuning=500, log_interval=200, lr=30, model='LSTM', 
                 nhid=1150, nlayers=3, nonmono=5, optimizer='sgd', 
                 philly=False, resume='', save='15590356561927629.pt', 
                 seed=141, wdecay=1.2e-06, wdrop=0.45, when=[-1])

In [4]:
args.tied = True

# Set the random seed manually for reproducibility.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)

In [5]:

def model_save(fn):
    if args.philly:
        fn = os.path.join(os.environ['PT_OUTPUT_DIR'], fn)
    with open(fn, 'wb') as f:
        torch.save([model, criterion, optimizer], f)


def model_load(fn):
    global model, criterion, optimizer
    if args.philly:
        fn = os.path.join(os.environ['PT_OUTPUT_DIR'], fn)
    with open(fn, 'rb') as f:
        model, criterion, optimizer = torch.load(f)


import os
import hashlib

fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
if args.philly:
    fn = os.path.join(os.environ['PT_OUTPUT_DIR'], fn)
if os.path.exists(fn):
    print('Loading cached dataset...')
    corpus = torch.load(fn)
else:
    print('Producing dataset...')
    corpus = data.Corpus(args.data)
    torch.save(corpus, fn)

eval_batch_size = 10
test_batch_size = 1
train_data = batchify(corpus.train, args.batch_size, args)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)


Loading cached dataset...


In [6]:
from splitcross import SplitCrossEntropyLoss

criterion = None

ntokens = len(corpus.dictionary)
model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.chunk_size, args.nlayers,
                       args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
###
if args.resume:
    print('Resuming model ...')
    model_load(args.resume)
    optimizer.param_groups[0]['lr'] = args.lr
    model.dropouti, model.dropouth, model.dropout, args.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute
    if args.wdrop:
        for rnn in model.rnn.cells:
            rnn.hh.dropout = args.wdrop

In [7]:
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)

Using []


In [8]:
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
params = list(model.parameters()) + list(criterion.parameters())
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)


Args: Namespace(alpha=2, batch_size=20, beta=1, bptt=70, chunk_size=10, clip=0.25, cuda=True, data='data/penn/', dropout=0.45, dropoute=0.1, dropouth=0.3, dropouti=0.5, emsize=400, epochs=3, finetuning=500, log_interval=200, lr=30, model='LSTM', nhid=1150, nlayers=3, nonmono=5, optimizer='sgd', philly=False, resume='', save='15590356561927629.pt', seed=141, tied=True, wdecay=1.2e-06, wdrop=0.45, when=[-1])
Model total parameters: 25232180


In [9]:

def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)




# Loop over epochs.
lr = args.lr
best_val_loss = []
stored_loss = 100000000

In [10]:
###############################################################################
# Training code
###############################################################################

def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)

In [11]:
optimizer = None
# Ensure the optimizer is optimizing params, which includes both the model's weights as well as the criterion's weight (i.e. Adaptive Softmax)
if args.optimizer == 'sgd':
    optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
if args.optimizer == 'adam':
    optimizer = torch.optim.Adam(params, lr=args.lr, betas=(0, 0.999), eps=1e-9, weight_decay=args.wdecay)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', 0.5, patience=2, threshold=0)


In [12]:
for epoch in range(1, args.epochs + 1):
    epoch_start_time = time.time()

    
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN': model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    
    model = nn.DataParallel(model).to(device)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        # output, hidden = model(data, hidden, return_h=False)
        raw_loss = criterion(model.decoder.weight, model.decoder.bias, output, targets)

        loss = raw_loss
        # Activiation Regularization
        if args.alpha:
            loss = loss + sum(
                args.alpha * dropped_rnn_h.pow(2).mean()
                for dropped_rnn_h in dropped_rnn_hs[-1:]
            )
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(
                args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                for rnn_h in rnn_hs[-1:]
            )
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                              elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len


    

input True
weight False
bias True
inputinput True
weight  False
True
weight False
bias True
bias True
input True
weight False
bias True


RuntimeError: The size of tensor a (20) must match the size of tensor b (5) at non-singleton dimension 0

In [None]:
for n, p in model.named_parameters():
    print(n, p.is_cuda)

In [25]:
for n, m in model.module.rnn.named_modules():
    print(n, m._weights)

AttributeError: 'ONLSTMStack' object has no attribute '_weights'

In [44]:
model.module.rnn.cells[0].hh

LinearDropConnect(in_features=4600, out_features=4830, bias=True)

In [45]:
data.shape

torch.Size([67, 20])

In [14]:
x = LinearDropConnect(3, 5)

In [15]:
x._weight

Parameter containing:
tensor([[-0.2350,  0.3877, -0.1071],
        [ 0.1137,  0.5353,  0.1758],
        [-0.5153,  0.4023, -0.1565],
        [ 0.3421, -0.4718,  0.3676],
        [-0.0455, -0.3946,  0.1362]])

In [16]:
x.weight.masked_fill(mast)

NameError: name 'mast' is not defined

In [13]:
from ON_LSTM import ONLSTMStack

In [16]:
x = torch.Tensor(10, 10, 10).cuda()
x.data.normal_().cuda()
lstm = ONLSTMStack([10, 10, 10], chunk_size=10).cuda()
print(lstm(x, lstm.init_hidden(10))[1])

input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
input True
weight True
bias True
[(tensor([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,