In [1]:
import torch
import numpy as np
import pandas as pd
import os
import h5py
from exabiome.nn.loader import read_dataset, LazySeqDataset
import argparse
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from model import *
from data import *
from fastai.text.all import *

In [2]:
hparams = argparse.Namespace(**{'load': False,
                                'window': 4096,
                                'step': 4096,
                                'classify': True,
                                'tgt_tax_lvl': "phylum",
                                'fwd_only': True})

In [3]:
def get_toy_dl(hparams, batch_size=16):
    path = '/global/homes/a/azaidi/ar122_r202.toy.input.h5'
    chunks = LazySeqDataset(hparams, path=path,
                           keep_open=True)
    ds = taxon_ds(chunks, old_pad_seq)
    return DataLoader(ds, batch_size=batch_size, 
                      shuffle=True), ds

In [4]:
path = '/global/homes/a/azaidi/ar122_r202.toy.input.h5'
chunks = LazySeqDataset(hparams, path=path,
                           keep_open=True)
ds = taxon_ds(chunks, old_pad_seq)

In [6]:
#xs = [ds[x][0] for x in range(len(ds))]
#ys = [ds[x][1] for x in range(len(ds))]
#torch.stack(xs).shape, torch.stack(ys).shape

In [None]:
x_df = pd.DataFrame(torch.stack(xs).squeeze(1))
y_df = pd.DataFrame(torch.stack(ys))
x_df.shape, y_df.shape

In [53]:
torch.Tensor(x_df.iloc[0].values)

tensor([1., 1., 9.,  ..., 0., 1., 1.])

In [54]:
torch.Tensor(y_df.iloc[0].values)

tensor([15.])

In [126]:
y_df.iloc[0].values.item()

15

In [178]:
torch.Tensor(y_df.iloc[0])

tensor([15.])

In [179]:
class fast_ds(Dataset):
    def __init__(self, x_df, y_df):
        self.x = x_df
        self.y = y_df
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = torch.Tensor(self.x.iloc[idx].values)
        y = torch.Tensor(self.y.iloc[idx])
        return x.unsqueeze(0), y.to(torch.long)#.squeeze(1)

In [180]:
ds = fast_ds(x_df, y_df)
len(ds)

19010

In [181]:
ds[0],ds[0][0].dtype, ds[0][1].dtype

((tensor([[1., 1., 9.,  ..., 0., 1., 1.]]), tensor([15])),
 torch.float32,
 torch.int64)

In [155]:
ds[0][0].shape, ds[0][1].shape

(torch.Size([1, 4096]), torch.Size([1]))

In [172]:
ds[0]

(tensor([[1., 1., 9.,  ..., 0., 1., 1.]]), tensor(15))

In [188]:
dl = DataLoader(ds, batch_size=128, shuffle=True)
len(dl)

149

In [189]:
batch = next(iter(dl))
batch[0].shape, batch[1].shape

(torch.Size([128, 1, 4096]), torch.Size([128, 1]))

In [193]:
out = model(batch[0])
out.shape

torch.Size([128, 1, 18])

In [195]:
nn.CrossEntropyLoss()(out.squeeze(1), batch[1].squeeze(1))

tensor(3.0868, grad_fn=<NllLossBackward>)

In [7]:
dl,ds = get_toy_dl(hparams, batch_size=128)
batch = next(iter(dl))
len(dl), batch[0].shape, batch[1].shape

(149, torch.Size([128, 1, 4096]), torch.Size([128]))

In [8]:
ds[0],ds[0][0].dtype, ds[0][1].dtype

((tensor([[1., 1., 9.,  ..., 0., 1., 1.]]), tensor(15)),
 torch.float32,
 torch.int64)

In [None]:
ds[0], ds[0][0].dtype, ds[0][1].dtype

In [9]:
def get_model():
    model = nn.Sequential(
        get_base_layer(),
        get_dep_sep(32,16),
        get_inv_res(16, 12),
        get_head_layer(12, 1,
                    lin_out_feats=18))
    return model

In [10]:
model = get_model()

In [11]:
dls = DataLoaders(dl, dl)

As a reminder, we're just playing around here -- a proper validation set should be used and not just the same dataloader as the training set :)

In [8]:
learn = Learner(dls, model, loss_func=nn.CrossEntropyLoss())

Trying to train this model will not work -- out of the box -- this can be confirmed by uncommenting the line below

In [9]:
#learn.fit(1)

We'll need to utilize fast.ai's callback system in order to make sure out training loop includes that dimension reduction with the squeeze function, on our models output -- before it get's fed into our loss function. You can find out more about callbacks here: https://docs.fast.ai/callback.core.html

In [12]:
class compress_cb(Callback):
    #def before_batch(self):
    #    self.learn.yb = self.y.squeeze(1)
    def after_pred(self):
        #print(self.learn.yb.shape)
        self.learn.pred = self.pred.squeeze(1)
        #self.learn.yb = self.yb[0].squeeze(1)
        #print(self.learn.yb.shape)

In [None]:
dls.to('cuda');
model.to('cuda');

In [13]:
learn = Learner(dls, model, loss_func=nn.CrossEntropyLoss(),
               cbs=[compress_cb], metrics=[accuracy])#.to_fp16()

In [28]:
torch.optim.Adam(model.params)

AttributeError: 'Sequential' object has no attribute 'params'

In [227]:
learn.lr_find()

TypeError: forward() takes 3 positional arguments but 130 were given

In [109]:
learn.fit_one_cycle(2, 1e-1)

epoch,train_loss,valid_loss,accuracy,time


AttributeError: 'tuple' object has no attribute 'squeeze'

That was super easy! With Fast.AI we can access useful things like the learning rate finder: https://fastai1.fast.ai/callbacks.lr_finder.html: https://arxiv.org/abs/1506.01186

We also get a ton of other useful functionality, like tables for our results + tools to inspect/diagnose how well our model is doing + easy to use techniques to train our model -- for example: <br>
(1) using mixed precision training only requires the ".to_fp16()" as seen in our Learner call<br>
(2) One cycle training is implemented in our fit call above

In [79]:
#gpu is clearly available
#!nvidia-smi

Need to look into why library is having trouble assigning everything to gpu automatically -- perhaps bc using cluster?