In [None]:
#hide
#colab
# attach gdrive holding repo
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#default_exp multi_core.callback    

# Multi Core Callback XLA Extensions

> Patches to Recorder and ParamScheduler Callbacks
to support Multi Core XLA Training

The existing `Recorder` and `ParamScheduler` callbacks are modified so that they store their extra attributes in a temporary file once multi-core TPU training, which runs in spawned processes, has finished.  

In [None]:
#hide
#colab
# install pytorch 1.7.1 b/c fastai doesn't support pytorch 1.8 just yet
!pip install -Uqq --no-cache-dir torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchtext==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html


[K     |████████████████████████████████| 735.4MB 1.1MB/s 
[K     |████████████████████████████████| 12.8MB 55.4MB/s 
[K     |████████████████████████████████| 7.0MB 4.7MB/s 
[?25h

In [None]:
#hide
#colab
!pip install -Uqq cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp37-cp37m-linux_x86_64.whl

[K     |████████████████████████████████| 133.6MB 80kB/s 
[K     |████████████████████████████████| 61kB 2.8MB/s 
[31mERROR: earthengine-api 0.1.254 has requirement google-api-python-client>=1.12.1, but you'll have google-api-python-client 1.8.0 which is incompatible.[0m
[?25h

In [None]:
#hide
#colab
!curl -s https://course19.fast.ai/setup/colab | bash

Updating fastai...
Done.


In [None]:
#hide
#colab
# !pip install -Uqq git+https://github.com/fastai/fastai.git 
!pip install -Uqq fastai==2.3.0

[K     |████████████████████████████████| 194kB 5.2MB/s 
[K     |████████████████████████████████| 61kB 6.9MB/s 
[?25h

In [None]:
#hide
#colab
!pip install -qqq nbdev

[K     |████████████████████████████████| 51kB 2.7MB/s 
[K     |████████████████████████████████| 51kB 3.6MB/s 
[?25h

In [None]:
#hide
#colab
# !pip install -Uqq git+https://github.com/butchland/fastai_xla_extensions.git

In [None]:
#hide
#colab
# !pip install -Uqq git+https://github.com/butchland/my_timesaver_utils.git

In [None]:
#hide
#colab
%cd /content
!ln -s /content/drive/MyDrive/fastai_xla_extensions  fastai_xla_extensions

/content


In [None]:
#hide
!pip freeze | grep torch
!pip freeze | grep fast
!pip freeze | grep timesaver
!pip freeze | grep nbdev

torch==1.7.1+cu101
torch-xla==1.7
torchsummary==1.5.1
torchtext==0.8.0
torchvision==0.8.2+cu101
fastai==2.2.7
fastcore==1.3.19
fastdtw==0.3.4
fastprogress==1.0.0
fastrelease==0.1.11
fastrlock==0.5
nbdev==1.1.13


In [None]:
# hide
# start of kernel

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
#colab
%cd /content/fastai_xla_extensions

/content/drive/MyDrive/fastai_xla_extensions


In [None]:
#exporti
from fastai_xla_extensions.utils import xla_imported
from fastai_xla_extensions.misc_utils import *
from fastai_xla_extensions.multi_core.base import *
# from fastai_xla_extensions.multi_core.learner import *



In [None]:
#hide
#colab
%cd /content

/content


In [None]:
#exporti
try:
    import torch_xla
except Exception:
    # torch_xla is optional: when it cannot be imported the notebook carries on
    # without it. Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt and SystemExit are not silently swallowed.
    pass

In [None]:
#exporti
if xla_imported():
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.xla_multiprocessing as xmp

In [None]:
#hide
#local
# fake out torch_xla modules if not running on xla supported envs
if not xla_imported():
    # replace torch xla modules with fake equivalents so that the rest of the
    # notebook can run in environments where torch_xla could not be imported
    from types import SimpleNamespace
    torch_xla = SimpleNamespace (
    )
    from typing import Union,BinaryIO
    import os
    import pickle
    import torch.cuda

    def fake_opt_step(opt,barrier=False):
        'Stand-in for `xm.optimizer_step`: steps `opt`; `barrier` is ignored'
        opt.step()

    def fake_device(n=None, devkind=None):
        'Stand-in for `xm.xla_device`: current CUDA device if available, else CPU'
        gpu_available = torch.cuda.is_available()
        if gpu_available:
            return torch.device(torch.cuda.current_device())
        return torch.device('cpu')

    def fake_save(obj, f: Union[str, os.PathLike, BinaryIO],
                master_only=True, global_master=False):
        'Stand-in for `xm.save`: plain `torch.save`; `master_only` and `global_master` are ignored'
        return torch.save(obj,f,pickle_module=pickle,
                        pickle_protocol=2,
                        _use_new_zipfile_serialization=True)

    def fake_rate():
        'Fixed dummy value returned by the fake rate tracker'
        return 230.20

    def fake_global_rate():
        'Fixed dummy value returned by the fake rate tracker'
        return 830.10

    def fake_add(*args,**kwargs):
        'Accept and discard anything passed to the fake rate tracker'
        pass

    def fake_RateTracker():
        'Stand-in for `xm.RateTracker`: namespace exposing `rate`, `global_rate` and `add`'
        return SimpleNamespace(
            rate = fake_rate,
            global_rate = fake_global_rate,
            add = fake_add
        )

    def fake_xrt_world_size():
        'Stand-in for `xm.xrt_world_size`: always 1'
        return 1

    def fake_get_ordinal():
        'Stand-in for `xm.get_ordinal`: always 0'
        return 0

    xm = SimpleNamespace(
        optimizer_step = fake_opt_step,
        xla_device = fake_device,
        save = fake_save,
        RateTracker = fake_RateTracker,
        master_print = print,
        xrt_world_size = fake_xrt_world_size,
        get_ordinal = fake_get_ordinal
    )

    def fake_metrics_report():
        'Stand-in for `met.metrics_report`: returns a fixed placeholder string'
        return "Fake Metrics Report \n\n\n\n"
    met = SimpleNamespace (
        metrics_report = fake_metrics_report
    )

    class FakeParallelLoader:
        'Stand-in for `pl.ParallelLoader`: hands back the wrapped loader unchanged'
        def __init__(self, loader, *args):
            self.loader = loader
        def per_device_loader(self,device):
            # `device` is ignored; the original loader is returned as is
            return self.loader

    pl = SimpleNamespace(
        ParallelLoader = FakeParallelLoader
    )

    def fake_MpModelWrapper(o):
        'Stand-in for `xmp.MpModelWrapper`: returns `o` unchanged'
        return o

    def fake_run(f,*args, **kwargs):
        'Call `f` directly with the given arguments'
        return f(*args,**kwargs)

    def fake_MpSerialExecutor():
        'Stand-in for `xmp.MpSerialExecutor`: namespace whose `run` calls the function directly'
        return SimpleNamespace(
            run = fake_run
        )

    def fake_spawn(f, args=None, nprocs=0, start_method=None):
        'Stand-in for `xmp.spawn`: calls `f` once in-process with index 0; `nprocs` and `start_method` are ignored'
        # `args` defaults to None, which cannot be star-unpacked: treat a
        # missing/empty `args` as "no extra positional arguments"
        return f(0,*(args or ()))

    xmp = SimpleNamespace (
        MpModelWrapper = fake_MpModelWrapper,
        MpSerialExecutor = fake_MpSerialExecutor,
        spawn = fake_spawn
    )

    xu = SimpleNamespace (
    )


In [None]:
#exporti
# from fastai.vision.all import *


In [None]:
#export
import torch
from fastcore.xtras import is_listy
def maybe_item(o):
    '''Convert tensors (bare, or nested inside lists/tuples/dicts) into plain
    Python values; anything that is not a tensor or such a container is
    returned unchanged.

    - one-element tensor -> Python scalar (via `.item()`)
    - any other tensor   -> (nested) Python list (via `.tolist()`)
    - listy container    -> same container class with converted items
    - dict               -> plain dict with converted values (keys untouched)
    '''
    if isinstance(o,torch.Tensor):
        # `.item()` raises for tensors that do not hold exactly one element,
        # so fall back to `.tolist()` for those instead of failing
        return o.item() if o.numel() == 1 else o.tolist()
    if isinstance(o,dict):
        return {k:maybe_item(v) for k,v in o.items()}
    if is_listy(o):
        kls = o.__class__
        items = [maybe_item(i) for i in o]
        # namedtuples take their fields positionally, not as a single iterable
        if isinstance(o,tuple) and hasattr(o,'_fields'): return kls(*items)
        return kls(items)
    # maybe scalar or object
    return o


In [None]:
from fastcore.test import *
t1 = torch.tensor(5.)
test_eq(maybe_item(t1), 5.)
test_eq(maybe_item(float(5)),5.)

Given a tensor, `maybe_item` converts it to a scalar. If the given value is not a tensor (e.g. it is already a scalar), it is returned unchanged.

In [None]:
from fastcore.test import *
from fastai.torch_core import tensor
tl1 = [tensor(2.)] * 5
test_eq(maybe_item(tl1), [2.] * 5)
dt1 = { 'd1': tensor(3.),
        'd2': [tensor(1.)] * 3}
df1 = { 'd1': 3.,
        'd2': [1.] * 3}
test_eq(maybe_item(dt1), df1)

`maybe_item` should also work for lists of tensors, and for dicts whose values
are tensors and/or lists of tensors.

In [None]:
#export
from fastai.learner import Recorder
from fastcore.basics import patch

@patch
def get_extra_attrs(self:Recorder):
    'Collect the state attrs of Recorder into a dict (suitable for pickling)'
    # Only names listed in `_stateattrs` that are actually set on this
    # instance are captured; each value is passed through `maybe_item`.
    return {name: maybe_item(getattr(self,name))
            for name in self._stateattrs
            if hasattr(self,name)}


In [None]:
#hide_input
show_doc(Recorder.get_extra_attrs)

<h4 id="Recorder.get_extra_attrs" class="doc_header"><code>Recorder.get_extra_attrs</code><a href="__main__.py#L5" class="source_link" style="float:right">[source]</a></h4>

> <code>Recorder.get_extra_attrs</code>()

Extract state attrs of Recorder into a dict (suitable for pickling)

In [None]:
#hide_output
from fastai.test_utils import *
learner = synth_learner()
learner.fit(5)


epoch,train_loss,valid_loss,time
0,11.862452,10.9613,00:18
1,10.377245,7.764143,00:00
2,8.70736,5.063774,00:00
3,7.145759,3.160653,00:00
4,5.798985,1.90736,00:00


In [None]:
#hide
# setup checks
# Sanity-check the recorder state produced by `learner.fit(5)` above.
assert hasattr(learner,'recorder')
# One lr and one loss entry per training batch: 5 epochs times what is
# presumably 10 training batches per epoch in `synth_learner` (TODO confirm).
assert len(learner.recorder.lrs)  == 5 * 10
assert len(learner.recorder.losses) == 5 * 10
# One `iters` and one `values` entry per epoch.
assert len(learner.recorder.iters) == 5
assert len(learner.recorder.values) == 5

In [None]:
extra_attrs = learner.recorder.get_extra_attrs()
test_eq(extra_attrs['lrs'], learner.recorder.lrs)
test_eq(extra_attrs['losses'], learner.recorder.losses)
test_eq(extra_attrs['iters'], learner.recorder.iters)
test_eq(extra_attrs['values'], learner.recorder.values)


`Recorder.get_extra_attrs` should copy the state attrs (`lrs`,`losses`,`iters` and `values`) into
a dict.

In [None]:
#export
import pickle
from fastai.learner import Recorder
from fastcore.basics import patch

@patch
def dump_attrs(self:Recorder, fn='_rec_attr.pkl'):
    'Pickle the dict returned by `get_extra_attrs` into the file `fn`'
    # Gather the state before opening `fn`, so a failure here creates no file
    state = self.get_extra_attrs()
    with open(fn,'wb') as out:
        pickle.dump(state,out)


In [None]:
#export
import pickle
from fastai.learner import Recorder
from fastcore.basics import patch
from pathlib import Path

@patch
def reload_attrs(self:Recorder, fn='_rec_attr.pkl'):
    'Set attrs pickled in file `fn` on this recorder, then delete `fn`'
    path = Path(fn) if isinstance(fn,str) else fn
    # Nothing to do (and nothing to delete) when the file does not exist
    if path.is_file():
        with open(path,'rb') as src:
            saved = pickle.load(src)
        for name,value in saved.items():
            setattr(self,name,value)
        # The file is consumed once its contents are restored
        path.unlink()

In [None]:
test_fn = 'test_rec_attrs.pkl'
!rm -f {test_fn}
learner.recorder.dump_attrs(fn=test_fn)
f = Path(test_fn)
assert f.is_file()


In [None]:
delattr(learner.recorder,'lrs')
delattr(learner.recorder,'losses')
delattr(learner.recorder,'iters')
delattr(learner.recorder,'values')
assert not hasattr(learner.recorder,'lrs')
assert not hasattr(learner.recorder,'losses')
assert not hasattr(learner.recorder,'iters')
assert not hasattr(learner.recorder,'values')


In [None]:

learner.recorder.reload_attrs(fn=test_fn)
assert hasattr(learner.recorder,'lrs')
assert hasattr(learner.recorder,'losses')
assert hasattr(learner.recorder,'iters')
assert hasattr(learner.recorder,'values')
!rm -f {test_fn}

In [None]:
#export
from fastai.learner import Recorder
from fastcore.basics import patch

@patch
def after_fit(self: Recorder):
    'After fit, dump the extra attrs to file when `learn.inner_xla` is set and `learn.xla_rank` is 0'
    learn = self.learn
    # `inner_xla` may not exist on the learner; treat a missing flag as False
    if not getattr(learn,'inner_xla',False):
        return
    if learn.xla_rank == 0:
        self.dump_attrs()


In [None]:
#export
from fastai.callback.schedule import ParamScheduler
from fastcore.basics import patch
from pathlib import Path
import pickle

@patch
def dump_hps(self:ParamScheduler, fn='_paramsched_hps.pkl'):
    'Pickle `hps` (passed through `maybe_item`) into file `fn`; no-op when `hps` is not set'
    if hasattr(self,'hps'):
        path = Path(fn) if isinstance(fn,str) else fn
        hps = maybe_item(self.hps)
        with open(path,'wb') as out:
            pickle.dump(hps,out)


In [None]:
#hide_input
show_doc(ParamScheduler.dump_hps)

<h4 id="ParamScheduler.dump_hps" class="doc_header"><code>ParamScheduler.dump_hps</code><a href="__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>ParamScheduler.dump_hps</code>(**`fn`**=*`'_paramsched_hps.pkl'`*)

dump `hps` to a file `fn`

In [None]:
#export
from fastai.learner import Recorder
from fastcore.basics import patch
from pathlib import Path

@patch
def reload_hps(self:Recorder, fn='_paramsched_hps.pkl'):
    'Load the pickled contents of file `fn` into `self.hps`, then delete `fn`'
    path = Path(fn) if isinstance(fn,str) else fn
    # Silently do nothing when there is no file to load
    if not path.is_file():
        return
    with open(path,'rb') as src:
        self.hps = pickle.load(src)
    path.unlink()

In [None]:
#export
from fastai.callback.schedule import ParamScheduler
from fastcore.basics import patch

@patch
def after_fit(self:ParamScheduler):
    "Copy `hps` to the recorder and, when `learn.inner_xla` is set and `learn.xla_rank` is 0, save them to file"
    # Nothing to copy or dump when no `hps` attribute is set
    if hasattr(self,'hps'):
        learn = self.learn
        if hasattr(learn,'recorder'):
            self.recorder.hps = self.hps
        if getattr(learn,'inner_xla',False) and learn.xla_rank == 0:
            self.dump_hps()


In [None]:
#hide_input
show_doc(ParamScheduler.after_fit)

<h4 id="ParamScheduler.after_fit" class="doc_header"><code>ParamScheduler.after_fit</code><a href="__main__.py#L5" class="source_link" style="float:right">[source]</a></h4>

> <code>ParamScheduler.after_fit</code>()

save hps to file

In [None]:
#hide_output
#colab
param_fn = '_paramsched_hps.pkl'
!rm -f {param_fn}
learner.inner_xla = True # simulate spawned process learner
learner.xla_rank = 0
learner.fit_one_cycle(3)

epoch,train_loss,valid_loss,time
0,1.703755,1.439683,00:00
1,1.424732,1.078809,00:00
2,1.253089,1.007215,00:00


In [None]:
#colab
param_f = Path(param_fn)
assert param_f.is_file()


In [None]:
#colab
delattr(learner.recorder,'hps')
assert not hasattr(learner.recorder,'hps')
learner.recorder.reload_hps()
assert hasattr(learner.recorder,'hps')
!rm -f {param_fn}
!rm -f _rec_attr.pkl

Test `ParamScheduler`: `fit_one_cycle` uses `ParamScheduler`, so running it should create a pickle file.

In [None]:
#colab
from fastcore.foundation import L
if 'progress' not in L(learner.cbs).attrgot('name'):
    learner.add_cbs(ProgressCallback)
learner.fit_one_cycle(5)

epoch,train_loss,valid_loss,time
0,0.928667,0.832751,00:00
1,0.785133,0.567121,00:00
2,0.662554,0.43365,00:00
3,0.572687,0.379,00:00
4,0.510633,0.367608,00:00


In [None]:
#colab
assert param_f.is_file()
rec_attr_f = Path('_rec_attr.pkl')
assert rec_attr_f.is_file()