In [9]:
import json
import nbformat as nbf

In [10]:
import os
import inspect

In [11]:
# Load the project's `parameters` entry from the data catalog.
# `context` is not defined anywhere in this notebook — presumably it is injected
# by the Kedro Jupyter/IPython session; confirm before running on a fresh kernel.
parameters = context.catalog.load('parameters')

2020-12-25 14:19:39,666 - kedro.io.data_catalog - INFO - Loading data from `parameters` (MemoryDataSet)...


In [12]:
def create_notebook(cells, name):
    """Build a v4 notebook from ``cells`` and write it to ``name``.

    Args:
        cells: Iterable of cells. Plain strings are wrapped as code cells;
            any other object is placed in the notebook unchanged.
        name: Destination passed to ``nbf.write``.
    """
    notebook = nbf.v4.new_notebook()

    prepared_cells = []
    for cell in cells:
        if isinstance(cell, str):
            # Bare source text is shorthand for a code cell.
            cell = nbf.v4.new_code_cell(cell)
        prepared_cells.append(cell)

    notebook['cells'] = prepared_cells
    nbf.write(notebook, name)

In [13]:
def extract_imports(text):
    """Return the top-level import statements of ``text`` that are safe to copy.

    A line is kept when it starts (at column 0) with ``import `` or ``from ``
    and does not mention a relative import (``from .``), ``kedro`` or
    ``cassava``. The exclusions apply to both ``import`` and ``from`` lines.

    Args:
        text: Source code of a Python module.

    Returns:
        List of matching lines, stripped of surrounding whitespace, in file order.

    Note:
        Matching is line based: only the first line of a parenthesised,
        multi-line import is returned.
    """
    excluded_markers = ('from .', 'kedro', 'cassava')

    imports = []
    for line in text.split('\n'):
        # The trailing space stops names such as ``importlib`` or ``from_x``
        # from being mistaken for import statements.
        if not line.startswith(('import ', 'from ')):
            continue
        if any(marker in line for marker in excluded_markers):
            continue
        imports.append(line.strip())
    return imports

def get_imports(path='../src/cassava/pipelines'):
    """Collect the import lines used by the pipeline modules under ``path``.

    Every immediate sub-directory of ``path`` whose name does not start with
    ``_`` is scanned (non-recursively) for ``*.py`` files, and each file's
    text is passed to ``extract_imports``.

    Args:
        path: Directory containing one sub-directory per pipeline. Defaults
            to the project's pipelines directory relative to this notebook.

    Returns:
        The distinct import lines joined with newlines. They are sorted so
        that the generated cell is identical from run to run.
    """
    collected = set()
    for dirname in os.listdir(path):
        package_dir = os.path.join(path, dirname)
        if dirname.startswith('_') or not os.path.isdir(package_dir):
            continue

        for fname in os.listdir(package_dir):
            if not fname.endswith('.py'):
                continue
            with open(os.path.join(package_dir, fname)) as f:
                collected.update(extract_imports(f.read()))

    # A set has no stable order across interpreter runs; sort for reproducibility.
    return '\n'.join(sorted(collected))
            
            
def get_helper_files_cells(target_files):
    """Turn project helper modules into code cells, one cell per file.

    Each file is read from ``../src/cassava/`` and lines that start with
    ``from cassava`` are dropped. The remaining text is prefixed with a
    ``# file <name>`` header.

    Args:
        target_files: File names relative to ``../src/cassava/``.

    Returns:
        List of code cells in the same order as ``target_files``.
    """
    base_dir = '../src/cassava/'

    def _file_to_cell(fname):
        # Read the module and keep everything except project-internal imports.
        with open(os.path.join(base_dir, fname)) as source:
            kept_text = ''.join(
                row for row in source if not row.startswith('from cassava')
            )
        return nbf.v4.new_code_cell(f'# file {fname}\n\n{kept_text}')

    return [_file_to_cell(fname) for fname in target_files]

    
def get_pipeline_def_cell(pipeline, name):
    """Create one code cell holding the source of every node function.

    Args:
        pipeline: Object exposing ``nodes``; each node's ``_func`` is passed
            to ``inspect.getsource``.
        name: Pipeline name, written into the ``#Pipeline <name>`` header line.

    Returns:
        A code cell with the header followed by each function's source,
        separated by blank lines.
    """
    sections = [f"#Pipeline {name}"]
    sections.extend(inspect.getsource(node._func) for node in pipeline.nodes)
    return nbf.v4.new_code_cell("\n\n".join(sections))

def get_pipeline_execution_cells(pipeline):
    """Create one code cell per node that calls the node's function.

    Each cell has the form ``out1, out2 = func(in1, in2)``, or just
    ``func(in1, in2)`` when the node has no outputs. Dataset names from
    ``node.inputs`` / ``node.outputs`` are used verbatim as variable names.

    Args:
        pipeline: Object whose ``_topo_sorted_nodes`` is an iterable of node
            groups (presumably Kedro's topologically sorted groups — confirm).

    Returns:
        List of code cells, group by group.

    Note:
        The groups are only read, never modified, so calling this function
        repeatedly on the same pipeline yields the same cells.
    """
    cells = []
    for nodeset in pipeline._topo_sorted_nodes:
        # Walk a reversed snapshot instead of pop()-ing from the pipeline's own
        # container: pop() would empty the pipeline's internal state. For list
        # groups this yields the same last-to-first order that pop() gives.
        for node in reversed(list(nodeset)):
            arguments = ", ".join(node.inputs)
            node_code = f"{node._func_name}({arguments})"
            if node.outputs:
                targets = ", ".join(node.outputs)
                node_code = f"{targets} = {node_code}"
            cells.append(nbf.v4.new_code_cell(node_code))
    return cells
        
    
def get_notebook_cells(parameters, 
                       pipelines,
                       initial_cells, 
                       extra_cells, 
                       final_cells,
                       file_list):
    """Assemble the ordered list of cells for the generated notebook.

    Layout: imports, ``initial_cells``, a "Functions" section (helper files
    followed by pipeline function definitions), a "Parameters" section
    (parameters cell followed by ``extra_cells``), an "Execution" section
    (calls for the ``__submit__`` pipeline) and finally ``final_cells``.

    Args:
        parameters: Project parameters. Values that JSON cannot represent are
            converted with ``str``.
        pipelines: Mapping of pipeline name to pipeline. Function definitions
            are emitted for every pipeline whose name does not start with
            ``_``; execution cells only for the one named ``__submit__``.
        initial_cells: Cells placed right after the imports cell.
        extra_cells: Cells placed right after the parameters cell.
        final_cells: Cells placed at the very end.
        file_list: Helper files forwarded to ``get_helper_files_cells``.

    Returns:
        List of cells (strings and/or notebook cell objects, as supplied).
    """
    import pprint  # local import: only needed to render the parameters literal

    imports_cell = nbf.v4.new_code_cell(get_imports())

    helper_files_cells = get_helper_files_cells(file_list)

    # The cell is executed as Python, so raw JSON text is not good enough:
    # ``true``/``false``/``null`` are not Python names. Round-trip through JSON
    # to reduce everything to plain dict/list/str/number/bool/None values,
    # then render those as a Python literal.
    plain_parameters = json.loads(json.dumps(parameters, default=str))
    parameters_cell = nbf.v4.new_code_cell(
        f"parameters = {pprint.pformat(plain_parameters)}"
    )

    pipeline_def_cells = [
        get_pipeline_def_cell(pipeline, name)
        for name, pipeline in pipelines.items()
        if not name.startswith('_')
    ]

    pipeline_exec_cells = []
    for name, pipeline in pipelines.items():
        if name == '__submit__':
            pipeline_exec_cells += get_pipeline_execution_cells(pipeline)

    return [
        imports_cell,
        *initial_cells,
        nbf.v4.new_markdown_cell("# Functions"),
        *helper_files_cells,
        *pipeline_def_cells,
        nbf.v4.new_markdown_cell("# Parameters"),
        parameters_cell,
        *extra_cells,
        nbf.v4.new_markdown_cell("# Execution"),
        *pipeline_exec_cells,
        *final_cells,
    ]

In [20]:
# Shell commands for the generated notebook: list the pretrained-weights input
# and copy the EfficientNet-B0 checkpoint into the torch hub checkpoint cache.
models_cell = """
!ls /kaggle/input/timm-pretrained-efficientnet
!mkdir -p /root/.cache/torch/hub/checkpoints/
!cp /kaggle/input/timm-pretrained-efficientnet/efficientnet/efficientnet_b0_ra-3dd342df.pth /root/.cache/torch/hub/checkpoints/efficientnet_b0_ra-3dd342df.pth
"""

# Installs the timm wheel from a Kaggle input directory.
installs_cell = """
!pip install /kaggle/input/timm-package/timm-0.1.26-py3-none-any.whl
"""

# Cells placed right after the imports cell of the generated notebook, in order:
# the matplotlib magic, logging setup, model-weights setup, package install.
initial_cells = [
    '%matplotlib inline',
    """
import logging
import sys
logging.getLogger().addHandler(logging.StreamHandler())
""",
    models_cell,
    installs_cell,
]

# Code cell for the generated notebook. Its source defines the Kaggle input
# directories and the variables built from them: the CSV tables, a
# CassavaDataset over the test images, the `submission` frame and the per-fold
# model checkpoint paths in `cv_results`.
# The source uses `pd`, `os` and `CassavaDataset` without importing them —
# presumably they come from the generated imports cell and the inlined helper
# files; confirm.
# NOTE(review): label_num_to_disease_map.json is read with pd.read_csv — confirm
# this is intended rather than a JSON reader.
data_cell = nbf.v4.new_code_cell("""
DATA_DIR = '/kaggle/input/cassava-leaf-disease-classification'
MODELS_DIR = '/kaggle/input/cassava-models'

train_labels = pd.read_csv(f'{DATA_DIR}/train.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
label_num_to_disease_map = pd.read_csv(f'{DATA_DIR}/label_num_to_disease_map.json')

test_images_torch_2019 = CassavaDataset(image_ids=sample_submission.image_id.values, labels=sample_submission.label.values, root=f'{DATA_DIR}/test_images')

submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

cv_results = {
    'fold_0': {
        'model_path': os.path.join(MODELS_DIR, 'model_fold_0.pt')
    },
    'fold_1': {
        'model_path': os.path.join(MODELS_DIR, 'model_fold_1.pt')
    },
    'fold_2': {
        'model_path': os.path.join(MODELS_DIR, 'model_fold_2.pt')
    },
    'fold_3': {
        'model_path': os.path.join(MODELS_DIR, 'model_fold_3.pt')
    }
}
""")



# Closing cell of the generated notebook: writes `submission` to submission.csv.
final_cells = [nbf.v4.new_code_cell("submission.to_csv('submission.csv', index=False)")]

In [21]:
# Helper modules whose source is inlined into the generated notebook, in this
# order; the names are passed to get_notebook_cells as `file_list`.
file_list = ['transforms.py', 'utils.py', 'models/model.py', 'models/byol.py', 'node_helpers.py']

In [23]:
# Assemble every cell for the submission and write them to submission.ipynb.
create_notebook(
    cells=get_notebook_cells(
        parameters=parameters,
        pipelines=context.pipelines,
        initial_cells=initial_cells,
        extra_cells=[data_cell],
        final_cells=final_cells,
        file_list=file_list,
    ),
    name='submission.ipynb',
)