In [11]:
from kedro.io import DataCatalog
from kedro.extras.datasets.pandas import CSVDataSet

In [12]:
context

<kedro.framework.context.context.KedroContext at 0x17b1127ff88>

## Set Up Data
#### Check whether Kedro can load the data correctly

In [13]:
%reload_kedro

# When you add new datasets to your catalog.yml file, you need to reload Kedro's session by running %reload_kedro in a cell.

2021-09-02 15:44:04,377 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2021-09-02 15:44:04,443 - root - INFO - ** Kedro project Kedro Tutorial
2021-09-02 15:44:04,445 - root - INFO - Defined global variable `context`, `session` and `catalog`
2021-09-02 15:44:04,461 - root - INFO - Registered line magic `run_viz`


In [16]:
# Load the "companies" dataset through the Kedro catalog and preview its first rows.
companies = catalog.load("companies")
companies.head()

2021-09-02 15:47:44,025 - kedro.io.data_catalog - INFO - Loading data from `companies` (CSVDataSet)...


Unnamed: 0,id,company_rating,company_location,total_fleet_count,iata_approved
0,35029,100%,Niue,4.0,f
1,30292,67%,Anguilla,6.0,f
2,19032,67%,Russian Federation,4.0,f
3,8238,91%,Barbados,15.0,t
4,30342,,Sao Tome and Principe,2.0,t


In [17]:
# Load the "shuttles" dataset through the Kedro catalog and preview its first rows.
shuttles = catalog.load("shuttles")
shuttles.head()

2021-09-02 15:47:45,457 - kedro.io.data_catalog - INFO - Loading data from `shuttles` (ExcelDataSet)...


  return pd.read_excel(fs_file, **self._load_args)


Unnamed: 0,id,shuttle_location,shuttle_type,engine_type,engine_vendor,engines,passenger_capacity,cancellation_policy,crew,d_check_complete,moon_clearance_complete,price,company_id
0,63561,Niue,Type V5,Quantum,ThetaBase Services,1.0,2,strict,1.0,f,f,"$1,325.0",35029
1,36260,Anguilla,Type V5,Quantum,ThetaBase Services,1.0,2,strict,1.0,t,f,"$1,780.0",30292
2,57015,Russian Federation,Type V5,Quantum,ThetaBase Services,1.0,2,moderate,0.0,f,f,"$1,715.0",19032
3,14035,Barbados,Type V5,Plasma,ThetaBase Services,3.0,6,strict,3.0,f,f,"$4,770.0",8238
4,10036,Sao Tome and Principe,Type V2,Plasma,ThetaBase Services,2.0,4,strict,2.0,f,f,"$2,820.0",30342


## Create a pipeline

#### 1. Create Node

1) Use a notebook to write the node functions\
2) Add the `node` tag to each cell that should become a node

3) Convert the `.ipynb` notebook to a `.py` file. Install the required tools first:
> pip install ipython #skip if already installed\
> pip install nbconvert  #skip if already installed

##### Command to convert:
```
jupyter nbconvert --to python nodes.ipynb
```

##### Alternative - How to convert notebook cells to nodes in a Kedro project
You can move notebook code over into a Kedro project structure using a mixture of [cell tagging](https://jupyter-notebook.readthedocs.io/en/stable/changelog.html#cell-tags) and Kedro CLI commands.

By adding the `node` tag to a cell and running the command below, the cell's source code will be copied over to a Python file within `src/<package_name>/nodes/`:

```
kedro jupyter convert <filepath_to_my_notebook>
```
> *Note:* The name of the Python file matches the name of the original notebook.

Alternatively, you may want to transform all your notebooks in one go. Run the following command to convert all notebook files found in the project root directory and under any of its sub-folders:

```
kedro jupyter convert --all
```

In [18]:
# Node functions

# Create a file src/kedro_tutorial/pipelines/data_processing/nodes.py, adding the subfolders too if necessary.

from kedro.pipeline import *
from kedro.io import *
from kedro.runner import *

import pickle
import os

import pandas as pd


def _is_true(x):
    return x == "t"


def _parse_percentage(x):
    x = x.str.replace("%", "")
    x = x.astype(float) / 100
    return x


def _parse_money(x):
    x = x.str.replace("$", "").str.replace(",", "")
    x = x.astype(float)
    return x


def preprocess_companies(companies: pd.DataFrame) -> pd.DataFrame:
    """Preprocesses the data for companies.

    The input frame is not modified: the converted columns are written to a
    new DataFrame, so the function can be called again on the same raw frame
    (for example when a notebook cell is re-run).

    Args:
        companies: Raw data.
    Returns:
        Preprocessed data, with `company_rating` converted to a float and
        `iata_approved` converted to boolean.
    """
    # ``assign`` returns a new frame, leaving the caller's ``companies`` as loaded.
    return companies.assign(
        iata_approved=_is_true(companies["iata_approved"]),
        company_rating=_parse_percentage(companies["company_rating"]),
    )


def preprocess_shuttles(shuttles: pd.DataFrame) -> pd.DataFrame:
    """Preprocesses the data for shuttles.

    The input frame is not modified: the converted columns are written to a
    new DataFrame, so the function can be called again on the same raw frame
    (for example when a notebook cell is re-run).

    Args:
        shuttles: Raw data.
    Returns:
        Preprocessed data, with `price` converted to a float and `d_check_complete`,
        `moon_clearance_complete` converted to boolean.
    """
    # ``assign`` returns a new frame, leaving the caller's ``shuttles`` as loaded.
    return shuttles.assign(
        d_check_complete=_is_true(shuttles["d_check_complete"]),
        moon_clearance_complete=_is_true(shuttles["moon_clearance_complete"]),
        price=_parse_money(shuttles["price"]),
    )