# YAML demo

In [1]:
import sys
sys.executable

'/Users/mfedell/.local/share/virtualenvs/yaml-YhfRMv2F/bin/python'

In [2]:
import yaml
import pandas as pd

from src import data_cleaning

## Configuration file for importing

In [3]:
!cat config/config.yaml

data_cleaning:
  subset_data:
    columns:
      - name
      - grade
  clean_missing:
    value: 
      name: unknown
      grade: -1
  reorder:
    by: name
    ascending: False
  description: some long description can go here
  tags:
    - tag1
    - tag2
    - tag3    


## Load `config.yaml`

In [4]:
# Parse the YAML config into nested Python dicts and lists.
# `safe_load` is used rather than `load` so that only plain data types are
# constructed from the file. The encoding is passed explicitly so the file is
# decoded the same way regardless of the platform's default locale.
with open("config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [5]:
# Display the parsed result: YAML mappings became dicts and YAML lists became Python lists.
config

{'data_cleaning': {'subset_data': {'columns': ['name', 'grade']},
  'clean_missing': {'value': {'name': 'unknown', 'grade': -1}},
  'reorder': {'by': 'name', 'ascending': False},
  'description': 'some long description can go here',
  'tags': ['tag1', 'tag2', 'tag3']}}

## Get configurations to use for `pyfileA` (here, `data_cleaning`)

In [6]:
# Pull out the top-level `data_cleaning` section; its keys (`subset_data`,
# `clean_missing`, `reorder`) are used below as per-function settings.
config_data_cleaning = config["data_cleaning"]
config_data_cleaning

{'subset_data': {'columns': ['name', 'grade']},
 'clean_missing': {'value': {'name': 'unknown', 'grade': -1}},
 'reorder': {'by': 'name', 'ascending': False},
 'description': 'some long description can go here',
 'tags': ['tag1', 'tag2', 'tag3']}

## Get configuration for `functionA` within `pyfileA` (here, `subset_data` in `data_cleaning`)

In [7]:
# Settings for a single function: this dict is later expanded with `**` into
# keyword arguments for `data_cleaning.subset_data`.
config_data_cleaning["subset_data"]

{'columns': ['name', 'grade']}

## Create test dataframe

In [8]:
# Small test frame with some missing values (None) in two of the columns.
data = {
    "name": ["Alice", "Bob", "Carly"],
    "classification": ["Junior", None, "Senior"],
    # None in a numeric column is stored as NaN, so `grade` is displayed as float (99.0).
    "grade": [99, None, None],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,name,classification,grade
0,Alice,Junior,99.0
1,Bob,,
2,Carly,Senior,


## Use dataframe and configurations as input to `subset_data`

In [9]:
# Baseline: pass the column list by hand. The result is only displayed, not
# assigned, so `df` still holds all three columns after this cell.
data_cleaning.subset_data(df, columns=['name', 'grade'])

Unnamed: 0,name,grade
0,Alice,99.0
1,Bob,
2,Carly,


In [10]:
# Same call driven by the config: `**` expands {'columns': ['name', 'grade']}
# into the keyword argument columns=['name', 'grade'].
# Note that `df` is rebound here, so the `classification` column is dropped from now on.
df = data_cleaning.subset_data(df, **config_data_cleaning["subset_data"])
df

Unnamed: 0,name,grade
0,Alice,99.0
1,Bob,
2,Carly,


## Use dataframe and configurations as input to `clean_missing`

In [11]:
# Inspect the keyword arguments that will be passed to `clean_missing`.
config_data_cleaning["clean_missing"]

{'value': {'name': 'unknown', 'grade': -1}}

In [12]:
# `**` expands the config into value={'name': 'unknown', 'grade': -1}.
# In the output, the missing grades are shown as -1.0; `name` had no missing entries.
df = data_cleaning.clean_missing(df, **config_data_cleaning["clean_missing"])
df

Unnamed: 0,name,grade
0,Alice,99.0
1,Bob,-1.0
2,Carly,-1.0


## Use dataframe and configurations as input to `reorder`

In [13]:
# `**` expands the config into by='name', ascending=False.
# The output rows come back in reverse alphabetical order of `name`, keeping their original index labels.
df = data_cleaning.reorder(df, **config_data_cleaning["reorder"])
df

Unnamed: 0,name,grade
2,Carly,-1.0
1,Bob,-1.0
0,Alice,99.0


### Quick note on `*` and `**` notation in python

Python offers two special ways to "expand" certain objects.

A single `*` can be prepended to an iterable (tuple, list, etc.) to "unpack" its elements, for example into separate positional arguments of a function call.

A double `**` can be prepended to a dict to "expand" the dict into key-value pairs passed as keyword arguments to a function.

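On the other side, you can define `*args` and `**kwargs` in a function's signature to allow callers to pass in arbitrary arguments. `*args` collects any extra positional arguments into a tuple and `**kwargs` collects any extra keyword arguments into a dict, after all explicitly named parameters have been filled.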

In [14]:
def foo(*args):
    """Print how many positional arguments were received, then each one on its own line.

    Args:
        *args: Any number of positional arguments; each is rendered with ``str``.
    """
    count = len(args)
    # The header string ends with a newline, so a blank line follows it.
    print(f"{count} args were passed\n")
    rendered = [str(item) for item in args]
    print("\n".join(rendered))

def bar(**kwargs):
    """Print how many keyword arguments were received, then each as ``name: value``.

    Args:
        **kwargs: Any number of keyword arguments, printed in the order given.
    """
    total = len(kwargs)
    print(f"{total} kwargs were passed")
    pairs = [f"{key}: {value}" for key, value in kwargs.items()]
    print("\n".join(pairs))

In [15]:
arguments = [1, 2, 3]
# Without `*`, the whole list arrives as a single positional argument.
foo(arguments)
print("\n" + "=" * 20 + "\n")
# With `*`, each element of the list becomes its own positional argument.
foo(*arguments)

1 args were passed

[1, 2, 3]


3 args were passed

1
2
3


In [16]:
# Without `*`, the list is placed inside the new list as one nested element.
[arguments]

[[1, 2, 3]]

In [17]:
# With `*`, the elements are unpacked into the new list, giving a flat copy.
[*arguments]

[1, 2, 3]

In [18]:
params = {"a": 1, "b": 2}
# The dict itself is passed as one positional argument.
foo(params)
print("\n" + "=" * 20 + "\n")
# A single `*` iterates the dict, which yields its keys: foo receives "a" and "b".
foo(*params)
print("\n" + "=" * 20 + "\n")
# A double `**` passes each key-value pair as a keyword argument: bar(a=1, b=2).
bar(**params)

1 args were passed

{'a': 1, 'b': 2}


2 args were passed

a
b


2 kwargs were passed
a: 1
b: 2


In [19]:
# Iterating a dict yields its keys, which is why `foo(*params)` received "a" and "b".
list(params)

['a', 'b']