### Data Handling

In [1]:
import glob
import shutil
import os

import pandas as pd

In [2]:
# preparations
folder_in = './raw'
folder_out = folder_in.replace('raw', 'processed')
name_pattern = '*.csv'

os.makedirs(folder_in, exist_ok=True)
shutil.rmtree(folder_out, ignore_errors=True)
os.makedirs(folder_out, exist_ok=True)

In [3]:
# fake up some data
df = pd.DataFrame({'a': [0, 1], 'b': [10, 11]})

names = ['file1', 'file2']

for i, name in enumerate(names):
    df_i = df.copy() + i
    path = os.path.join(folder_in, name_pattern.replace('*', name))
    df_i.to_csv(path)

In [4]:
def extract_key(path):
    return os.path.splitext(os.path.basename(path))[0]

In [5]:
def create_path_mapping(folder_in, folder_out, name_pattern):
    
    path_mapping = {}
    paths_in = glob.glob(os.path.join(folder_in, name_pattern))
    for path_in in paths_in:
        key = extract_key(path_in)
        path_out = os.path.join(folder_out, os.path.basename(path_in))
        path_mapping[key] = {'in': path_in, 'out': path_out}

    return path_mapping

#### <font color="red">How to map the data?</font>

In [6]:
path_mapping = create_path_mapping(folder_in, folder_out, name_pattern)
path_mapping

{'file2': {'in': './raw/file2.csv', 'out': './processed/file2.csv'},
 'file1': {'in': './raw/file1.csv', 'out': './processed/file1.csv'}}

#### <font color="red">How to load the data?</font>

In [7]:
def load(path):
    return pd.read_csv(path, index_col=0)    

#### <font color="red">How to write the data?</font>

In [8]:
def write(df, path):
    df.to_csv(path)

### Provide a simple, easy-to-understand data interface
... by glueing above elements together

In [9]:
from collections.abc import MutableMapping

In [10]:
class DataMapping(MutableMapping):
    """
    Docstring
    """
    def __init__(self, path_mapping, load, write):
        self.path_mapping = path_mapping
        self.load = load
        self.write = write
    def __getitem__(self, key):
        return self.load(self.path_mapping[key]['in'])
    def __setitem__(self, key, value):
        self.write(value, self.path_mapping[key]['out'])
    def __len__(self):
        return len(self.path_mapping)
    def __iter__(self):
        # For mappings, it should iterate over the keys of the container.
        return iter(self.path_mapping)
    def __delitem__(self):
        pass
    def __repr__(self):
        return f'{type(self).__name__}({self.path_mapping})'

In [11]:
dm = DataMapping(path_mapping, load, write)
dm

DataMapping({'file2': {'in': './raw/file2.csv', 'out': './processed/file2.csv'}, 'file1': {'in': './raw/file1.csv', 'out': './processed/file1.csv'}})

In [12]:
list(dm.keys())

['file2', 'file1']

In [13]:
for key, df in dm.items():
    df_new = 42 * df
    dm[key] = df_new