# Форматы данных (2)

Материалы:
* Макрушин С.В. "Лекция 5: Форматы данных (часть 2)"
* https://docs.python.org/3/library/csv.html
* https://docs.h5py.org/en/stable/
* Уэс Маккини. Python и анализ данных

In [220]:
import csv
import datetime
import json
import pprint
from collections import defaultdict
from operator import getitem
from pathlib import Path
from timeit import default_timer

import h5py
import numpy as np
import pandas as pd

In [221]:
def pprint_dict_of_lists(dct: dict, n=3, m=10):
    pprint.pprint({k: [*v[:m], '...'] for k, v in list(dct.items())[:n]}, sort_dicts=False)

In [222]:
DATA_DIR = Path('data/')
OUTPUT_DIR = DATA_DIR.joinpath('output/')

In [223]:
## Задачи для совместного разбора

1. Считайте данные из файла `open_pubs.csv`, используя `csv.reader`, и преобразуйте к структуре данных следующего вида:
    
`{'fas_id': [24, 30, ...], 'name': ['Achor Inn', 'Angel Inn', ...], ... }`


In [224]:
with open(DATA_DIR.joinpath('open_pubs.csv')) as f:
    reader = csv.reader(f)

    headers = next(reader)
    n = len(headers)
    data = {header: [] for header in headers}

    for row in reader:
        for i in range(n):
            data[headers[i]].append(row[i])

pprint_dict_of_lists(data, n=len(data), m=6)

{'fas_id': ['24', '30', '63', '64', '65', '85', '...'],
 'name': ['Anchor Inn',
          'Angel Inn',
          'Black Boy Hotel',
          'Black Horse',
          'Black Lion',
          'Bristol Arms',
          '...'],
 'address': ['Upper Street, Stratford St Mary, COLCHESTER, Essex',
             'Egremont Street, Glemsford, SUDBURY, Suffolk',
             '7 Market Hill, SUDBURY, Suffolk',
             'Lower Street, Stratford St Mary, COLCHESTER, Essex',
             'Lion Road, Glemsford, SUDBURY, Suffolk',
             'Bristol Hill, Shotley, IPSWICH, Suffolk',
             '...'],
 'postcode': ['CO7 6LW',
              'CO10 7SA',
              'CO10 2EA',
              'CO7 6JS',
              'CO10 7RF',
              'IP9 1PU',
              '...'],
 'easting': ['604748', '582888', '587356', '604270', '582750', '624667', '...'],
 'northing': ['234405',
              '247368',
              '241327',
              '233920',
              '248298',
              '233744',


2. Сгенерируйте 2 случайные матрицы размера 10_000 x 10_000 и вычислите их произведение. Сколько времени занимают три этих операции? Сохраните 3 полученных матрицы в файл .npz с соответствующими названиями

In [225]:
start = default_timer()

size = (3000, 3000)
matrix_0 = np.random.randint(0, 100, size=size, dtype=np.int8)
matrix_1 = np.random.randint(0, 100, size=size, dtype=np.int8)

print(f'time: {(default_timer() - start) * 1000:.4f} ms')

time: 99.5741 ms


In [226]:
start = default_timer()

matrix_dot = np.dot(matrix_0, matrix_1)

print(f'time: {(default_timer() - start) * 1000:.4f} ms')

time: 77165.3030 ms


In [227]:
print(matrix_0)
print(matrix_1)
print(matrix_dot)

[[75 43  5 ... 98 16 10]
 [20 53 65 ... 73 48 48]
 [96 69 40 ... 46 26 67]
 ...
 [41 17 10 ...  2 86 74]
 [27  8 96 ... 77 76 86]
 [41 82 32 ... 62 86 86]]
[[41 63 51 ... 85 79 27]
 [40 93  8 ... 53 73 46]
 [31 47 69 ... 53 13 99]
 ...
 [54 23 56 ... 10 57 68]
 [49 13  6 ... 65 95 87]
 [65 71 51 ... 53 39 60]]
[[  89 -120  -61 ...  -68 -100  -86]
 [ -80   62  -92 ...  -98   26  118]
 [  55  -49   12 ...    1  -32   37]
 ...
 [-124   -7  103 ...  103  -78  -70]
 [ -23  -63 -118 ... -104 -102  -37]
 [ -39  103   45 ...  -54 -125 -101]]


In [228]:
np.savez(
    OUTPUT_DIR.joinpath('matrices.npz'),
    matrix_0=matrix_0,
    matrix_1=matrix_1,
    matrix_dot=matrix_dot
)

3. Создайте 2 матрицы размера 1000x1000, используя различные параметризируемые распределения из numpy (https://docs.scipy.org/doc/numpy-1.15.0/reference/routines.random.html#distributions)

После этого сохраните получившиеся матрицы в hdf5-файл в виде двух различных датасетов. В качестве описания каждого датасета укажите параметры используемых распределений 

In [229]:
size = (1000, 1000)
matrix_0 = np.random.uniform(low=-20, high=50, size=size)
matrix_1 = np.random.normal(loc=10, scale=10, size=size)

In [230]:
with h5py.File(OUTPUT_DIR.joinpath('matrices.hdf5'), mode='w') as f:
    dset_0 = f.create_dataset('matrix_0', data=matrix_0)
    dset_0.attrs['description'] = ('Матрица, сгенерированная с помощью np.random.uniform(). '
                                   '(Draw samples from a uniform distribution)')
    dset_0.attrs['low'] = '-20'
    dset_0.attrs['high'] = '50'
    dset_0.attrs['size'] = f'{matrix_0.shape}'

    dset_1 = f.create_dataset('matrix_1', data=matrix_1)
    dset_1.attrs['description'] = ('Матрица, сгенерированная с помощью np.random.normal(). '
                                   '(Draw random samples from a normal (Gaussian) distribution)')
    dset_1.attrs['loc'] = '10'
    dset_1.attrs['scale'] = '10'
    dset_1.attrs['size'] = f'{matrix_1.shape}'

In [231]:
with h5py.File(OUTPUT_DIR.joinpath('matrices.hdf5')) as f:
    for k in f.keys():
        dset_1 = f[k]
        print(dset_1.attrs['description'])
        print(dset_1[:].shape)

Матрица, сгенерированная с помощью np.random.uniform(). (Draw samples from a uniform distribution)
(1000, 1000)
Матрица, сгенерированная с помощью np.random.normal(). (Draw random samples from a normal (Gaussian) distribution)
(1000, 1000)


## Лабораторная работа 5

### csv

1.1 В файле `tags_sample.csv` находится информация о тэгах, приписываемых рецептам. Воспользовавшись `csv.reader`, считайте этот файл и создайте словарь вида `id_рецепта: [список тэгов]`. Сохраните этот словарь в файл `tags_sample.json`.

In [232]:
with open(DATA_DIR.joinpath('tags_sample.csv')) as f:
    reader = csv.reader(f)

    headers = next(reader)
    n = len(headers)

    tags_sample = defaultdict(list)
    for row in reader:
        tags_sample[int(row[0])].append(row[1])

In [233]:
with open(OUTPUT_DIR.joinpath('tags_sample.json'), mode='w', encoding='utf-8') as f:
    json.dump(tags_sample, f, indent=4)

1.2 Считайте файл `recipes_sample_with_filled_nsteps.csv` (__ЛР4__) в виде `pd.DataFrame`. Добавьте к таблице 2 столбца: `n_tags`, содержащий количество тэгов у этого рецепта; и `tags`, содержащий набор тэгов в виде строки (тэги внутри строки разделяются символом `;`)

In [235]:
recipes_df = pd.read_csv(
    DATA_DIR.joinpath('recipes_sample_with_filled_nsteps.csv'),
    sep=',',
    index_col=0,
    parse_dates=['submitted']
)

In [236]:
recipes_df

Unnamed: 0,name,id,minutes,contributor_id,submitted,n_steps,description,n_ingredients
0,george s at the cove black bean soup,44123,90,35193,2002-10-25,11,an original recipe created by chef scott meska...,18.0
1,healthy for them yogurt popsicles,67664,10,91970,2003-07-26,3,my children and their friends ask for my homem...,
2,i can t believe it s spinach,38798,30,1533,2002-08-29,5,"these were so go, it surprised even me.",8.0
3,italian gut busters,35173,45,22724,2002-07-27,7,my sister-in-law made these for us at a family...,
4,love is in the air beef fondue sauces,84797,25,4470,2004-02-23,4,i think a fondue is a very romantic casual din...,
...,...,...,...,...,...,...,...,...
29995,zurie s holey rustic olive and cheddar bread,267661,80,200862,2007-11-25,16,this is based on a french recipe but i changed...,10.0
29996,zwetschgenkuchen bavarian plum cake,386977,240,177443,2009-08-24,22,"this is a traditional fresh plum cake, thought...",11.0
29997,zwiebelkuchen southwest german onion cake,103312,75,161745,2004-11-03,10,this is a traditional late summer early fall s...,
29998,zydeco soup,486161,60,227978,2012-08-29,7,this is a delicious soup that i originally fou...,


In [237]:
n_tags_tags = {'id': [], 'n_tags': [], 'tags': []}
for k, v in tags_sample.items():
    n_tags_tags['id'].append(k)
    n_tags_tags['n_tags'].append(len(v))
    n_tags_tags['tags'].append(';'.join(v))

n_tags_tags_df = pd.DataFrame(n_tags_tags)

In [238]:
n_tags_tags_df

Unnamed: 0,id,n_tags,tags
0,44123,25,weeknight;time-to-make;course;main-ingredient;...
1,67664,31,15-minutes-or-less;time-to-make;course;prepara...
2,38798,17,30-minutes-or-less;time-to-make;course;main-in...
3,35173,11,60-minutes-or-less;time-to-make;course;prepara...
4,84797,19,30-minutes-or-less;time-to-make;course;main-in...
...,...,...,...
29995,267661,18,time-to-make;course;main-ingredient;cuisine;pr...
29996,386977,19,time-to-make;course;main-ingredient;cuisine;pr...
29997,103312,20,time-to-make;course;main-ingredient;cuisine;pr...
29998,486161,20,ham;60-minutes-or-less;time-to-make;course;mai...


In [239]:
recipes_tags_df = pd.merge(
    recipes_df,
    n_tags_tags_df,
    how='left',
    on='id'
)

In [240]:
recipes_tags_df

Unnamed: 0,name,id,minutes,contributor_id,submitted,n_steps,description,n_ingredients,n_tags,tags
0,george s at the cove black bean soup,44123,90,35193,2002-10-25,11,an original recipe created by chef scott meska...,18.0,25,weeknight;time-to-make;course;main-ingredient;...
1,healthy for them yogurt popsicles,67664,10,91970,2003-07-26,3,my children and their friends ask for my homem...,,31,15-minutes-or-less;time-to-make;course;prepara...
2,i can t believe it s spinach,38798,30,1533,2002-08-29,5,"these were so go, it surprised even me.",8.0,17,30-minutes-or-less;time-to-make;course;main-in...
3,italian gut busters,35173,45,22724,2002-07-27,7,my sister-in-law made these for us at a family...,,11,60-minutes-or-less;time-to-make;course;prepara...
4,love is in the air beef fondue sauces,84797,25,4470,2004-02-23,4,i think a fondue is a very romantic casual din...,,19,30-minutes-or-less;time-to-make;course;main-in...
...,...,...,...,...,...,...,...,...,...,...
29995,zurie s holey rustic olive and cheddar bread,267661,80,200862,2007-11-25,16,this is based on a french recipe but i changed...,10.0,18,time-to-make;course;main-ingredient;cuisine;pr...
29996,zwetschgenkuchen bavarian plum cake,386977,240,177443,2009-08-24,22,"this is a traditional fresh plum cake, thought...",11.0,19,time-to-make;course;main-ingredient;cuisine;pr...
29997,zwiebelkuchen southwest german onion cake,103312,75,161745,2004-11-03,10,this is a traditional late summer early fall s...,,20,time-to-make;course;main-ingredient;cuisine;pr...
29998,zydeco soup,486161,60,227978,2012-08-29,7,this is a delicious soup that i originally fou...,,20,ham;60-minutes-or-less;time-to-make;course;mai...


1.3 В файле `ingredients_sample.csv` находится информация о ингредиентах, необходимых для рецепта. Воспользовавшись `csv.DictReader`, считайте этот файл и создайте словарь вида `id_рецепта: [список ингредиентов]`.

In [241]:
with open(DATA_DIR.joinpath('ingredients_sample.csv')) as f:
    reader = csv.DictReader(f)
    ingredients_sample = defaultdict(list)
    for row in reader:
        ingredients_sample[int(row['recipe_id'])].append(row['ingredient'])

In [242]:
pprint_dict_of_lists(ingredients_sample)

{44123: ['unsalted butter',
         'carrot',
         'onion',
         'celery',
         'broccoli stem',
         'dried thyme',
         'dried oregano',
         'dried sweet basil leaves',
         'dry white wine',
         'chicken stock',
         '...'],
 250900: ['unsalted butter',
          'all-purpose flour',
          'walnuts',
          'light brown sugar',
          'refrigerated pie crust',
          'granny smith apples',
          '...'],
 120462: ['unsalted butter',
          'onion',
          'milk',
          'salt',
          'egg',
          'cream cheese',
          'extra-sharp cheddar cheese',
          'fresh ground black pepper',
          'garlic clove',
          'penne pasta',
          '...']}


1.4 Добавьте к таблице из задания 1.2 столбец `ingredients`, содержащий набор ингредиентов в виде строки (ингредиенты внутри строки разделяются символом `*`)

Для строк, которые содержат пропуски в столбце `n_ingredients`, заполните их на основе файла  `ingredients_sample.csv`

In [243]:
n_ingredients_ingredients = {'id': [], 'n_ingredients': [], 'ingredients': []}
for k, v in ingredients_sample.items():
    n_ingredients_ingredients['id'].append(k)
    n_ingredients_ingredients['n_ingredients'].append(len(v))
    n_ingredients_ingredients['ingredients'].append('*'.join(v))

n_ingredients_ingredients_df = pd.DataFrame(n_ingredients_ingredients)

In [244]:
n_ingredients_ingredients_df

Unnamed: 0,id,n_ingredients,ingredients
0,44123,18,unsalted butter*carrot*onion*celery*broccoli s...
1,250900,6,unsalted butter*all-purpose flour*walnuts*ligh...
2,120462,14,unsalted butter*onion*milk*salt*egg*cream chee...
3,257111,8,unsalted butter*milk*eggs*honey*white bread*va...
4,148114,4,unsalted butter*nuts*granulated sugar*semi-swe...
...,...,...,...
29995,290445,2,mountain dew soda*irish whiskey
29996,53234,2,venison roast*clean jars
29997,451071,2,pink lemonade*watermelon vodka
29998,166778,2,peach gelatin*iced tea


In [245]:
recipes_tags_ingredients_df = pd.merge(
    recipes_tags_df,
    n_ingredients_ingredients_df,
    how='left',
    on='id',
    suffixes=('', '_right')
)

In [246]:
recipes_tags_ingredients_df

Unnamed: 0,name,id,minutes,contributor_id,submitted,n_steps,description,n_ingredients,n_tags,tags,n_ingredients_right,ingredients
0,george s at the cove black bean soup,44123,90,35193,2002-10-25,11,an original recipe created by chef scott meska...,18.0,25,weeknight;time-to-make;course;main-ingredient;...,18,unsalted butter*carrot*onion*celery*broccoli s...
1,healthy for them yogurt popsicles,67664,10,91970,2003-07-26,3,my children and their friends ask for my homem...,,31,15-minutes-or-less;time-to-make;course;prepara...,3,milk*frozen juice concentrate*plain yogurt
2,i can t believe it s spinach,38798,30,1533,2002-08-29,5,"these were so go, it surprised even me.",8.0,17,30-minutes-or-less;time-to-make;course;main-in...,8,onion*frozen chopped spinach*eggs*garlic powde...
3,italian gut busters,35173,45,22724,2002-07-27,7,my sister-in-law made these for us at a family...,,11,60-minutes-or-less;time-to-make;course;prepara...,9,sandwich bun*good seasonings italian salad dre...
4,love is in the air beef fondue sauces,84797,25,4470,2004-02-23,4,i think a fondue is a very romantic casual din...,,19,30-minutes-or-less;time-to-make;course;main-in...,12,beef steaks*vegetable oil*spicy mustard*fresh ...
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,zurie s holey rustic olive and cheddar bread,267661,80,200862,2007-11-25,16,this is based on a french recipe but i changed...,10.0,18,time-to-make;course;main-ingredient;cuisine;pr...,10,dry white wine*eggs*cheddar cheese*baking powd...
29996,zwetschgenkuchen bavarian plum cake,386977,240,177443,2009-08-24,22,"this is a traditional fresh plum cake, thought...",11.0,19,time-to-make;course;main-ingredient;cuisine;pr...,11,unsalted butter*milk*flour*salt*vanilla*all-pu...
29997,zwiebelkuchen southwest german onion cake,103312,75,161745,2004-11-03,10,this is a traditional late summer early fall s...,,20,time-to-make;course;main-ingredient;cuisine;pr...,13,onion*milk*eggs*butter*flour*salt*pepper*sugar...
29998,zydeco soup,486161,60,227978,2012-08-29,7,this is a delicious soup that i originally fou...,,20,ham;60-minutes-or-less;time-to-make;course;mai...,22,onion*celery*dried thyme*dried oregano*fresh p...


In [247]:
mask = recipes_tags_ingredients_df['n_ingredients'].isna()
loc_left = (mask, 'n_ingredients')
loc_right = (mask, 'n_ingredients_right')
recipes_tags_ingredients_df.loc[loc_left] = recipes_tags_ingredients_df.loc[loc_right]
recipes_tags_ingredients_df = recipes_tags_ingredients_df.drop(columns=['n_ingredients_right'])

# меняем порядок колонок
columns = recipes_tags_ingredients_df.columns
order = [columns[1], columns[0], *columns[2:7], *columns[8:10], columns[7], columns[10]]
recipes_tags_ingredients_df = recipes_tags_ingredients_df[order]

In [248]:
recipes_tags_ingredients_df

Unnamed: 0,id,name,minutes,contributor_id,submitted,n_steps,description,n_tags,tags,n_ingredients,ingredients
0,44123,george s at the cove black bean soup,90,35193,2002-10-25,11,an original recipe created by chef scott meska...,25,weeknight;time-to-make;course;main-ingredient;...,18.0,unsalted butter*carrot*onion*celery*broccoli s...
1,67664,healthy for them yogurt popsicles,10,91970,2003-07-26,3,my children and their friends ask for my homem...,31,15-minutes-or-less;time-to-make;course;prepara...,3.0,milk*frozen juice concentrate*plain yogurt
2,38798,i can t believe it s spinach,30,1533,2002-08-29,5,"these were so go, it surprised even me.",17,30-minutes-or-less;time-to-make;course;main-in...,8.0,onion*frozen chopped spinach*eggs*garlic powde...
3,35173,italian gut busters,45,22724,2002-07-27,7,my sister-in-law made these for us at a family...,11,60-minutes-or-less;time-to-make;course;prepara...,9.0,sandwich bun*good seasonings italian salad dre...
4,84797,love is in the air beef fondue sauces,25,4470,2004-02-23,4,i think a fondue is a very romantic casual din...,19,30-minutes-or-less;time-to-make;course;main-in...,12.0,beef steaks*vegetable oil*spicy mustard*fresh ...
...,...,...,...,...,...,...,...,...,...,...,...
29995,267661,zurie s holey rustic olive and cheddar bread,80,200862,2007-11-25,16,this is based on a french recipe but i changed...,18,time-to-make;course;main-ingredient;cuisine;pr...,10.0,dry white wine*eggs*cheddar cheese*baking powd...
29996,386977,zwetschgenkuchen bavarian plum cake,240,177443,2009-08-24,22,"this is a traditional fresh plum cake, thought...",19,time-to-make;course;main-ingredient;cuisine;pr...,11.0,unsalted butter*milk*flour*salt*vanilla*all-pu...
29997,103312,zwiebelkuchen southwest german onion cake,75,161745,2004-11-03,10,this is a traditional late summer early fall s...,20,time-to-make;course;main-ingredient;cuisine;pr...,13.0,onion*milk*eggs*butter*flour*salt*pepper*sugar...
29998,486161,zydeco soup,60,227978,2012-08-29,7,this is a delicious soup that i originally fou...,20,ham;60-minutes-or-less;time-to-make;course;mai...,22.0,onion*celery*dried thyme*dried oregano*fresh p...


1.5 Проверьте, содержит ли столбец `n_ingredients` пропуски. Если нет, преобразуйте его к целочисленному типу и сохраните результаты в файл `recipes_sample_with_tags_ingredients.csv`

In [249]:
if recipes_tags_ingredients_df['n_ingredients'].isna().sum() == 0:
    recipes_tags_ingredients_df['n_ingredients'] = recipes_tags_ingredients_df['n_ingredients'].astype(int)
    recipes_tags_ingredients_df.to_csv(
        OUTPUT_DIR.joinpath('recipes_sample_with_tags_ingredients.csv'),
        sep=',',
        index=False
    )

### npy

2.1 Разделите таблицу, полученную в результате 1.5, на две таблицы: одна содержит рецепты, загруженные до 2000 года; вторая - все остальные. В полученных таблицах оставьте только числовые столбцы и преобразуйте их к `numpy.array`

In [250]:
mask = recipes_tags_ingredients_df['submitted'] < datetime.datetime(2000, 1, 1)

numeric_df = recipes_tags_ingredients_df.select_dtypes(include=[int, float])

before_2000_arr = numeric_df[mask].values
after_2000_arr = numeric_df[~mask].values

In [251]:
print(before_2000_arr.shape)
before_2000_arr

(275, 6)


array([[  3441,     30,   1562,      8,     10,      8],
       [  4205,     25,   1617,      3,     14,      5],
       [  3258,      0,   1534,      8,     20,      6],
       ...,
       [  3752,      0,   1535,     13,      9,      4],
       [  4801,     20,   1598,      4,     18,      7],
       [  2982,      0, 124030,      6,     13,      7]], dtype=int64)

In [252]:
print(after_2000_arr.shape)
after_2000_arr

(29725, 6)


array([[ 44123,     90,  35193,     11,     25,     18],
       [ 67664,     10,  91970,      3,     31,      3],
       [ 38798,     30,   1533,      5,     17,      8],
       ...,
       [103312,     75, 161745,     10,     20,     13],
       [486161,     60, 227978,      7,     20,     22],
       [298512,     29, 506822,      9,     12,     10]], dtype=int64)

2.2. Сохраните 2 полученных массива в архив `npz`. Дайте массивам читаемые имена.

In [253]:
np.savez(
    OUTPUT_DIR.joinpath('numerics_arrays.npz'),
    before_2000_arr=before_2000_arr,
    after_2000_arr=after_2000_arr
)

2.3 Считайте созданный архив и продемонстрируйте, что данные считались корректно. 

In [254]:
with np.load(OUTPUT_DIR.joinpath('numerics_arrays.npz')) as f:
    for k in f.keys():
        arr = f[k]
        print(f'{k}: {arr.shape}')
        print(arr, '\n')

before_2000_arr: (275, 6)
[[  3441     30   1562      8     10      8]
 [  4205     25   1617      3     14      5]
 [  3258      0   1534      8     20      6]
 ...
 [  3752      0   1535     13      9      4]
 [  4801     20   1598      4     18      7]
 [  2982      0 124030      6     13      7]] 

after_2000_arr: (29725, 6)
[[ 44123     90  35193     11     25     18]
 [ 67664     10  91970      3     31      3]
 [ 38798     30   1533      5     17      8]
 ...
 [103312     75 161745     10     20     13]
 [486161     60 227978      7     20     22]
 [298512     29 506822      9     12     10]] 



### hdf

3.1 Выведите названия всех датасетов, находящихся в файле `nutrition_sample.h5`, а также размерность матриц, содержащихся в данных датасетах и их метаданные.

Формат вывода:
```
Dataset name=dataset_0, dataset size=(30000,), metadata={'info': 'calories (#)'}
Dataset name=dataset_1, dataset size=(30000,), metadata={'info': 'total fat (PDV)'}
...
```

In [255]:
dsets = {}
with h5py.File(DATA_DIR.joinpath('nutrition_sample.h5')) as f:
    for name in f.keys():
        dset = f[name]
        size = dset.shape
        metadata = dict(dset.attrs)
        print(f'Dataset {name=}, dataset {size=}, {metadata=}')
        dsets[name] = {'data': dset[:], 'metadata': metadata}

Dataset name='dataset_0', dataset size=(30000, 2), metadata={'col_0': 'recipe_id', 'col_1': 'calories (#)'}
Dataset name='dataset_1', dataset size=(30000, 2), metadata={'col_0': 'recipe_id', 'col_1': 'total fat (PDV)'}
Dataset name='dataset_2', dataset size=(30000, 2), metadata={'col_0': 'recipe_id', 'col_1': 'sugar (PDV)'}
Dataset name='dataset_3', dataset size=(30000, 2), metadata={'col_0': 'recipe_id', 'col_1': 'sodium (PDV)'}
Dataset name='dataset_4', dataset size=(30000, 2), metadata={'col_0': 'recipe_id', 'col_1': 'protein (PDV)'}
Dataset name='dataset_5', dataset size=(30000, 2), metadata={'col_0': 'recipe_id', 'col_1': 'saturated fat (PDV)'}
Dataset name='dataset_6', dataset size=(30000, 2), metadata={'col_0': 'recipe_id', 'col_1': 'carbohydrates (PDV)'}


3.2 Разбейте каждый из имеющихся датасетов на две части: 1 часть содержит только те строки, где PDV (Percent Daily Value) превышает 100%; 2 часть содержит те строки, где PDV не составляет не более 100%. Создайте 2 группы в файле и разместите в них соответствующие части датасета c сохранением метаданных исходных датасетов. Итого должно получиться 2 группы, содержащие несколько датасетов. Сохраните результаты в файл `nutrition_grouped.h5`

In [256]:
groups = [
    {
        'group': 'PDV_gt_100',
        'mask': lambda x: getitem(x, (slice(x.shape[0]), 1)) > 100
    },
    {
        'group': 'PDV_lte_100',
        'mask': lambda x: getitem(x, (slice(x.shape[0]), 1)) <= 100
    }
]

grouped_dsets = {}
for group in groups:
    group_name = group['group']
    grouped_dsets[group_name] = []
    for dset_name, dset in dsets.items():
        metadata = dset['metadata']
        if 'PDV' in metadata['col_1']:
            data = dset['data']
            data = data[group['mask'](data)]
            grouped_dsets[group_name].append({'name': dset_name, 'data': data, 'metadata': metadata})

grouped_dsets

{'PDV_gt_100': [{'name': 'dataset_1',
   'data': array([[4.41230e+04, 1.08000e+02],
          [8.47970e+04, 1.54000e+02],
          [5.06620e+04, 3.25000e+02],
          ...,
          [8.00670e+04, 1.29000e+02],
          [7.14500e+04, 2.37000e+02],
          [2.67661e+05, 2.42000e+02]]),
   'metadata': {'col_0': 'recipe_id', 'col_1': 'total fat (PDV)'}},
  {'name': 'dataset_2',
   'data': array([[8.47970e+04, 3.23000e+02],
          [4.53467e+05, 2.11000e+02],
          [1.18843e+05, 1.63000e+03],
          ...,
          [7.14500e+04, 1.11800e+03],
          [3.16950e+05, 2.84000e+02],
          [3.86977e+05, 1.22000e+02]]),
   'metadata': {'col_0': 'recipe_id', 'col_1': 'sugar (PDV)'}},
  {'name': 'dataset_3',
   'data': array([[1.18843e+05, 1.59000e+02],
          [3.06590e+05, 1.46000e+02],
          [1.34787e+05, 2.44400e+03],
          ...,
          [3.16950e+05, 1.94000e+02],
          [4.64576e+05, 1.20000e+02],
          [2.67661e+05, 2.09000e+02]]),
   'metadata': {'col_0'

In [257]:
with h5py.File(OUTPUT_DIR.joinpath('nutrition_grouped.h5'), mode='w') as f:
    for group, dsets in grouped_dsets.items():
        group_h5 = f.create_group(group)
        for dset in dsets:
            dset_h5 = group_h5.create_dataset(dset['name'], data=dset['data'])
            dset_h5.attrs.update(dset['metadata'])

3.3 Выведите названия всех групп и датасетов, находящихся в этих группах, из файла `nutrition_grouped.h5` а также размерность матриц, содержащихся в датасетах и их метаданные.

In [258]:
with h5py.File(OUTPUT_DIR.joinpath('nutrition_grouped.h5')) as f:
    for group_name, group in f.items():
        print(f'Group: {group_name}')
        for name in group:
            dset = group[name]
            size = dset.shape
            metadata = dict(dset.attrs)
            print(f'- dataset {name=}, dataset {size=}, {metadata=}')
        print('\n', end='')

Group: PDV_gt_100
- dataset name='dataset_1', dataset size=(1736, 2), metadata={'col_0': 'recipe_id', 'col_1': 'total fat (PDV)'}
- dataset name='dataset_2', dataset size=(5316, 2), metadata={'col_0': 'recipe_id', 'col_1': 'sugar (PDV)'}
- dataset name='dataset_3', dataset size=(1244, 2), metadata={'col_0': 'recipe_id', 'col_1': 'sodium (PDV)'}
- dataset name='dataset_4', dataset size=(1776, 2), metadata={'col_0': 'recipe_id', 'col_1': 'protein (PDV)'}
- dataset name='dataset_5', dataset size=(2858, 2), metadata={'col_0': 'recipe_id', 'col_1': 'saturated fat (PDV)'}
- dataset name='dataset_6', dataset size=(642, 2), metadata={'col_0': 'recipe_id', 'col_1': 'carbohydrates (PDV)'}

Group: PDV_lte_100
- dataset name='dataset_1', dataset size=(28264, 2), metadata={'col_0': 'recipe_id', 'col_1': 'total fat (PDV)'}
- dataset name='dataset_2', dataset size=(24684, 2), metadata={'col_0': 'recipe_id', 'col_1': 'sugar (PDV)'}
- dataset name='dataset_3', dataset size=(28756, 2), metadata={'col_0'

3.4 Модифицируйте код из 3.2 таким образом, чтобы сохранить датасеты, используя сжатие. Сравните размер полученного файла с размерами файла из 3.2. Прокомментируйте результат.

In [259]:
with h5py.File(OUTPUT_DIR.joinpath('nutrition_grouped.gzip.h5'), mode='w') as f:
    for group, dsets in grouped_dsets.items():
        group_h5 = f.create_group(group)
        for dset in dsets:
            dset_h5 = group_h5.create_dataset(
                dset['name'],
                data=dset['data'],
                compression='gzip',
                compression_opts=8
            )
            dset_h5.attrs.update(dset['metadata'])