# Import

In [15]:
import copy
import csv
import itertools
import json
import logging
import os
import shutil
import sys
from glob import glob
from typing import Dict, List, Tuple, Type

import boto3
import pandas as pd
from botocore.exceptions import ClientError
from sympy.combinatorics import Permutation

## Add the configuration and utility module directories to the import path

In [16]:
# Extend the module search path so that modules living in the project's
# config and util directories can be imported by the following cell.
sys.path.append("/home/jovyan/core/config/")
sys.path.append("/home/jovyan/core/util/")

In [17]:
from ALL import config 
from util import *
# from MultilayerDict import *

In [18]:
# Storage-related constants for this notebook.
# NOTE(review): object_name looks like an S3 key prefix inside the bucket -- confirm
# against the code that consumes it (not visible in this part of the file).
s3_bucket_name =  "text-classification-nakayama-bucket"
object_name = "RawData/reuters/"
root_path = "/home/jovyan"

In [19]:
# Small hand-written two-level dict; `d` is rebound to a MultilayerDict in a later cell.
d = {"temporary": {"A": "doc", "B": "tes"}}

In [20]:
# Sample locations: one absolute path to a single test CSV file and one relative
# path to the directory tree that contains the generated test files.
file_path = "/home/jovyan/temporary/test/0/0/test.csv"
dir_path = "../../temporary/test"

In [21]:
# CSV reading parameters. The values are the same as the defaults of the CSV
# reader helper defined inside MultilayerDict.read_dirs.
encoding="utf8"
newline=""
delimiter=","
quotechar="|"

In [22]:
def _make_test(
    file_data: pd.DataFrame,
    file_num: int = 2,
    file_depth: int = 2,
    extension: str = "csv",
):
    """
    Write ``file_data`` to a tree of numbered test directories.

    Directory labels are the strings "0" .. str(file_num - 1); every
    combination of ``file_depth`` labels becomes one nested directory path, so
    ``file_num ** file_depth`` files are written in total. Each file is named
    ``test.<extension>`` and is always written with ``DataFrame.to_csv``
    (without the index), whatever ``extension`` is.

    NOTE(review): ``root_path_temporary`` and ``make_filepath`` are not defined
    in this part of the file; they presumably come from ``from util import *``.
    ``make_filepath`` is assumed to return a writable path -- confirm.
    """
    _labels = list(map(str, range(file_num)))

    for _parts in itertools.product(_labels, repeat=file_depth):
        _sub_dir = "/".join(_parts)
        _target = f"{root_path_temporary}/test/{_sub_dir}/test.{extension}"
        file_data.to_csv(make_filepath(_target), index=False)

In [23]:
# 2-row x 4-column frame of sample integers used as the content of the test files.
test_df = pd.DataFrame([[1, 2, 3, 4 ], [2, 3, 4, 5]])

In [24]:
# Write the sample frame once per directory combination
# (default file_num=2 with file_depth=2 gives 2 ** 2 = 4 files).
_make_test(test_df, file_depth=2)

In [25]:
# Key lists used below as two of the layers of a MultilayerDict.
field_names= ["a", "b", "c"]
index=["dim1", "dim2", "dim3", "dim4"]

In [81]:
# Layer definition: layer name -> keys of that layer, outermost layer first.
names_keys = {"col": field_names, "ind": index, "ind2": [1, 2, 3]}

In [110]:
class MultilayerDict:
    """
    MultilayerDict
    """

    names: List
    names_keys: Dict
    dictionary: Dict

    def __init__(
        self,
        names_keys: Dict | None = None,
        dictionary: Dict | None = None,
        dir_path: str | None = None,
        names: List | None = None,
        extension: str = "csv",
        basis_depth: str = "max",
    ):
        if names_keys is not None:
            self.names = list(names_keys.keys())
            self.names_keys = names_keys
            if dictionary is not None:
                self.dictionary = dictionary
            else:
                self.dictionary = self.make_multilayer_dict(list(names_keys.values()))
        elif dir_path is not None:
            self.dictionary, _keys, self.names = self.read_dirs(
                dir_path,
                names=names,
            )
        else:
            raise ValueError("Both names_keys and dir_path are None.")

    def __check_all_none(self, *args) -> bool:
        if all(_v is None for _v in args):
            return True
        else:
            return False

    def __check_one_not_none(self, *args) -> bool:
        if sum(1 for arg in args if arg is not None) == 1:
            return True
        else:
            return False

    def make_multilayer_dict(self, keys: List):
        def _multilayer_dict_recursive(_d: Dict, _keys: List):
            if not _keys:
                return _d, []
            else:
                return _multilayer_dict_recursive(
                    {_key: copy.deepcopy(_d) for _key in _keys[-1]}, _keys[:-1]
                )

        _multilayer_dict, _ = _multilayer_dict_recursive(dict(), keys)
        return _multilayer_dict

    def name_is_in(self, name) -> bool:
        return name in self.name

    def keys_exist(self, keys: List | Tuple) -> bool:
        def __is_accessible(_d: Dict, _t: List | Tuple) -> bool:
            if len(_t) == 1:
                return t[0] in d
            if _t[0] in _d:
                return __is_accessible(_d[_t[0]], _t[1:])
            else:
                return False

        return __is_accessible(self.dictionary, keys)

    def loc(self, key_list: List):
        def _loc_recursive(_val, _key_list: list):
            if not _key_list:
                return _val
            else:
                return _loc_recursive(_val[_key_list[0]], _key_list[1:])

        return _loc_recursive(self.dictionary, key_list)

    def update(self, key_list: List, val) -> None:
        _d = self.dictionary
        for key in key_list[:-1]:
            _d = _d[key]
        _d[key_list[-1]] = val

    def read_dirs(self, dir_path, names=None, extension="csv", basis_depth="max"):
        def _one_dimensional_csv_to_dict(
            file_path, encoding="utf8", newline="", delimiter=",", quotechar="|"
        ):
            print(file_path)
            with open(file_path, mode="r", encoding=encoding, newline=newline) as _f:
                _reader = csv.reader(_f, delimiter=delimiter, quotechar=quotechar)
                _dict = dict(_reader)
            return _dict

        def _one_dimensional_json_to_dict(
            file_path,
            encoding="utf8",
            newline="",
        ):
            with open(file_path, mode="r", encoding=encoding, newline=newline) as _f:
                _dict = json.load(_f)
            return _dict

        def _get_uniform_dirs(__path_dirs, _basis_depth):
            # 最も深いパスを基準にMultilayerDictを生成する
            if _basis_depth == "max":
                _depth = max(map(len, __path_dirs.values()))
                _uniform_dirs = {
                    _path: _dir
                    for _path, _dir in __path_dirs.items()
                    if len(_dir) == _depth
                }
                return _depth, _uniform_dirs
            elif _basis_depth == "min":
                _depth = min(map(len, __path_dirs.values()))
                _uniform_dirs = {
                    _path: _dir
                    for _path, _dir in __path_dirs.items()
                    if len(_dir) == _depth
                }
                return _depth, _uniform_dirs
            elif isinstance(_basis_depth, int) & (_basis_depth > 0):
                _depth = _basis_depth
                _uniform_dirs = {
                    _path: _dir
                    for _path, _dir in __path_dirs.items()
                    if len(_dir) == _depth
                }
                return _depth, _uniform_dirs
            else:
                raise NotImplementedError

        # 読み込みディレクトリの中のファイルパスを探索し,ファイルパスからdictのkeyを生成
        _path_dirs = {}
        _abs_dir_path = os.path.abspath(dir_path)
        for _root, _, _file_names in os.walk(_abs_dir_path, followlinks=True):
            if _file_names:
                for _file_name in _file_names:
                    _file_path = f"{_root}/{_file_name}"
                    _file_name = os.path.splitext(os.path.basename(_file_name))[0]
                    _path_dirs[_file_path] = (
                        _file_path.replace(_abs_dir_path, "")
                    ).split("/")[1:]

        _depth, _uniform_path_dirs = _get_uniform_dirs(_path_dirs, basis_depth)

        # ファイルの読み込み
        _file_keys = set()
        if extension == "csv":
            for _path, _dir_list in _uniform_path_dirs.items():
                print(_path)
                print(_dir_list)
                _file_dict = _one_dimensional_csv_to_dict(_path)
                md.update(_dir_list, _file_dict)
                _file_keys = _file_keys & set(_file_dict)
        elif extension == "json":
            for _path, _dir_list in _uniform_path_dirs.items():
                _file_dict = _one_dimensional_json_to_dict(_path)
                md.update(_dir_list, _file_dict)
                _file_keys = _file_keys & set(_file_dict)
        else:
            raise NotImplementedError

        _keys = [list(set(_)) for _ in zip(*_uniform_path_dirs.values())]
        _keys.append(_file_keys)

        if names:
            _names = range(_depth + 1)
        else:
            if len(names) == _depth + 1:
                _names = names
            else:
                raise ValueError(
                    f"length mismatch. basis_depth is {basis_depth}, but length of name is {len(names)}."
                )
        _md = self.make_multilayer_dict(_names)
        return _md, _keys, _names

    def drop_names(
        self, names: List | None = None, loc: int | None = None, inplace: bool = False
    ):
        if self.__check_all_none(names, loc):
            raise ValueError("At least one argument must not be None.")

        if not self.__check_one_not_none(names, loc):
            raise ValueError("Multiple variables are specified.")

        if names is not None:
            if self.names[-len(names) :] != names:
                raise ValueError(
                    f"The given names ({names}) do not match the variable ({self.names})."
                )
            if inplace:
                self.names = [_ for _ in self.names[: -len(names)]]
                self.names_keys = {
                    _name: _val
                    for _name, _val in self.names_keys.items()
                    if _name in self.names
                }
            else:
                _names = [_ for _ in self.names[: -len(names)]]
                _names_keys = {
                    _name: _val
                    for _name, _val in self.names_keys.items()
                    if _name in _names
                }
                _dictionary = copy.deepcopy(self.dictionary)
                return MultilayerDict(_names_keys, _dictionary)

        elif loc is not None:
            if len(self.names_keys) < loc:
                raise ValueError(
                    f"Value of loc ({loc}) exceed length of names ({len(self.names_keys)})"
                )
            if inplace:
                self.names = [_ for _ in self.names[:-loc]]
                self.names_keys = {
                    _name: _val
                    for _name, _val in self.names_keys.items()
                    if _name in self.names
                }
            else:
                _names = [_ for _ in self.names[:-loc]]
                _names_keys = {
                    _name: _val
                    for _name, _val in self.names_keys.items()
                    if _name in _names
                }
                _dictionary = copy.deepcopy(self.dictionary)
                return MultilayerDict(_names_keys, _dictionary)

        else:
            raise NotImplementedError

        def add_names(self, names_keys: Dict, inplace:bool=False):
            _new_names_keys = dict(self.dictionary, names_keys)
            for _keys in itertools.product(_new_names_keys.values()):
            if self.keys_exist()

    def extend(self, keys: List[List[int]], mds: List, inplace: bool = False):
        if len(keys) != np.prod([len(_keys) for _keys in self.names_keys.values()]):
            raise ValueError(
                "The number of MultilayerDict and size of object do not match."
            )
        if len(keys) != len(mds):
            raise ValueError("number of keys and mds do not match.")

        if inplace:
            # dictionary
            for _keys, _md in zip(keys, mds):
                self.update(_keys, _md.dictionary)
            # namesを延長
            self.names.extend(mds[0].names)
            # keysを延長
            self.names_keys.update(mds[0].names_keys)
        else:
            md_return = copy.deepcopy(self)
            # dictionary
            for _keys, _md in zip(keys, mds):
                md_return.update(_keys, _md.dictionary)
            # namesを延長
            md_return.names.extend(mds[0].names)
            # keysを延長
            md_return.names_keys.update(mds[0].names_keys)
            return md_return

In [111]:
# Rebind `d` to an empty MultilayerDict skeleton generated from names_keys.
d = MultilayerDict(names_keys=names_keys)

In [112]:
# Display the layer name -> keys mapping held by the object.
d.names_keys

{'col': ['a', 'b', 'c'],
 'ind': ['dim1', 'dim2', 'dim3', 'dim4'],
 'ind2': [1, 2, 3]}

In [113]:
# Display the generated skeleton: every innermost key maps to an empty dict.
d.dictionary

{'a': {'dim1': {1: {}, 2: {}, 3: {}},
  'dim2': {1: {}, 2: {}, 3: {}},
  'dim3': {1: {}, 2: {}, 3: {}},
  'dim4': {1: {}, 2: {}, 3: {}}},
 'b': {'dim1': {1: {}, 2: {}, 3: {}},
  'dim2': {1: {}, 2: {}, 3: {}},
  'dim3': {1: {}, 2: {}, 3: {}},
  'dim4': {1: {}, 2: {}, 3: {}}},
 'c': {'dim1': {1: {}, 2: {}, 3: {}},
  'dim2': {1: {}, 2: {}, 3: {}},
  'dim3': {1: {}, 2: {}, 3: {}},
  'dim4': {1: {}, 2: {}, 3: {}}}}

In [117]:
# Visit every full key path (the Cartesian product of the per-layer keys),
# print it, and store the value 1 at that path.
for i in itertools.product(*d.names_keys.values()):
    print(i)
    d.update(i, 1)

('a', 'dim1', 1)
('a', 'dim1', 2)
('a', 'dim1', 3)
('a', 'dim2', 1)
('a', 'dim2', 2)
('a', 'dim2', 3)
('a', 'dim3', 1)
('a', 'dim3', 2)
('a', 'dim3', 3)
('a', 'dim4', 1)
('a', 'dim4', 2)
('a', 'dim4', 3)
('b', 'dim1', 1)
('b', 'dim1', 2)
('b', 'dim1', 3)
('b', 'dim2', 1)
('b', 'dim2', 2)
('b', 'dim2', 3)
('b', 'dim3', 1)
('b', 'dim3', 2)
('b', 'dim3', 3)
('b', 'dim4', 1)
('b', 'dim4', 2)
('b', 'dim4', 3)
('c', 'dim1', 1)
('c', 'dim1', 2)
('c', 'dim1', 3)
('c', 'dim2', 1)
('c', 'dim2', 2)
('c', 'dim2', 3)
('c', 'dim3', 1)
('c', 'dim3', 2)
('c', 'dim3', 3)
('c', 'dim4', 1)
('c', 'dim4', 2)
('c', 'dim4', 3)


In [118]:
# Display the dictionary again: every innermost key now holds 1.
d.dictionary

{'a': {'dim1': {1: 1, 2: 1, 3: 1},
  'dim2': {1: 1, 2: 1, 3: 1},
  'dim3': {1: 1, 2: 1, 3: 1},
  'dim4': {1: 1, 2: 1, 3: 1}},
 'b': {'dim1': {1: 1, 2: 1, 3: 1},
  'dim2': {1: 1, 2: 1, 3: 1},
  'dim3': {1: 1, 2: 1, 3: 1},
  'dim4': {1: 1, 2: 1, 3: 1}},
 'c': {'dim1': {1: 1, 2: 1, 3: 1},
  'dim2': {1: 1, 2: 1, 3: 1},
  'dim3': {1: 1, 2: 1, 3: 1},
  'dim4': {1: 1, 2: 1, 3: 1}}}

In [None]:
# Scratch cell: an example of a full key path (one key per layer).
("a", "dim1", 1)

In [132]:
# Build a DataFrame from the sub-dictionary of each outermost ("col") key.
# Iterate over the keys themselves: the former inner loop walked over the
# characters of every key string, which only worked for one-character keys.
# `df` is overwritten on every pass, so it ends up holding the last key's frame.
for key in d.names_keys["col"]:
    df = pd.DataFrame(d.dictionary[key])

In [133]:
# Display the DataFrame left over from the loop above.
df

Unnamed: 0,dim1,dim2,dim3,dim4
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
