# file structure

> How to store a predefined file structure of objects and check which of its files exist


In [None]:
# | default_exp core.files.structure

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# | export
# | hide

from __future__ import annotations

import json
import os
import re

from pathlib import Path
from typing import Literal

import pandas as pd
from dotenv import load_dotenv

from ds_contrib.core.utils import Iterifiable, listify

In [None]:
# | hide

from ds_contrib.tools.io.gscloud import GSBrowser
from ds_contrib.core.paths import PathLike

### File Structure node

> Building a file structure from a JSON file that describes the structure and from a list of paths


In [None]:
# | hide

# Repository root: everything up to and including the first "ds_contrib"
# component of the current working directory.
CWD = Path.cwd()
_repo_depth = CWD.parts.index("ds_contrib") + 1
REPO_DIR = Path(*CWD.parts[:_repo_depth])
CONFIGS_DIR = REPO_DIR / "configs"
ENV_DIR = CONFIGS_DIR / "env/local"

# Project settings are stored as JSON; the "dev" entry is the one used here.
with open(CONFIGS_DIR / "storage/gscloud/projects_vars.json") as projects_file:
    projects = json.load(projects_file)

project = projects["dev"]
env_path = ENV_DIR / f'{project["env"]}_roadly.env'

# Load the local .env file first so the variables below can be read from it.
_ = load_dotenv(env_path)
google_app_creds = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
roadly_cookie = os.environ.get("ROADLY_COOKIE")
print(f"Initial configuration has finished:\nProject: {project}")

Initial configuration has finished:
Project: {'project': 'roadly-project-dev', 'env': 'dev', 'coldline_name': 'standard'}


In [None]:
# | hide

# Storage path of the sample recording that is listed later in this notebook.
RECSLAM_DATA_PATH = "roadly-dev-standard-videos/Antalya/2022-02-21_13-18-38_4453/"

# Expected file structure of a recording, kept as JSON next to the other configs.
_structure_path = CONFIGS_DIR / "storage/recslam/recslam_structure.json"
RECSLAM_STRUCTURE = json.loads(_structure_path.read_text())

In [None]:
# | export
# | hide


class FSNode:
    """Node of a hierarchical, predefined file structure.

    A node has one of three types:

    - ``file``: a single file identified by ``path``;
    - ``group``: a logical container of named child nodes;
    - ``pattern``: a regular expression (stored in ``path``); every parsed path that
      matches it becomes a ``file`` child of the node (see `parse_files`).
    """

    def __init__(
        self,
        name: str | None = None,
        children: dict[str, FSNode] | Literal["pattern"] | None = None,
        path: str | None = None,
        description: str | None = None,
        exists: bool | None = None,
        type: Literal["file", "group", "pattern"] | None = None,
        **kwargs,
    ):
        """File structure node class.

        Used to represent a file structure in a hierarchical way. Each node can be a file, a group of files (logical entity) or a pattern for searching files.

        Parameters
        ----------
        name : str | None, optional
            name of the node, by default None
        children : dict[str, FSNode] | Literal["pattern"] | None, optional
            child nodes, by default None
        path : str | None, optional
            path of the node from the root of the file structure, by default None
        description : str | None, optional
            description of the node, by default None
        exists : bool | None, optional
            whether the node indicated in the file structure exists in a parsed files (look at `parse_files` method),
            None means that the node was not parsed yet, by default None
        type : Literal["file", "group", "pattern"] | None, optional
            type of the node, types are: `file`, `group`, `pattern`, by default None
        **kwargs
            extra settings kept on the node; a `pattern` node reads the optional
            `file_description` entry as the description of the files it matches
        """
        self.name = name
        self._children: dict[str, FSNode] | Literal["pattern"] | None = children
        self.path = path
        self.description = description
        self.exists: bool | None = exists
        self.type: Literal["file", "group", "pattern"] | None = type
        self._kwargs = kwargs

    @property
    def is_file(self):
        """Whether the node is of type `file`."""
        return self.type == "file"

    def parse_files(self, paths: Iterifiable[PathLike]) -> set[PathLike]:
        """Main method for parsing a list of files and marking the nodes that exist in the file structure.

        The method is recursive and parses all children nodes.
        WARNING: some nodes may not be parsed if they do not exist in the file structure,
        some nodes' contents may be changed e.g. for `pattern` nodes.

        - `file` nodes get `exists` set to True/False depending on whether their `path` is among `paths`;
        - `group` nodes pass the still unmatched paths to every child in turn;
        - `pattern` nodes get one `file` child (named after the path's base name) per matching path.

        Parameters
        ----------
        paths : Iterifiable[PathLike]
            iterable of paths to parse or a single path

        Returns
        -------
        set[PathLike]
            set of paths that were not parsed due to the fact that they do not exist in the file structure

        Raises
        ------
        ValueError
            If the node type is not one of `file`, `group`, `pattern`.
        """
        remained_paths = set(listify(paths, none_handlings="empty"))
        if self.type == "file":
            if self.path in remained_paths:
                self.exists = True
                remained_paths.remove(self.path)
            else:
                self.exists = False
        elif self.type == "group":
            # A group created through the constructor has no children yet: nothing to match.
            for child in (self._children or {}).values():
                remained_paths = child.parse_files(remained_paths)
        elif self.type == "pattern":
            if self._children is None:
                # Pattern node created through the constructor rather than `from_dict`.
                self._children = {}
            # Collect first, mutate afterwards: the set must not change while iterated.
            matched_paths = [path for path in remained_paths if re.match(self.path, path)]
            for path in matched_paths:
                name = Path(path).name
                self._children[name] = self.__class__(
                    name,
                    path=path,
                    type="file",
                    # Optional: absent when the node was not built by `_parse_pattern`.
                    description=self._kwargs.get("file_description"),
                    exists=True,
                )
            remained_paths.difference_update(matched_paths)
        else:
            raise ValueError(f"Unknown node type {self.type}")
        return remained_paths

    @classmethod
    def _parse_group(cls, d: dict, parent_node: FSNode):
        """Fill `parent_node` as a `group` from `d`; every entry of `d["nodes"]` becomes a child."""
        parent_node.description = d.get("desc")
        parent_node.path = d.get("path")
        parent_node.type = "group"
        parent_node._children = {
            k: cls._parse_dict(v, cls(name=k)) for k, v in d["nodes"].items()
        }
        return parent_node

    @classmethod
    def _parse_pattern(cls, d: dict, parent_node: FSNode):
        """Fill `parent_node` as a `pattern` from `d["path_pattern"]` (regex in `path`, optional `desc`)."""
        parent_node.type = "pattern"
        parent_node.description = d.get("desc")
        # Description given to every file node created for a matching path.
        parent_node._kwargs["file_description"] = d["path_pattern"].get("desc")
        parent_node._children = {}
        parent_node.path = d["path_pattern"]["path"]
        return parent_node

    @staticmethod
    def _parse_file_node(d: dict, parent_node: FSNode):
        """Fill `parent_node` as a `file` from `d` (optional `desc` and `path`)."""
        parent_node.description = d.get("desc")
        parent_node.path = d.get("path")
        parent_node.type = "file"
        parent_node._children = None
        return parent_node

    def _get_files(
        self, files: list[FSNode] | None = None, only_exists: bool = False
    ) -> list[FSNode]:
        """Append all `file` nodes below this node (depth first) to `files` and return the list.

        Parameters
        ----------
        files : list[FSNode] | None, optional
            accumulator that is extended in place; a new list is created when None
        only_exists : bool, optional
            whether to skip file nodes whose `exists` is not truthy, by default False

        Raises
        ------
        ValueError
            If the node type is not one of `file`, `group`, `pattern`.
        """
        if files is None:
            files = []
        if self.type == "file":
            if not only_exists or self.exists:
                files.append(self)
        elif self.type in ("group", "pattern"):
            for child in (self._children or {}).values():
                child._get_files(files, only_exists)
        else:
            raise ValueError(f"Unknown node type {self.type}")
        return files

    def get_files(self, only_exists: bool = True) -> FSNode:
        """Get all files in the file structure as a `FSNode` objects under root `FSNode` object.

        Parameters
        ----------
        only_exists : bool, optional
            whether to include only existing files, that are marked as existing in the file structure after parsing, by default True
            WARNING: this method works only after parsing files with `parse_files` method

        Returns
        -------
        FSNode
            root `FSNode` object with all files in the flat file structure as children
        """
        files = self._get_files([], only_exists)
        # Same class as `self`, so subclasses get a flat view of their own type.
        root_node = self.__class__(
            name="files",
            description=f"All files in file structure {'excluding' if only_exists else 'including'} non existing files",
            type="group",
        )
        # NOTE(review): children are keyed by node name, so files that share a name in
        # different groups overwrite each other here (the last one wins).
        root_node._children = {f.name: f for f in files}
        # TODO[HIGH]: Add caching of processed files
        return root_node

    @classmethod
    def _parse_dict(cls, d: dict, parent_node: FSNode):
        """Dispatch on the keys of `d`: `path_pattern` -> pattern, `nodes` -> group, otherwise file."""
        if "path_pattern" in d:
            return cls._parse_pattern(d, parent_node)
        elif "nodes" in d:
            return cls._parse_group(d, parent_node)
        else:
            return cls._parse_file_node(d, parent_node)

    @classmethod
    def from_dict(cls, d: dict) -> FSNode:
        """Generate a `FSNode` with hierarchical structure from a dictionary.

        Parameters
        ----------
        d : dict
            dictionary with hierarchical structure of the file structure, e.g. as in `recslam_structure.json`

        Returns
        -------
        FSNode
            root `FSNode` object with hierarchical structure of the file structure
        """
        # `cls` rather than `FSNode`, so the root has the same class as its children.
        root_node = cls(name="root_node", description="root_node", type="group")
        return cls._parse_dict(d, root_node)

    def _to_dict(self):
        """Return the node's own fields (without name and children) as a dictionary."""
        return {
            "path": self.path,
            "description": self.description,
            "type": self.type,
            "exists": self.exists,
        }

    def __repr__(self):
        s = f"{self.__class__.__name__}: [{self.name}]\n"
        if self._children:
            s += repr({k: v._to_dict() for k, v in self._children.items()})
        return s

    @property
    def df(self) -> pd.DataFrame:
        """Return a `pd.DataFrame` describing the node.

        Returns
        -------
        pd.DataFrame
            one row per direct child (indexed by child name) when the node has children,
            otherwise a single row with the node's own fields
        """
        if self._children:
            return pd.DataFrame.from_dict(
                {k: v._to_dict() for k, v in self._children.items()}, orient="index"
            )
        else:
            return pd.DataFrame.from_dict(self._to_dict(), orient="index").T

    def _repr_html_(self):
        # Non-file nodes without any children are flagged as "(empty)" in the header.
        is_empty = self.type != "file" and not self._children
        header = (
            f"<b>{self.__class__.__name__}: [{self.type}]{'(empty)' if is_empty else ''} "
            f"{self.name}</b>: {self.description}<br><br>"
        )
        return header + self.df._repr_html_()

    def __getitem__(self, key: Iterifiable[str]) -> FSNode:
        """Get a child node by name hierarchically by using a list of keys

        Parameters
        ----------
        key : Iterifiable[str]
            complex key, e.g. ["group1", "file1"], may be a single key, e.g. "group1" or even `None`, which returns the root node

        Returns
        -------
        FSNode
            node with the given key

        Raises
        ------
        KeyError
            If the node with the given key does not exist.
        KeyError
            If the node has no children.
        """
        keys = listify(key, none_handlings="empty")
        if len(keys) == 0:
            return self
        if self._children is None:
            raise KeyError(f"Node {self.name} has no children")
        child = self._children[keys[0]]
        # Remaining keys, if any, are resolved recursively by the child.
        return child if len(keys) == 1 else child[keys[1:]]

    def items(self):
        """Return the `(name, node)` pairs of the direct children; empty when there are none."""
        if self._children is None:
            return {}.items()
        return self._children.items()

Getting a list of paths from Google Cloud Storage


In [None]:
# Storage browser for the selected project; the credentials value was read from
# the GOOGLE_APPLICATION_CREDENTIALS environment variable.
browser = GSBrowser(project=project["project"], credentials=google_app_creds)
# Fail fast when the browser does not report the sample recording path as present.
assert browser.is_present(RECSLAM_DATA_PATH), "Data is not present"

In [None]:
# Only the base name of every listed file is kept.
_listing = browser.list(RECSLAM_DATA_PATH)
remote_files = [Path(entry.path).name for entry in _listing["files"]]

Initialize the file structure from a JSON file.


In [None]:
RECSLAM_STRUCTURE

{'desc': 'Recslam file structure',
 'nodes': {'common': {'desc': 'All the information about device, not specific to certain camera',
   'nodes': {'detections': {'desc': 'Detections of different distresses and objects in the frame processed by client device',
     'path': 'detections.json'},
    'device': {'desc': 'Device information, camera params, etc.',
     'path': 'device.txt'},
    'gps': {'desc': 'GPS information from the device', 'path': 'gps.csv'},
    'heading': {'desc': 'Heading (direction) information from the device',
     'path': 'heading.csv'},
    'motion': {'desc': 'Motion information from the device, e.g. acceleration, rotation, etc.',
     'path': 'motion.csv'},
    'snapshots': {'desc': 'Snapshots of the video for composition',
     'path': 'snapshots.zip'}}},
  'camera_wide': {'desc': 'Files related to wide camera (_2 suffix) - main camera with higher quality and lower fps',
   'nodes': {'video': {'desc': 'Video captured by the wide camera (main camera with higher q

In [None]:
fs = FSNode.from_dict(RECSLAM_STRUCTURE)

Parse the real list of paths to map them onto the file structure


In [None]:
# The displayed return value is the set of paths that matched no node of the structure.
fs.parse_files(remote_files)

set()

Let's examine our file structure

Starting from the root of the fs.


In [None]:
fs

Unnamed: 0,path,description,type,exists
common,,"All the information about device, not specific...",group,
camera_wide,,Files related to wide camera (_2 suffix) - mai...,group,
camera_ultrawide,,Files related to ultrawide camera (no suffix) ...,group,


Choose a group by indexing the file structure with the group's name


In [None]:
fs["camera_wide"]

Unnamed: 0,path,description,type,exists
video,video_2,Video captured by the wide camera (main camera...,file,True
timestamps,times_full_2.json,Timestamps of frames captured by the wide camera,file,True
timestamps_old,times_2.txt,Timestamps of frames captured by the wide came...,file,True
raw_data,^video_data_\d+\.data_2$,Raw data captured by the wide camera,pattern,


We can use hierarchical keys to get nodes from a hierarchical file structure


In [None]:
fs["camera_wide", "raw_data"]

Unnamed: 0,path,description,type,exists
video_data_6.data_2,video_data_6.data_2,Batch of raw video data,file,True
video_data_8.data_2,video_data_8.data_2,Batch of raw video data,file,True
video_data_3.data_2,video_data_3.data_2,Batch of raw video data,file,True
video_data_21.data_2,video_data_21.data_2,Batch of raw video data,file,True
video_data_4.data_2,video_data_4.data_2,Batch of raw video data,file,True
video_data_20.data_2,video_data_20.data_2,Batch of raw video data,file,True
video_data_2.data_2,video_data_2.data_2,Batch of raw video data,file,True
video_data_12.data_2,video_data_12.data_2,Batch of raw video data,file,True
video_data_0.data_2,video_data_0.data_2,Batch of raw video data,file,True
video_data_14.data_2,video_data_14.data_2,Batch of raw video data,file,True


We can also get a single file node. Look at the top of the representation, where the FSNode is rendered as `FSNode: [$type] $name: $description`, followed by its internal content


In [None]:
fs["camera_wide", "raw_data", "video_data_7.data_2"]

Unnamed: 0,path,description,type,exists
0,video_data_7.data_2,Batch of raw video data,file,True


We can get a flat structure of all files (only file nodes, without other node types) inside the parsed file structure


In [None]:
files = fs.get_files(only_exists=True)
files

Unnamed: 0,path,description,type,exists
device,device.txt,"Device information, camera params, etc.",file,True
gps,gps.csv,GPS information from the device,file,True
heading,heading.csv,Heading (direction) information from the device,file,True
motion,motion.csv,"Motion information from the device, e.g. accel...",file,True
snapshots,snapshots.zip,Snapshots of the video for composition,file,True
...,...,...,...,...
video_data_20.data,video_data_20.data,Batch of raw video data,file,True
video_data_3.data,video_data_3.data,Batch of raw video data,file,True
video_data_16.data,video_data_16.data,Batch of raw video data,file,True
video_data_6.data,video_data_6.data,Batch of raw video data,file,True


We can access the properties of the FSNode directly


In [None]:
files["device"].path, files["device"].description, files["device"].type

('device.txt', 'Device information, camera params, etc.', 'file')

---


In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()