# Script start

In [32]:
from get_file_info import *

In [1]:
# %load get_file_info.py
import sys
import os
import inspect
from hashlib import md5
from pathlib import Path
from functools import partial
import datetime as dt

import numpy as np
import pandas as pd
import attr
from toolz import curry

from dask import compute, delayed
import dask.threaded
import dask.multiprocessing

bllb_path = str(Path(r"../../../code/python/bllb").resolve())
sys.path.insert(0, bllb_path)
from bllb_logging import *

from bllb import pp
from pprint import pprint  #as pp

def start_log(enable=True, lvl='WARNING'):
    log = setup_logging(enable, lvl, std_lib=True)
    log.info('examinator logging started')
    return log

In [2]:
#### Cli.py implementation snips below
LOG_ON = True
LOG_LEVEL = "DEBUG"  #"WARNING"
log_on = LOG_ON
log_level = LOG_LEVEL
log = start_log(log_on, log_level)
dbg = log.debug

INFO:root:Imported standard library logging module.
	 Level: DEBUG
DEBUG:root:log settings
	Enabled:	True
	Level:	DEBUG
	True
INFO:root:examinator logging started


In [23]:
@attr.s
class daskerator(object):
    _DSCH = {
            'd': 'distributed',
            't': 'threads',
            'p': 'processes',
            's': 'synchronous'
    }
    def _get_sched(mp_type) -> str:
        if mp_type in daskerator._DSCH.keys():
            return daskerator._DSCH[mp_type]
        else:
            return mp_type
    mp_type = attr.ib(default='s', type=str, converter=_get_sched, 
                      validator=attr.validators.in_(list(_DSCH.keys()) + list(_DSCH.values())))
    sch_add = attr.ib(default='', type=str)
    @sch_add.validator
    def check_dask_opts(instance, attribute, value):
        if instance.mp_type != 'distributed' and value != '':
            raise ValueError('Only distributed dask can accept scheduler address.')
    _client = attr.ib(default=None)
    def __attrs_post_init__(self):
        if self.mp_type[0] == 'd':
            from dask.distributed import Client
            dbg("Creating distributed client object.")
            if self.sch_add == '':
                dbg("Creating new cluster on localhost.")
                self.client = Client()
            else:
                dbg(f"Existing scheduler address: {self.sch_add}")
                self.client = Client(self.sch_add)
            log.info(self.client)
    @curry
    def run_dask(self, func, iterator):
        dbg(f'Scheduler: {self.mp_type}')
        return compute(*map(delayed(func), iterator),
                       scheduler=self.mp_type)

In [4]:
def md5_blocks(path, blocksize=1024 * 2048) -> str:
    path = Path(path)
    if not path.is_dir():
        try:
            hasher = md5()
            with path.open('rb') as file:
                block = file.read(blocksize)
                while len(block) > 0:
                    hasher.update(block)
                    block = file.read(blocksize)
            return hasher.hexdigest()
        except Exception as error:
            log.warning(f'Error trying to hash item: {str(path)}\nError:\n{error}')
            return
    else:
        dbg(f'Item is a directory and will not be hashed.  {str(path)}')
        return

In [6]:
def get_stat(path, opt_md5=True, opt_pid=False) -> dict:
    path = Path(path)
    info = path.lstat()
    d = dict([
        t for t in inspect.getmembers(info)
        if not t[0].startswith('_') and not inspect.isbuiltin(t[1])
    ])
    d['path'] = path
    d['f_atime'] = dt.datetime.fromtimestamp(d['st_atime'])
    d['f_ctime'] = dt.datetime.fromtimestamp(d['st_ctime'])
    d['f_mtime'] = dt.datetime.fromtimestamp(d['st_mtime'])
    if opt_md5: 
        if not path.is_dir():
            try:
                d['md5'] = md5_blocks(path)
            except:
                log.warning(f'Could not hash item: {str(path)}')
                pass
        else:
            dbg(f'Item is a directory and will not be hashed.  {str(path)}')
    if opt_pid:
        dbg(f"working using OS pid: {os.getpid()}, opt_pid: {opt_pid}")
    return d

def path_stat(path, opt_md5=True) -> pd.DataFrame:
    get_stat2 = lambda path: get_stat(path, opt_md5)
    return pd.DataFrame([*map(get_stat2, Path(path).rglob('*'))])

def path_stat_dask(dsk, path, opt_md5=True, opt_pid=False):
    get_stat2 = lambda path: get_stat(path, opt_md5, opt_pid)
    return pd.DataFrame(dsk.run_dask(get_stat2, Path(path).rglob('*')))

In [30]:
def proc_paths(basepaths, mp_type='s', opt_md5=True):
    dsk = daskerator(mp_type)
    path_stat_dask2 = lambda path: path_stat_dask(dsk, path, opt_md5=True)
    df = pd.concat([*map(path_stat_dask2, basepaths)])
    if len(df):
        return df
    else:
        log.info('No results.')
        return

#### Script end
Functions and scripts above

Implementation and testing below

In [31]:
# Test stat_getter init
df = proc_paths(['.'], mp_type='t', opt_md5=False)
df

DEBUG:root:Scheduler: threads
DEBUG:root:Item is a directory and will not be hashed.  dask-worker-space/worker-2jeni11p
DEBUG:root:Item is a directory and will not be hashed.  scratch/.ipynb_checkpoints
DEBUG:root:Item is a directory and will not be hashed.  dask-worker-space/worker-0x_wirpw
DEBUG:root:Item is a directory and will not be hashed.  __pycache__
DEBUG:root:Item is a directory and will not be hashed.  .ipynb_checkpoints
DEBUG:root:Item is a directory and will not be hashed.  dask-worker-space/worker-xtvsw556
DEBUG:root:Item is a directory and will not be hashed.  dask-worker-space/worker-fbmqevqz
DEBUG:root:Item is a directory and will not be hashed.  dask-worker-space/worker-p3ao2mmf/storage
DEBUG:root:Item is a directory and will not be hashed.  dask-worker-space/worker-cfv9yz6u
DEBUG:root:Item is a directory and will not be hashed.  dask-worker-space/worker-xtvsw556/storage
DEBUG:root:Item is a directory and will not be hashed.  scratch
DEBUG:root:Item is a directory and

Unnamed: 0,f_atime,f_ctime,f_mtime,md5,n_fields,n_sequence_fields,n_unnamed_fields,path,st_atime,st_atime_ns,...,st_dev,st_gid,st_ino,st_mode,st_mtime,st_mtime_ns,st_nlink,st_rdev,st_size,st_uid
0,2019-03-07 15:18:26.182167,2019-03-07 15:18:26.182167,2019-03-07 15:18:26.182167,,19,10,3,.ipynb_checkpoints,1.551972e+09,1551971906182167400,...,107,0,35489000,16895,1.551972e+09,1551971906182167400,2,0,0,0
1,2019-03-06 18:21:30.964837,2019-03-06 18:21:30.964837,2019-03-06 18:21:30.964837,766573da4e0cde2812ffaddf08fa94d4,19,10,3,cli.py,1.551896e+09,1551896490964836500,...,107,0,35489001,33261,1.551896e+09,1551896490964836500,1,0,1847,0
2,2019-03-07 15:25:40.924382,2019-03-07 15:25:40.924382,2019-03-07 15:25:40.924382,d0179cf7e3b7f629c06ac5ff425a3c6d,19,10,3,cli2.py,1.551972e+09,1551972340924382300,...,107,0,35489002,33261,1.551972e+09,1551972340924382300,1,0,1918,0
3,2019-03-07 22:47:23.564116,2019-03-07 22:47:23.490058,2019-03-07 22:47:23.490058,,19,10,3,dask-worker-space,1.551999e+09,1551998843564115800,...,107,0,35488993,16895,1.551999e+09,1551998843490057900,2,0,0,0
4,2019-03-05 16:20:12.639983,2019-03-05 16:20:20.744075,2019-03-05 16:20:12.639983,52c251102e18e17d2e28049a69ad45b9,19,10,3,examinator.py,1.551803e+09,1551802812639982700,...,107,0,35489004,33261,1.551803e+09,1551802812639982700,1,0,4469,0
5,2019-03-06 18:20:45.499000,2019-03-06 18:21:20.102298,2019-03-06 18:21:20.102000,52c251102e18e17d2e28049a69ad45b9,19,10,3,examinator2.py,1.551896e+09,1551896445499000000,...,107,0,35489005,33261,1.551896e+09,1551896480102000000,1,0,4469,0
6,2019-03-07 22:52:36.758177,2019-03-07 22:52:36.758177,2019-03-07 22:52:36.758177,8b4d39a9d9ed5c5c07fb21de28ab9286,19,10,3,get_file_info.py,1.551999e+09,1551999156758176800,...,107,0,35489006,33261,1.551999e+09,1551999156758176800,1,0,4322,0
7,2019-03-06 18:49:40.400976,2019-03-06 18:49:40.400976,2019-03-06 18:49:40.400976,,19,10,3,scratch,1.551898e+09,1551898180400975800,...,107,0,35489007,16895,1.551898e+09,1551898180400975800,2,0,0,0
8,2019-02-25 21:38:53.820684,2019-03-04 21:02:04.337968,2019-02-25 21:38:53.820684,3f2554bb47311b1f34231a7c0f7fe04e,19,10,3,script.py,1.551131e+09,1551130733820684200,...,107,0,35489008,33261,1.551131e+09,1551130733820684200,1,0,681,0
9,2019-03-02 01:42:26.730578,2019-03-04 21:00:47.510710,2019-03-02 01:42:26.730578,86c0079dafa9516d998295f5f01a20c7,19,10,3,test_joblib.py,1.551491e+09,1551490946730578400,...,107,0,35489009,33261,1.551491e+09,1551490946730578400,1,0,1352,0
