In [None]:
import sys
import os

In [None]:
import inspect
from hashlib import md5
from pathlib import Path
from operator import methodcaller
from itertools import chain
import datetime as dt
from functools import partial
from pprint import pprint  #as pp

In [None]:
from IPython.core.display import HTML
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one (several cells below rely on this to show multiple values).
InteractiveShell.ast_node_interactivity = "all"

In [None]:
from matplotlib import pyplot as plt

In [None]:
import click
import attr
from tqdm import tqdm
import numpy as np
from toolz import curry
import pandas as pd
import dask.array as da
import dask.bag as db
import dask.dataframe as dd
from dask.distributed import Client

In [None]:
import binascii
import hashlib
def hash_utf8(string):
    """Return the MD5 digest of ``string`` as a lowercase hex string.

    The text is encoded as UTF-8 before hashing.
    """
    encoded = string.encode("utf-8")
    return hashlib.md5(encoded).hexdigest()

In [None]:
import logging as log
# Silence stdlib logging for every record at or below level 50 (logging.CRITICAL).
# Note: the name `log` is rebound to a project logger in a later cell.
log.disable(50)

In [None]:
from dask.distributed import LocalCluster
# Start a local Dask cluster and connect a client to it.
# processes=False asks Dask to run workers as threads inside this kernel process.
cluster = LocalCluster(processes=False)
client = Client(cluster)
# Display the client summary.
client

In [None]:
# NOTE(review): this rebinds `client`, replacing the LocalCluster client created in the
# previous cell, and connects to a hardcoded scheduler address/port. It only works when a
# scheduler is already listening there; run either this cell or the previous one, not both.
client = Client('127.0.0.1:36599')
client

In [None]:
# client.ncores() is used as a mapping: its length is taken as the worker count and the
# sum of its values as the total core count.
n_workers = len(client.ncores())
n_cores = sum(client.ncores().values())
print(n_workers, n_cores)

In [None]:
# Make the local `bllb` helper package importable by putting its directory first on sys.path.
# The location is relative to the notebook's working directory.
bllb_path = str(Path(r"../../../code/python/bllb").resolve())
sys.path.insert(0, bllb_path)
# NOTE(review): star import — `setup_logging` and `dbg`, used further down, are not defined in
# this notebook and presumably come from here; confirm and import them explicitly.
from bllb_logging import *
from bllb import ppiter  #, hash_utf8

# Logging configuration consumed by the start_log() call below.
LOG_ON = False
LOG_LEVEL = "WARNING"  #"DEBUG"
def start_log(enable=True, lvl='WARNING', std_lib=True):
    """Create the notebook logger via ``setup_logging`` and announce start-up.

    ``enable``, ``lvl`` and ``std_lib`` are forwarded to ``setup_logging`` unchanged.
    Returns the object that ``setup_logging`` produced.
    """
    logger = setup_logging(enable, lvl, std_lib=std_lib)
    logger.info('examinator logging started')
    return logger
# Build the logger from the configuration constants above.
log_on = LOG_ON
log_level = LOG_LEVEL
# Rebinds `log`: from here on it is the logger returned by start_log(), no longer the
# stdlib `logging` module alias imported earlier.
log = start_log(log_on, log_level, std_lib=True)

In [None]:
def md5_blocks(path, blocksize=1024 * 2048) -> str:
    """Return the MD5 hex digest of the file at ``path``, read in ``blocksize`` chunks.

    Directories are not hashed: a debug message is emitted and ``None`` is returned.
    Any exception raised while opening or reading the file is logged as a warning and
    ``None`` is returned instead of propagating.
    """
    path = Path(path)
    if path.is_dir():
        dbg(f'Item is a directory and will not be hashed.  {str(path)}')
        return
    try:
        digest = md5()
        with path.open('rb') as stream:
            # iter() with a b'' sentinel stops at the first empty read (end of file).
            for chunk in iter(lambda: stream.read(blocksize), b''):
                digest.update(chunk)
        return digest.hexdigest()
    except Exception as error:
        log.warning(
            f'Error trying to hash item: {str(path)}\nError:\n{error}')
        return
def glob_paths(path):
    """Return an iterable of ``Path`` objects found at ``path``.

    * Directory: a lazy recursive iterator over everything beneath it (``rglob('*')``);
      the directory itself is not included.
    * Anything else (file, missing path): a one-element list holding that path, so the
      result can always be iterated or fed to ``chain.from_iterable``.
    * On error: the exception is logged as a warning and an empty list is returned.
    """
    try:
        path = Path(path)
        if path.is_dir():
            return path.rglob('*')
        # Wrap the single path: a bare Path is not iterable and would break callers
        # that chain the results of several base paths together.
        return [path]
    except Exception as error:
        log.warning(error)
        return []

In [None]:
def get_stat(path, opt_md5=True, opt_pid=False) -> dict:
    """Collect filesystem metadata for ``path`` into one flat dict.

    The record is assembled from:
      * the public, non-builtin members of ``path.lstat()`` (the ``st_*`` fields etc.);
      * the string form of every ``Path`` member whose name has no ``__`` and whose
        string form has no ``<`` (this filters out bound methods and similar);
      * the result of every zero-argument ``is_*`` predicate on the path, except ``is_mount``;
      * ``path``, ``path_hash`` (``hash_utf8`` of the path string) and the timestamps
        ``f_atime`` / ``f_ctime`` / ``f_mtime`` as ``datetime`` objects;
      * ``md5`` (result of ``md5_blocks``) when ``opt_md5`` is true and the path is not a directory.

    Args:
        path: Path-like object or string to inspect.
        opt_md5: When true, hash file contents with ``md5_blocks``.
        opt_pid: When true, log the OS process id at debug level.

    Returns:
        The metadata dict, or ``{'path': str(path)}`` if collecting it failed
        (the error is logged as a warning).
    """
    log.debug(path)
    try:
        path = Path(path)
        info = {
            name: value
            for name, value in inspect.getmembers(path.lstat())
            if not name.startswith('_') and not inspect.isbuiltin(value)
        }
        # Introspect the Path object once and reuse the member list for both passes.
        path_members = inspect.getmembers(path)
        info.update({
            name: str(value)
            for name, value in path_members
            if '__' not in name and '<' not in str(value)
        })
        for name, _member in path_members:
            if not name.startswith('is_') or name == 'is_mount':
                continue
            try:
                info[str(name)] = methodcaller(name)(path)
            except TypeError as error:
                # Some predicates need an argument (e.g. is_relative_to on Python 3.9+).
                # Skip them instead of discarding the whole record.
                log.debug(f'Skipping {name}: {error}')
        info['path'] = str(path)
        info['path_hash'] = hash_utf8(str(path))
        info['f_atime'] = dt.datetime.fromtimestamp(info['st_atime'])
        info['f_ctime'] = dt.datetime.fromtimestamp(info['st_ctime'])
        info['f_mtime'] = dt.datetime.fromtimestamp(info['st_mtime'])
        if opt_md5:
            if not path.is_dir():
                try:
                    info['md5'] = md5_blocks(path)
                except Exception:
                    # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
                    log.warning(f'Could not hash item: {str(path)}')
            else:
                log.debug(f'Item is a directory and will not be hashed.  {str(path)}'
                    )
        if opt_pid:
            log.debug(f"working using OS pid: {os.getpid()}, opt_pid: {opt_pid}")
        return info
    except Exception as error:
        log.warning(error)
        return {'path': str(path)}

In [None]:
# Smoke test: collect metadata for the current working directory and display the dict.
path = Path('.')
get_stat(path)

In [None]:
def flatten(lists):
    """Flatten arbitrarily nested ``list`` objects into one flat list.

    Only ``list`` instances are descended into; tuples, strings and other values are
    kept as single items. Item order is preserved.

    Written as a plain loop so it does not depend on ``functools.reduce`` (imported only
    in a later cell) and does not copy the accumulator on every step.
    """
    flat = []
    for item in lists:
        if isinstance(item, list):
            flat.extend(flatten(item))
        else:
            flat.append(item)
    return flat

In [None]:
from functools import reduce

In [None]:
%%time
# Inline prototype of a `proc_paths` function (see the commented-out def below):
# stat every path under the base paths on the Dask cluster and build a DataFrame.
basepaths = ['..']
opt_md5=False
#def proc_paths(basepaths, opt_md5=True):
"""proc_paths uses Dask client to map path_stat over basepaths."""
# Lazily chain the per-base-path results of glob_paths into one stream of paths.
paths = chain.from_iterable(map(glob_paths, basepaths))
# Pre-bind the options so the cluster only has to supply the path argument.
pstat = partial(get_stat, opt_md5=opt_md5, opt_pid=True)
results = client.map(pstat, paths)
# Block on every submitted task and collect the returned dicts.
data = [_.result() for _ in results]
# NOTE(review): the pandas frame is wrapped in a Dask DataFrame and immediately computed
# back, so `df` should hold the same rows as pd.DataFrame(data) — confirm the round trip is needed.
ddf = dd.from_pandas(pd.DataFrame(data), npartitions=4)
df = ddf.compute()
#df['idx'] = df.index
#df['path_hash'] = df.path.map(str).map(hash_utf8)
#times = df.loc[:, ['idx', 'path', 'f_ctime', 'f_mtime', 'f_atime']].melt(id_vars=['idx', 'path'])

In [None]:
# Root of the traversal experiment below: the parent of the working directory.
path = Path('..')
basepaths = [str(path)]
def proc_item(path):
    """Recursively list a directory as a nested structure.

    Returns a list holding the string path of every direct child of ``path`` (files and
    directories), followed by one nested list, built the same way, for each child directory.

    The directory is scanned once and the listing reused, rather than calling
    ``iterdir()`` twice, which did double the I/O and could give inconsistent results
    if the directory changed between scans.
    """
    children = list(path.iterdir())
    names = [str(child) for child in children]
    nested = [proc_item(child) for child in children if child.is_dir()]
    return names + nested
# Compare the size of the nested proc_item() result with pathlib's recursive glob.
result = proc_item(path)
# Total number of entries found by rglob (reference count).
print(len([*path.rglob('*')]))
# Length of the top level of the nested result only.
print(len(result))
# Shallow copy of the nested result; rebinds the notebook-wide name `results`.
results = [*result]
print(len(results))
# Flatten the nested lists and print the flattened length for comparison with the rglob count.
final = flatten(results)
print(len(final))

In [None]:
def flatten(lists):
    """Walk an iterable of ``Path`` objects and return every non-directory as a string.

    Directories are descended into recursively via ``iterdir()``; they do not appear in
    the output themselves.
    """
    collected = []
    for entry in lists:
        if entry.is_dir():
            collected.extend(flatten(entry.iterdir()))
        else:
            collected.append(str(entry))
    return collected
# Demo: list every file (as a string) beneath the current working directory.
flatten(Path('.').iterdir())

In [None]:
def is_iter(item):
    """Return True when ``item`` is a directory, i.e. something to descend into."""
    return item.is_dir()


def rfunc(res, x):
    """Reducer step: append ``x`` (or, for a directory, its flattened contents) to ``res``."""
    if is_iter(x):
        return res + flatten(x.iterdir())
    return res + [str(x)]


def flatten(iterator):
    """Fold an iterable of ``Path`` objects into a flat list of file path strings."""
    return reduce(rfunc, iterator, [])
# Demo: same traversal of the working directory using the reducer-based variant.
flatten(Path('.').iterdir())

In [None]:
import operator
def is_iter(item):
    """Return True when ``item`` is a directory and should be descended into."""
    return item.is_dir()


def get_kids(parent):
    """Return an iterator over the direct children of directory ``parent``."""
    return parent.iterdir()


def get_val(item):
    """Return the flattened contents of a directory, or ``[item]`` for anything else."""
    return flatten(get_kids(item)) if is_iter(item) else [item]


def flatten(iterator):
    """Return a flat list of every non-directory ``Path`` reachable from ``iterator``.

    Unlike the earlier variants in this notebook, items are returned as ``Path`` objects
    rather than strings.

    The accumulator is extended in place; the previous
    ``partial(operator.add, results)(...)`` form was an indirect list concatenation that
    copied the whole list on every iteration.
    """
    results = []
    for item in iterator:
        results.extend(get_val(item))
    return results
# Demo: flatten the working directory; the result is unpacked into a new list for display.
[*flatten(Path('.').iterdir())]

In [None]:
def get_dir(d):
    """List the direct children of directory ``d`` as strings.

    Returns ``None`` when ``d`` is not a directory.
    """
    candidate = Path(d)
    if not candidate.is_dir():
        return None
    return list(map(str, candidate.iterdir()))
# Demo: list the direct children of the working directory.
get_dir('.')

In [None]:
from queue import Queue
from threading import Thread

def multiplex(n, q, **kwargs):
    """ Fan one queue out into ``n`` queues that each receive every item

    A daemon thread is started that reads from ``q`` forever and puts each item on all
    output queues. ``kwargs`` are forwarded to every ``Queue`` constructor.

    >>> q1, q2, q3 = multiplex(3, in_q)
    """
    fan_out = [Queue(**kwargs) for _ in range(n)]

    def _broadcast():
        # Never returns; the thread is a daemon so it does not keep the process alive.
        while True:
            item = q.get()
            for target in fan_out:
                target.put(item)

    worker = Thread(target=_broadcast, daemon=True)
    worker.start()
    return fan_out

def push(in_q, out_q):
    """Forward every item taken from ``in_q`` onto ``out_q``.

    Loops forever and never returns, so it is meant to run in its own thread.
    """
    while True:
        out_q.put(in_q.get())

def merge(*in_qs, **kwargs):
    """ Funnel several queues into a single output queue

    One daemon thread per input queue runs ``push`` to copy its items onto the output
    queue. ``kwargs`` are forwarded to the output ``Queue`` constructor.

    >>> out_q = merge(q1, q2, q3)
    """
    out_q = Queue(**kwargs)
    for source in in_qs:
        Thread(target=push, args=(source, out_q), daemon=True).start()
    return out_q

# TODO:
- Progress display
 - Create additional copy q, thread func to count items in q
 - Terminate main loop and threads upon completion rather than count down/time out
 - itertoolz.count glob?
- Progress persistence, resumption
- Result persistence, db storage

In [None]:
from queue import Queue
from threading import Thread
from time import sleep

In [None]:
# Wire up a queue-based pipeline: paths enter `q`, are fanned out to a directory-listing
# branch and a stat branch, and the results are gathered back into local queues.
# NOTE(review): this passes Queue objects to client.scatter/map/gather; confirm the
# installed dask.distributed version supports queues as inputs.
q = Queue()
remote_q = client.scatter(q)
# Duplicate the scattered stream: q1 feeds directory listing, q2 feeds get_stat.
q1, q2 = multiplex(2, remote_q)
list_q = client.map(get_dir, q1)
l_q = client.gather(list_q)

opt_md5 = True

# Stat branch; rebinds the notebook-wide `pstat`, this time with hashing enabled.
pstat = partial(get_stat, opt_md5=opt_md5, opt_pid=False)
q3 = client.map(pstat, q2)
# Duplicate the stat results: q4 is gathered into result_q here, q5 is kept for a later cell.
q4, q5 = multiplex(2, q3)
result_q = client.gather(q4)

# Queues polled for activity by the driver loop in a later cell (q5 is not included).
qs = [q, remote_q, q1, q2, list_q, l_q, q3, q4, result_q]

In [None]:
def load_dir(from_q, to_q, stop=False, limit=300, rest=.1):
    """Move the items of every list found on ``from_q`` onto ``to_q``.

    The loop keeps running while ``from_q`` has items, or while it has neither been asked
    to stop nor used up its idle budget. Each empty poll costs one unit of budget and
    sleeps ``rest`` seconds; each item taken restores one unit, capped at ``limit``.
    Items taken from ``from_q`` that are not lists are dropped.

    Args:
        from_q: Queue yielding lists of items (e.g. directory listings).
        to_q: Queue that receives the individual items.
        stop: Either a zero-argument callable returning truthy to request a stop, or a
            plain flag value. The default ``False`` was previously called as a function
            and raised ``TypeError``; non-callables are now wrapped.
        limit: Number of consecutive empty polls tolerated before giving up.
        rest: Seconds to sleep after an empty poll.
    """
    should_stop = stop if callable(stop) else (lambda: stop)
    idle_budget = limit
    while (idle_budget and not should_stop()) or from_q.qsize():
        if from_q.qsize():
            batch = from_q.get()
            if isinstance(batch, list):
                for item in batch:
                    to_q.put(item)
            idle_budget = min(idle_budget + 1, limit)
        else:
            idle_budget -= 1
            sleep(rest)
    if should_stop():
        print('load_dir stopped.')
    elif not idle_budget:
        print('load_dir stopped by i')

In [None]:
def unloadq(q, stop, limit=2000, rest=.1, check=100):
    """Drain ``q`` into a list and return it.

    Polling continues while ``q`` has items, or while ``stop()`` is falsy and the idle
    budget is not exhausted. An empty poll costs one unit of budget and sleeps ``rest``
    seconds; every item taken restores one unit, capped at ``limit``. Progress is printed
    every ``check`` loop iterations and whenever the budget is a multiple of ``check``
    after an empty poll.
    """
    budget = limit
    iterations = 0
    collected = []
    while (budget and not stop()) or q.qsize():
        iterations += 1
        if iterations % check == 0:
            print(budget, iterations, len(collected))
        if not q.qsize():
            # Nothing available: spend idle budget and wait a little.
            budget -= 1
            if budget % check == 0:
                print(budget)
            sleep(rest)
            continue
        collected.append(q.get())
        budget = min(budget + 1, limit)
    if stop():
        print('Unloadq stopped.')
    elif not budget:
        print('Unloadq stopped by i')
    return collected

In [None]:
# Troubleshooting problem with thread not running

#from concurrent.futures import ThreadPoolExecutor
# Shared stop flag: the lambda reads the module-level variable each time it is called, so
# setting stop_threads = True at the end of the cell signals the worker thread.
stop_threads = False
stop = lambda: stop_threads
# NOTE(review): basepaths is assigned but nothing in this cell puts it on `q`.
basepaths = ['.']
#t = ThreadPoolExecutor()
# Background worker moving directory listings from l_q back onto the input queue q.
thread = Thread(target=load_dir, args=(l_q, q, stop,), daemon = True)
thread.start()
i = 100
# `(True or ...)` is always True, so the loop is governed by the countdown `i` alone.
while (True or l_q.qsize()) and i:
    # NOTE(review): no timeout is passed; if l_q.get() blocks on an empty queue this
    # loop can hang — confirm.
    item = l_q.get()
    # A truthy item adds one to the countdown before the unconditional decrement below.
    if item:
        i = min(i+1, 100)
    print(item)
    i = max(i-1, 0)
    if i % 10 == 0:
        print(l_q.qsize(), i)
    sleep(.1)
stop_threads = True

In [None]:
#load_thread = Thread(target=load_dir, args=(l_q, q,), daemon = True)
#load_thread.start()

# Driver: seed the pipeline with the base paths, run the loader and the result collector
# on a thread pool, and poll the queues until they have stayed empty for `ilimit` polls.
from concurrent.futures import ThreadPoolExecutor
basepaths = ['.']
with ThreadPoolExecutor() as t:
    # Shared stop flag read lazily by the worker functions through the lambda.
    stop_threads = False
    stop = lambda: stop_threads
    t_load_dir = t.submit(load_dir, l_q, q, stop)
    print(t_load_dir.running())
    # Seed the input queue with the absolute form of each base path
    # (list comprehension used only for its side effect).
    [q.put(str(Path(path).resolve())) for path in basepaths]
    #l_q.put(basepaths)
    results_future = t.submit(unloadq, result_q, stop, limit=300)
    ilimit = 10
    i = ilimit
    # Parsed as `(True and i) or alive`. `alive` is first assigned inside the loop; the
    # first test short-circuits on the truthy `i`, so `alive` is not read before it exists.
    while True and i or alive:
        # Total number of items currently sitting in any of the tracked queues.
        alive = sum([_q.qsize() for _q in qs])
        if alive:
            i = min(i+1, ilimit)
            print(alive, i)
            print(t_load_dir.running())
        else:
            i -= 1
            print(f'i: {i}')
        sleep(.1)
    # Signal both workers to finish, then wait for the collected results.
    stop_threads = True
    #results_list = unloadq(result_q, limit=300)
    results_list = results_future.result()
    # Rebinds the notebook-wide `results` to a DataFrame of the collected records.
    results = pd.DataFrame(results_list)
    # DataFrame.info() prints its report and returns None, so this also prints "None".
    print(results.info())
#t.shutdown(False)
#del(load_thread)
# Size of the second copy of the stat results, which a later cell consumes.
print(q5.qsize())

In [None]:
# Show how many items remain in each tracked queue.
[_q.qsize() for _q in qs]

In [None]:
# Attempting to utilize Dask df to handle, export data

def iterq(q):
    """Yield items from ``q`` until its reported size drops to zero.

    The size is re-checked before each item is taken; the generator ends at the first
    moment the queue reports itself empty.
    """
    while True:
        if not q.qsize():
            return
        yield q.get()

# Gather the second copy of the stat results (q5), turn it into a DataFrame, and round-trip
# it through CSV files via Dask.
print(q5.qsize())
result_count = q5.qsize()
data = client.gather(q5)
# Wait until the gathered queue reports at least as many items as q5 did before gathering.
# NOTE(review): there is no timeout; if items never arrive this loops forever.
while data.qsize() < result_count:
    print('sleeping')
    sleep(.1)
print(data.qsize())
# Drain the gathered queue into a list of records.
iterdata = [*iterq(data)]
print(len(iterdata))
# Rebinds the notebook-wide `df`.
df = pd.DataFrame(iterdata)
print(len(df))
ddf = dd.from_pandas(df, npartitions=4)
# NOTE(review): scatters the Dask DataFrame and reads it straight back with .result();
# confirm this is needed before writing.
remote_ddf = client.scatter(ddf)
remote_result = remote_ddf.result()
# The '*' in the file name is kept in both the write and the read pattern.
remote_result.to_csv('./export4-*.csv')
new_ddf = dd.read_csv('./export4-*.csv')
new_ddf.compute()

In [None]:
%time
from toolz import itertoolz
print(itertoolz.count(Path('../..').rglob('*')))

In [None]:
# Plot, per day, how many items have a creation/modification/access timestamp on that day.
# NOTE(review): `results` must be the DataFrame of get_stat records; other cells rebind
# `results` to plain lists, so this cell depends on execution order.
times = ['f_ctime', 'f_mtime', 'f_atime']
# Mutates `results` in place: names the index and copies it into an 'id' column.
results.index.name = 'id'
results['id'] = results.index
# Every column except the three timestamp columns becomes an identifier column for melt.
cols = list(set(results.columns) - set(times))
print(len(cols))
print(cols)
# Long format: one row per (item, timestamp kind); the timestamp value becomes the index.
stacked = pd.melt(results, id_vars=cols, value_vars=times, value_name='time').set_index('time')
print(stacked.columns)
# Count rows per timestamp kind per day ('D'), then pivot the kinds into columns.
unstacked = stacked.loc[:, ['variable']].groupby('variable').resample('D').count().unstack('variable')
unstacked.plot(kind='bar', stacked=True)

In [None]:
# Number of direct children of the parent directory, then their names.
len(os.listdir('..'))
os.listdir('..')

In [None]:
def r_listdir(src):
    """Recursively collect the paths of all non-directory entries under ``src``.

    Built on ``os.listdir``; each returned path is ``src`` joined with the relative
    location of the entry. Directories themselves are not included.
    """
    found = []
    for entry in os.listdir(src):
        full = os.path.join(src, entry)
        if not os.path.isdir(full):
            found.append(full)
            continue
        found.extend(r_listdir(full))
    return found
# Number of files found beneath the working directory by the os.listdir-based walker.
len(r_listdir('.'))

In [None]:
# Number of direct children of the parent directory, via os.scandir.
len([*os.scandir('..')])

In [None]:
def r_scandir(src):
    """Recursively collect the paths of all non-directory entries under ``src``.

    Built on ``os.scandir``. The scandir iterator is used as a context manager so its
    directory handle is closed deterministically, and the listing is materialised before
    recursing so only one handle is open at a time. Directory checks use
    ``DirEntry.is_dir()``, which can reuse information gathered during the scan instead
    of issuing a separate ``os.path.isdir`` call per entry.
    """
    with os.scandir(src) as scanner:
        entries = list(scanner)
    results = []
    for entry in entries:
        if entry.is_dir():
            results += r_scandir(entry.path)
        else:
            results.append(entry.path)
    return results
# Number of files found beneath the working directory by the os.scandir-based walker.
len(r_scandir('.'))

In [None]:
import glob

In [None]:
# Recursive glob from the working directory; '**/?*' matches names of at least one character.
# Rebinds the notebook-wide `results` to a list of path strings.
results = glob.glob('**/?*', recursive=True)
len(results)
results

In [None]:
from os import walk

def r_walk(mypath):
    """Return the joined path of every file that ``os.walk`` reports under ``mypath``.

    Only the ``filenames`` part of each walk step is used, so directories are not listed.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in walk(mypath)
        for filename in filenames
    ]
# Walk the working directory with the os.walk-based helper; rebinds `results` again.
results = r_walk('.')
len(results)
results

In [None]:
import fnmatch

In [None]:
def r_path(path):
    """Return the string path of every non-directory found by ``Path(path).rglob('*')``."""
    found = []
    for candidate in Path(path).rglob('*'):
        if candidate.is_dir():
            continue
        found.append(str(candidate))
    return found
# Number of files found beneath the working directory by the pathlib-based walker.
len(r_path('.'))

In [None]:
# Benchmark the four recursive listing helpers against the same tree.
# NOTE(review): hardcoded absolute path — these timings only run on a machine where this
# directory exists; this also rebinds the notebook-wide `path` to a plain string.
path = '/data/OneDrive/Documents/projects/bookmarks/'
%timeit len(r_listdir(path))
%timeit len(r_scandir(path))
%timeit len(r_walk(path))
%timeit len(r_path(path))

In [None]:
# Compare the os.walk and pathlib results on the grandparent directory as sets of path strings.
path = '../..'
rw_set = set(r_walk(path))
len(rw_set)
rp_set = set(r_path(path))
len(rp_set)

In [None]:
# os.walk variant: list length versus set size (a difference would indicate duplicate paths).
rw = r_walk(path)
len(rw)
rw_set = set(rw)
len(rw_set)

In [None]:
# pathlib variant: list length versus set size (a difference would indicate duplicate paths).
rp = r_path(path)
len(rp)
rp_set = set(rp)
len(rp_set)

In [None]:
# Single-run timings of the listdir- and scandir-based walkers, including set construction.
%time len(set(r_listdir(path)))
%time len(set(r_scandir(path)))

In [None]:
# Unique path counts for the listdir- and scandir-based walkers on the same tree.
rl = r_listdir(path)
rl_set = set(rl)
len(rl_set)
rs = r_scandir(path)
rs_set = set(rs)
len(rs_set)

In [None]:
# Paths reported by the os.walk variant that the pathlib variant did not report.
rw_set - rp_set

In [None]:
# Non-recursive listing: take only the first step of os.walk (the top directory). The default
# tuple passed to next() yields an empty file list if the walk produces nothing.
# Note: assigning to `_` shadows IPython's "last output" variable.
mypath = '.'
_, _, filenames = next(walk(mypath), (None, None, []))
len(filenames)
filenames

In [None]:
# Same first-step listing via os.walk, but without a default: next() raises StopIteration
# if the walk yields nothing. Relies on `mypath` from the previous cell.
(_, _, filenames) = next(os.walk(mypath))
len(filenames)
filenames