In [None]:
# default_exp core

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# export
import os
import subprocess
from pathlib import Path

# Raw data

> Original raw data comes in CSV format, with every year of data in a separate file. In this module, we construct a schema that adheres to the [Frictionless Data specifications](https://frictionlessdata.io/specs/), validate each table and erase problematic entries.

## Original data

Create symlinks from data location to "./in".

## Validate against schema

That code is in a separate repo, and is possibly outdated. For now download validated and cleaned files from GCS.

```
mkdir -p out/valid
gsutil -m cp -r gs://info-group-corr/* out/valid/
```

The last 2 years (2016 and 2017) are missing.

Another problem is that the original data come in an unknown encoding. Validated files are saved in UTF-8, which also reduces their byte size.

## Extracts

It is handy to have smaller files for testing. Multiple approaches can be used to extract subsets in longitudinal data.

### First 100k records

In [None]:
# export
 
def extract_100k(dir_in='./out/valid', dir_out='./out/extracts/100k', n_records=100_000):
    """Create extracts with the header and the first `n_records` records of each CSV.

    For every ``*.csv`` file found directly in `dir_in`, a file with the same
    name is written to `dir_out`, holding the first ``n_records + 1`` lines of
    the source (one header line followed by `n_records` data lines). Records
    are counted as physical lines, exactly as the `head` utility sees them.

    Args:
        dir_in: directory containing the full CSV files.
        dir_out: directory that receives the extracts; created when missing.
        n_records: number of lines to keep after the header line.

    Raises:
        subprocess.CalledProcessError: if `head` exits with a non-zero status.
    """
    dir_in, dir_out = Path(dir_in), Path(dir_out)
    # Without this, open() below raises FileNotFoundError on a fresh checkout
    # where the extracts directory has never been created.
    dir_out.mkdir(parents=True, exist_ok=True)
    for fn_in in dir_in.glob('*.csv'):
        fn_out = dir_out / fn_in.name
        with open(fn_out, 'w') as fout:
            # "+ 1" keeps the header line; check=True turns a failing `head`
            # into an exception instead of a silently truncated extract.
            subprocess.run(['head', '-n', str(n_records + 1), str(fn_in)],
                           stdout=fout, check=True)

In [None]:
# notest
# Build the extracts: reads the CSV files in ./out/valid and writes the
# truncated copies to ./out/extracts/100k.
extract_100k()

## File size

To report the number of lines, is it faster to iterate through the file in Python or to use the system `wc` utility?

In [None]:
# export
def wc_py(fpath):
    """Return number of lines in a text file, using Python I/O.

    The file is read in binary mode, so counting never fails on bytes that are
    not valid in the locale's default encoding, and only ``\\n`` ends a line
    (``\\r\\n`` counts once, a lone ``\\r`` does not split a line).

    A final line without a trailing newline is still counted, so the result
    can be one higher than what ``wc -l`` reports for such a file.
    """
    with open(fpath, 'rb') as f:
        # Iterating a binary file yields one chunk per b'\n'-terminated line.
        return sum(1 for _ in f)

def wc_sys(fpath):
    """Return number of lines in a text file, using system 'wc' utility.

    Runs ``wc -l`` on `fpath` and parses the first whitespace-separated token
    of its output as the line count.

    Raises:
        subprocess.CalledProcessError: if `wc` exits with a non-zero status
            (for example when `fpath` does not exist); the captured stderr is
            available on the exception.
    """
    # check=True: without it a failing `wc` yields empty stdout and the parse
    # below dies with an unrelated IndexError.
    p = subprocess.run(['wc', '-l', fpath], capture_output=True, text=True, check=True)
    return int(p.stdout.split()[0])

In [None]:
# hide
# Sanity check: both line counters must agree on a small file.
# NOTE(review): wc_py counts a final line that lacks a trailing newline while
# `wc -l` counts newline characters, so this can differ by one if README.md
# does not end with a newline -- confirm.
fpath = './README.md'
assert wc_py(fpath) == wc_sys(fpath)

In [None]:
# notest
# Time both line counters on the same validated file.
fpath = './out/valid/2000.csv'
%time wc_py(fpath)
%time wc_sys(fpath)

CPU times: user 9 s, sys: 1.47 s, total: 10.5 s
Wall time: 10.4 s
CPU times: user 6.76 ms, sys: 0 ns, total: 6.76 ms
Wall time: 1.3 s


11169277

So it is faster to use the system `wc` utility.

In [None]:
# export
from hurry.filesize import size

def lsdir(fdir):
    """Describe every regular file located directly in :fdir:.

    Returns a list of tab-separated strings. The first item is the header
    "Name<TAB>Lines<TAB>Size"; each following item holds the file path
    (:fdir: joined with the file name), its line count as reported by
    `wc_sys`, and its size formatted by `size`. Rows are sorted by path;
    anything that is not a regular file is skipped.
    """
    candidates = (os.path.join(fdir, entry) for entry in os.listdir(fdir))
    files = sorted(path for path in candidates if os.path.isfile(path))

    rows = ['Name\tLines\tSize']
    for path in files:
        n_lines = wc_sys(path)
        human_size = size(os.path.getsize(path))
        rows.append(f'{path}\t{n_lines}\t{human_size}')
    return rows
