In [1]:
# IMPORT - MAKE REF TABLE
# dependencies
from os.path import isdir
from pathlib import Path
import hashlib
import re
import pandas as pd

# support methods
def getfiles(dirname, fext='csv'):
    """Recursively collect the files under `dirname` whose names end in `.{fext}`.

    Parameters
    ----------
    dirname : str or path-like
        Directory to search; it must already exist.
    fext : str, default 'csv'
        File extension to match, given without the leading dot.

    Returns
    -------
    list of pathlib.Path
        Matching regular files in sorted order.

    Raises
    ------
    AssertionError
        If `dirname` is not an existing directory.
    """
    assert isdir(dirname), f"`{dirname}` is not an existing directory."
    # rglob yields entries in filesystem order, which differs between machines;
    # sorting keeps everything built from this list reproducible across runs.
    # Directories whose names happen to match the pattern are skipped because
    # they cannot be opened and read like a data file.
    return sorted(path for path in Path(dirname).rglob(f'*.{fext}') if path.is_file())


def hashid(fname):
    if pd.isna(fname): return None
    with open(fname, 'rb') as f:
        digest = hashlib.file_digest(f, "sha1")
    return digest.hexdigest()[:8]


def get_reftable(files):
    """Build the reference table describing each input file.

    Parameters
    ----------
    files : iterable of pathlib.Path
        Files to describe. Each file name must contain `Data_<YYYY>_`, from
        which the 4-digit year is taken.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns `fileid` (from `hashid`),
        `filepath`, `filename` and `fileyear` (a string). The columns are
        present even when `files` is empty.

    Raises
    ------
    IndexError
        If a file name does not contain the `Data_<YYYY>_` pattern.
    """
    columns = ['fileid', 'filepath', 'filename', 'fileyear']
    reference = []
    for file in files:
        years = re.findall("Data_([0-9]{4})_", file.name)
        if not years:
            # Same exception type as indexing the empty match list would give,
            # but the message identifies which file broke the naming convention.
            raise IndexError(
                f"cannot extract a 4-digit year from `{file.name}`; "
                "expected the file name to contain `Data_<YYYY>_`.")
        reference.append({
            'fileid': hashid(file),
            'filepath': file,
            'filename': file.name,
            'fileyear': years[0],
        })
    # Passing the column list keeps the schema stable for an empty `files`.
    return pd.DataFrame(reference, columns=columns)

# main
# Find every CSV under the input directory (searched recursively) and build
# the reference table; `ref` is consumed by the following cell.
indir = "../input"
files = getfiles(indir)
ref = get_reftable(files)

In [2]:
### IMPORT - READ DATA FILES
# dependencies
import yaml

# support methods
def readyaml(fname):
    """Parse the YAML file at `fname` with `yaml.safe_load` and return the result.

    Parameters
    ----------
    fname : str or path-like
        Path of the YAML file to read.

    Returns
    -------
    object
        Whatever `yaml.safe_load` produces for the document.
    """
    # Open in binary mode so PyYAML performs its own encoding detection rather
    # than the text being decoded with the platform's locale encoding, which
    # can mis-decode non-ASCII content on some systems.
    with open(fname, 'rb') as f:
        data = yaml.safe_load(f)
    return data


def cleanname(colname):
    """Return `colname` converted to lowercase."""
    lowered = colname.lower()
    return lowered


def readfile(row):
    """Load one CSV described by a reference-table row and tag it with file metadata.

    `row.filepath` is read with `pd.read_csv`, every column name is passed
    through `cleanname`, and the row's `fileyear`, `fileid` and `filename`
    values are added as constant columns. Returns the resulting DataFrame.
    """
    frame = pd.read_csv(row.filepath)
    # rename applies the callable to each column label
    frame = frame.rename(columns=cleanname)
    for meta in ('fileyear', 'fileid', 'filename'):
        frame[meta] = row[meta]
    return frame


def checkasserts(df, rules):
    """Validate `df` against the data-quality rules in `rules`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to validate.
    rules : dict
        May contain any of the following sections; absent or empty sections
        are skipped:

        - 'no_missing': list of columns that must have no missing values.
        - 'completion_rate': {column: minimum fraction of non-missing rows}.
        - 'nunique': {column: exact number of distinct values}. A missing
          value counts as one distinct value here.
        - 'unique_rate': {column: minimum ratio of distinct non-missing
          values to non-missing values}.

    Returns
    -------
    int
        1 when every rule passes.

    Raises
    ------
    AssertionError
        On the first rule that fails. Raised explicitly rather than through
        `assert` so the checks still run under `python -O`.
    """
    rules = rules or {}

    for col in rules.get('no_missing') or []:
        found = df[col].isna().sum()
        if found != 0:
            raise AssertionError(
                f"expected 100% completion rate for `{col}` but found {found} missing values.")

    for col, rate in (rules.get('completion_rate') or {}).items():
        found = df[col].notna().mean()
        # `not found >= rate` (instead of `found < rate`) also rejects NaN,
        # which is what an empty frame produces.
        if not found >= rate:
            raise AssertionError(
                f"expected >= {rate*100:.1f}% completion rate for `{col}` but found {found*100:.1f}%.")

    for col, nunique in (rules.get('nunique') or {}).items():
        found = len(df[col].unique())
        if found != nunique:
            raise AssertionError(
                f"expected {nunique} nunique values for `{col}` but found {found}.")

    for col, rate in (rules.get('unique_rate') or {}).items():
        present = df[col].notna().sum()
        if present == 0:
            # No non-missing values: nothing can be duplicated, and the ratio
            # would otherwise be a division by zero.
            continue
        # Series.nunique() ignores missing values, matching the denominator.
        # Counting NaN as a distinct value would let one missing entry hide
        # one duplicate.
        found = df[col].nunique() / present
        if not found >= rate:
            raise AssertionError(
                f"expected >= {rate*100:.1f}% unique rate for `{col}` but found {found*100:.1f}%.")

    return 1


def readconcat(ref, rules):
    """Read every file listed in `ref`, stack the results and validate them.

    Each row of `ref` is handed to `readfile`; the per-file frames are
    concatenated with `pd.concat`, and the combined frame is checked with
    `checkasserts` against `rules` before being returned.
    """
    frames = []
    for pos in range(len(ref)):
        frames.append(readfile(ref.iloc[pos]))
    combined = pd.concat(frames)
    checkasserts(df=combined, rules=rules)
    return combined

# main
# Load the validation rules, then read, combine and validate every file in `ref`.
rules = readyaml("../hand/rules.yml")
data = readconcat(ref=ref, rules=rules)

# Convert `filepath` to plain strings before writing the reference table.
# NOTE(review): presumably required because the column holds pathlib.Path
# objects that parquet cannot serialize — confirm.
ref.filepath = ref.filepath.astype(str)
ref.to_parquet("../output/reference.parquet")
data.to_parquet("../output/data.parquet")

In [3]:
# Show a randomly sampled row, transposed so each column is displayed on its own line.
# NOTE(review): no random_state is passed, so the row shown differs between runs.
data.sample().T

Unnamed: 0,273249
record_id,7526389
call_type,952
description,INCOMPLETE CALL FOR POLICE
report_created,N
location,4XX REDWOOD AVE
police_district,2
beat,2C
day_of_week,Sat
cleared_by,X
cleared_by_desc,CANCEL


In [4]:
# Summarize the combined frame: index range, column dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2088735 entries, 0 to 379016
Data columns (total 22 columns):
 #   Column              Dtype  
---  ------              -----  
 0   record_id           int64  
 1   call_type           object 
 2   description         object 
 3   report_created      object 
 4   location            object 
 5   police_district     object 
 6   beat                object 
 7   day_of_week         object 
 8   cleared_by          object 
 9   cleared_by_desc     object 
 10  occurrence_date_pt  object 
 11  received_date_pt    object 
 12  dispatch_date_pt    object 
 13  enroute_date_pt     object 
 14  at_scene_date_pt    object 
 15  clear_date_pt       object 
 16  objectid            int64  
 17  x                   float64
 18  y                   float64
 19  fileyear            object 
 20  fileid              object 
 21  filename            object 
dtypes: float64(2), int64(2), object(18)
memory usage: 366.5+ MB
