In [None]:
# default_exp core

# Data

> The original raw data comes in CSV format, with every year of data in a separate file. In this module, we construct a schema that adheres to the [Frictionless Data specifications](https://frictionlessdata.io/specs/), validate each table and erase problematic entries.

In [None]:
# export
import os
import subprocess
from itertools import islice
from pathlib import Path

## Original data

Create symlinks from data location to "./in".

## Preprocessing

### Convert to UTF-8

The original data come in an unknown encoding. Make a best guess at the encoding and save the result as UTF-8.

### Validate against schema

[Schema spec](https://frictionlessdata.io/specs/table-schema/)

That code is in a separate repo, and is possibly outdated. For now download validated and cleaned files from GCS.

```
mkdir -p data/csv
gsutil -m cp -r gs://info-group-corr/*.csv data/csv/
```

The last 2 years (2016 and 2017) are missing.

The schema is a separate JSON file, and can be used to assign correct data types when importing into BigQuery, Stata, pandas or any other dtype aware format.

In [None]:
# notest
# hide

# Separate schemas into separate files, one file per year.
# Some columns are missing in certain years, and this is captured in "field_lists" fields of the JSON schema.

import json

def make_schema(y, schema_path='data/csv/schema.json'):
    '''Prepare year-specific schema.

    Loads the combined JSON schema from `schema_path`, keeps only the entries
    of its "fields" list whose "name" appears in the "field_lists" entry that
    covers year `y`, and drops the "field_lists" key from the result.

    Parameters
    ----------
    y : year to build the schema for; it is matched with `in` against the
        "years" value of each "field_lists" entry.
    schema_path : location of the combined schema JSON file.

    Returns
    -------
    dict with the same content as the combined schema, except that "fields"
    is filtered for year `y` and "field_lists" is removed.

    Raises
    ------
    ValueError
        If no "field_lists" entry covers year `y`.
    '''
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(schema_path, encoding='utf-8') as schema_file:
        schema = json.load(schema_file)

    # The first "field_lists" entry that mentions the year wins.
    used_fields = None
    for field_list in schema['field_lists']:
        if y in field_list['years']:
            used_fields = field_list['fields']
            break
    if used_fields is None:
        raise ValueError(
            f'year {y!r} is not covered by any "field_lists" entry in {schema_path}')

    # Preserve the original ordering of "fields" while filtering.
    schema['fields'] = [field for field in schema['fields']
                        if field['name'] in used_fields]
    del schema['field_lists']
    return schema

# Write one schema file per year, 1997 through 2015, next to the CSV files.
for year in range(1997, 2016):
    target = f'data/csv/{year}_schema.json'
    with open(target, 'w') as sink:
        sink.write(json.dumps(make_schema(year)))

## Extracts

It is handy to have smaller files for testing. Multiple approaches can be used to extract subsets in longitudinal data.

### First 100k records

This is probably unnecessary, because `pd.read_csv(nrows=100000)` does just that.

In [None]:
# export
 
def extract_100k(dir_in='./out/valid', dir_out='./out/extracts/100k', n_records=100_000):
    """Create extract with header and first 100k records.

    For every ``*.csv`` file directly inside `dir_in`, write a file of the
    same name into `dir_out` containing the first ``n_records + 1`` physical
    lines of the input (one line for the header plus `n_records` lines).
    Lines are counted and copied as raw bytes, so the file's encoding is left
    untouched. A record that spans several physical lines counts as several
    lines.

    Parameters
    ----------
    dir_in : directory that holds the input CSV files.
    dir_out : directory to write extracts to; created if it does not exist.
    n_records : number of lines to keep after the first (header) line.
    """
    dir_in = Path(dir_in)
    dir_out = Path(dir_out)
    # Opening the output file fails if its parent directory is missing.
    dir_out.mkdir(parents=True, exist_ok=True)
    for fn_in in dir_in.glob('*.csv'):
        fn_out = dir_out / fn_in.name
        # Binary mode keeps the bytes identical to the source.
        with open(fn_in, 'rb') as fin, open(fn_out, 'wb') as fout:
            fout.writelines(islice(fin, n_records + 1))

In [None]:
# notest
# Build the extracts on disk: one output file per CSV found in ./out/valid.
extract_100k()