# InfoGroup data

> Process and prepare InfoGroup dataset.

## Processing

Starting from original CSV files.

- Convert to unicode
- Validate against the JSON schema. A few erroneous data entries are erased here (e.g. text in a numerical column). The existing implementation uses the datapackage validator and takes several days on a single core.
- Save to disk in parquet format.
- Provide interface to load single year of data. Allow filtering, column selection and small (optionally random) sample.

In [None]:
#default_exp infogroup

In [None]:
#export
import json
import gzip
import shutil

import numpy as np
import pandas as pd
import fastparquet
from IPython import display

from rurec import resources
from rurec.resources import Resource
from rurec import util

In [None]:
# Register four resources for every data year 1997-2017: the processed CSV,
# its schema, the processed parquet file, and the original raw CSV.
for y in range(1997, 2018):
    yearly_specs = (
        (f'infogroup/csv/{y}',
         f'/InfoGroup/data/processed/{y}.csv',
         f'Processed InfoGroup data, {y}, CSV format'),
        (f'infogroup/schema/{y}',
         f'/InfoGroup/data/processed/{y}_schema.json',
         f'Processed InfoGroup data, {y}, schema'),
        (f'infogroup/pq/{y}',
         f'/InfoGroup/data/processed/{y}.pq',
         f'Processed InfoGroup data, {y}, parquet format'),
        (f'infogroup/orig/{y}',
         f'/InfoGroup/data/original/raw/{y}_Business_Academic_QCQ.csv',
         f'Original unprocessed InfoGroup data, {y}'),
    )
    for res_key, res_path, res_description in yearly_specs:
        # The fourth positional argument is passed as False for every resource.
        resources.add(Resource(res_key, res_path, res_description, False))

## Clear and validate raw data

- Change "latin-1" encoding to "utf-8".
- Remove double quotes around every value.
- Rename columns to ALL_CAPS.
- Correct 2-digit state part of the FIPS code.
- Correct missing CBSA code and CBSA level, mainly in 2009.
- In 2009: pad string fields with zeroes.
- Validate values format, replace errors with missing values.


### Validate values in columns

Check whether values satisfy the given constraints; values that do not are replaced with missing values and reported in a log.

Constraints:
- STATE: enum
- ZIP: 5 digits
- ZIP4: 4 digits
- COUNTY_CODE: 3 digits, equal to the last 3 digits of FIPS_CODE
- AREA_CODE: 3 digits
- ID_CODE: enum
- SIC, SIC0-SIC4: 6 digits
- NAICS: 8 digits
- YEAR: equals data file year
- YP_CODE: 5 digits
- EMPLOYEES, SALES, PARENT_EMPLOYEES, PARENT_SALES: non-negative integer
- EMPLOYEES_CODE, SALES_CODE, PARENT_EMPLOYEES_CODE, PARENT_SALES_CODE: enum
- BUSINESS_STATUS: enum
- YEAR_EST: >= 1500, <= YEAR
- OFFICE_SIZE_CODE: enum
- HOLDING_STATUS: enum
- ABI: unique, notnull, 9 digits
- SUBSIDIARY_NUMBER: 9 digits
- PARENT_NUMBER: 9 digits
- SITE_NUMBER: 9 digits
- ADDRESS_TYPE: enum
- POPULATION_CODE: enum
- CENSUS_TRACT: 6 digits
- CENSUS_BLOCK: 1 digit
- LATITUDE, LONGITUDE: float, within US bounding box
- MATCH_CODE: enum
- CBSA_CODE: 5 digits
- CBSA_LEVEL: enum
- CSA_CODE: 3 digits
- FIPS_CODE: 5 digits



Potential additional validations:
- codes as enums (SIC, NAICS, FIPS, ...)
- CBSA_LEVEL consistent with CBSA_CODE
- geo variable consistency: lon-lat, nesting of areas

For some variables categories are known in advance: states, size codes etc.

For others, such as city, zip or NAICS, it might be beneficial to use categoricals for performance, but if the list of categories is taken from the data, it might change from year to year. If done, this should be done carefully.

In [None]:
#export
def validate_raw_strings(df):
    """Check the string-valued columns of raw InfoGroup data.

    Builds the per-column constraint table and hands it, together with `df`,
    to `util.validate_values`, whose result is returned unchanged
    (a list of dicts describing invalid values).
    """

    def digits(width):
        # Numeric string with a fixed number of characters.
        return {'number': True, 'nchar': width}

    def numeric():
        # Numeric string of any length.
        return {'number': True}

    def enum(values):
        # Closed set of allowed values; a fresh list is built for every column.
        return {'cats': list(values)}

    states = ('AK AL AR AZ CA CO CT DC DE FL GA HI IA '
              'ID IL IN KS KY LA MA MD ME MI MN MO MS '
              'MT NC ND NE NH NJ NM NV NY OH OK OR PA '
              'PR RI SC SD TN TX UT VA VI VT WA WI WV WY').split()
    # Single-letter size codes shared by the four *_CODE size columns.
    size_codes = 'ABCDEFGHIJK'

    constraints = {
        'STATE': enum(states),
        'ZIP': digits(5),
        'ZIP4': digits(4),
        'COUNTY_CODE': digits(3),
        'AREA_CODE': digits(3),
        'ID_CODE': enum('12'),
        'SIC': digits(6),
        'SIC0': digits(6),
        'SIC1': digits(6),
        'SIC2': digits(6),
        'SIC3': digits(6),
        'SIC4': digits(6),
        'NAICS': digits(8),
        'YEAR': {'notna': True, 'number': True},
        'YP_CODE': digits(5),
        'EMPLOYEES': numeric(),
        'SALES': numeric(),
        'PARENT_EMPLOYEES': numeric(),
        'PARENT_SALES': numeric(),
        'EMPLOYEES_CODE': enum(size_codes),
        'SALES_CODE': enum(size_codes),
        'PARENT_EMPLOYEES_CODE': enum(size_codes),
        'PARENT_SALES_CODE': enum(size_codes),
        'BUSINESS_STATUS': enum('1239'),
        'YEAR_EST': numeric(),
        'OFFICE_SIZE_CODE': enum('ABCDEF'),
        'HOLDING_STATUS': enum('012'),
        'ABI': {'unique': True, 'notna': True, 'number': True, 'nchar': 9},
        'SUBSIDIARY_NUMBER': digits(9),
        'PARENT_NUMBER': digits(9),
        'SITE_NUMBER': digits(9),
        'ADDRESS_TYPE': enum('FGHMPRSN'),
        'POPULATION_CODE': enum('156789'),
        'CENSUS_TRACT': digits(6),
        'CENSUS_BLOCK': digits(1),
        'LATITUDE': numeric(),
        'LONGITUDE': numeric(),
        'MATCH_CODE': enum('024PX'),
        'CBSA_CODE': digits(5),
        'CBSA_LEVEL': enum('12'),
        'CSA_CODE': digits(3),
        'FIPS_CODE': digits(5)
    }
    return util.validate_values(df, constraints)


def convert_dtypes(df):
    """Replace string columns of `df` with typed columns, modifying `df` in place.

    Count, year and coordinate columns are parsed with `pd.to_numeric`;
    enumerated columns become `pd.Categorical` with a fixed category list,
    so values outside the list turn into missing values. Returns None.
    """
    numeric_columns = (
        'YEAR', 'EMPLOYEES', 'SALES', 'PARENT_EMPLOYEES', 'PARENT_SALES',
        'YEAR_EST', 'LATITUDE', 'LONGITUDE',
    )
    for name in numeric_columns:
        df[name] = pd.to_numeric(df[name])

    states = ('AK AL AR AZ CA CO CT DC DE FL GA HI IA '
              'ID IL IN KS KY LA MA MD ME MI MN MO MS '
              'MT NC ND NE NH NJ NM NV NY OH OK OR PA '
              'PR RI SC SD TN TX UT VA VI VT WA WI WV WY').split()
    # Single-letter size codes shared by the four *_CODE size columns.
    size_codes = 'ABCDEFGHIJK'

    categories_by_column = {
        'STATE': states,
        'ID_CODE': list('12'),
        'EMPLOYEES_CODE': list(size_codes),
        'SALES_CODE': list(size_codes),
        'PARENT_EMPLOYEES_CODE': list(size_codes),
        'PARENT_SALES_CODE': list(size_codes),
        'BUSINESS_STATUS': list('1239'),
        'OFFICE_SIZE_CODE': list('ABCDEF'),
        'HOLDING_STATUS': list('012'),
        'ADDRESS_TYPE': list('FGHMPRSN'),
        'POPULATION_CODE': list('156789'),
        'MATCH_CODE': list('024PX'),
        'CBSA_LEVEL': list('12'),
    }
    for name, levels in categories_by_column.items():
        df[name] = pd.Categorical(df[name], categories=levels)


def validate_raw_numbers(df, year=None):
    """Validate values in raw InfoGroup data according to numerical constraints.

    Parameters:
        df: dataframe whose numeric columns were already converted
            (see `convert_dtypes`).
        year: data file year that `YEAR` must equal and `YEAR_EST` must not
            exceed. When omitted, a module-level variable named `year` is
            used, which is how this function was originally called from the
            notebook.

    Returns the result of `util.validate_values` (list of dicts of invalid values).

    Raises:
        ValueError: if `year` is not given and no module-level `year` exists.
    """
    if year is None:
        # Previously `year` was an implicit global, which is undefined once
        # this function is exported to a module. Keep the fallback so existing
        # notebook calls `validate_raw_numbers(df)` continue to work.
        year = globals().get('year')
        if year is None:
            raise ValueError(
                'validate_raw_numbers() needs the data file year: pass `year=` '
                'or define a module-level `year` variable.'
            )

    constraints = {
        'YEAR': {'eq': year},
        'EMPLOYEES': {'ge': 0},
        'SALES': {'ge': 0},
        'PARENT_EMPLOYEES': {'ge': 0},
        'PARENT_SALES': {'ge': 0},
        'YEAR_EST': {'ge': 1500, 'le': year},
        # Coordinates are limited to the northern and western hemispheres.
        'LATITUDE': {'ge': 0, 'le': 90},
        'LONGITUDE': {'ge': -180, 'le': 0}
    }
    return util.validate_values(df, constraints)

In [None]:
# Process a single year of raw data: read, rename columns, validate, convert
# types, and save in both CSV and parquet formats.
year = 2011

# Use a context manager so the schema file handle is closed after parsing.
with resources.get(f'infogroup/schema/{year}').path.open() as schema_file:
    sch = json.load(schema_file)

# Read every column as string; typed conversion happens after string validation.
df = pd.read_csv(resources.get(f'infogroup/orig/{year}').path, dtype='str', encoding='latin-1')

# Map original column names to upper-cased schema names.
df.rename(columns={x['originalName']: x['name'].upper() for x in sch['fields']}, inplace=True)

# String-level checks run first, then dtype conversion, then numeric range checks.
invalid_values = validate_raw_strings(df)
convert_dtypes(df)
invalid_values += validate_raw_numbers(df)

df.to_csv(resources.get(f'infogroup/csv/{year}').path, index=False)
fastparquet.write(resources.get(f'infogroup/pq/{year}').path, df, write_index=False)

In [None]:
#export
def get_df(year, cols=None):
    """Load the parquet file of one InfoGroup data year as a dataframe.

    `year` selects the registered `infogroup/pq/{year}` resource. `cols` is an
    optional list of column names to read; by default all columns are loaded.
    """
    parquet_path = resources.get(f'infogroup/pq/{year}').path
    return pd.read_parquet(parquet_path, engine='fastparquet', columns=cols)

# Tests

In [None]:
# infogroup: county_code == fips_code[2:]