# Create Bulk Import Output for Alembic Scripts from StatsCan NAICS Data
This notebook contains code snippets to generate output for alembic scripts.

By default, the generated output covers the full production data set. If `generate_subset_of_naics_data` is set to `True`, only a subset of the codes is generated, for test purposes. This option was added because unit tests were taking too long to load the production set of NAICS structures/codes and NAICS elements.


In [None]:
%run /workspaces/lear/tests/data/common/naics_utils.ipynb

import csv
import json
import os
import sys
import uuid

import chardet

In [None]:
# Input CSV files (paths are relative to the notebook's working directory)
# and the NAICS year/version stamped onto every generated row.
naics_structure_filename = '../data/naics/naics-scian-2017-structure-v3-eng.csv'
naics_element_filename = '../data/naics/naics-scian-2017-element-v3-eng.csv'
naics_year = 2017
naics_version = 3
# Set generate_subset_of_naics_data to True to generate only a subset of the NAICS data, filtered by code.
# The subset lets lear services/jobs/apis apply the alembic migration scripts quickly when running unit tests.
# Leave as False to generate all NAICS data.
generate_subset_of_naics_data = False
# Only used when generate_subset_of_naics_data is True: the codes to generate output for.
# For example, the codes ['112320', '311351', '311911', '311920', '327410', '333248', '335223', '413130', '413190']
# contain data relevant to the search term 'roast'.
codes_to_add = ['112320', '311351', '311911', '311920', '327410', '333248', '335223', '413130', '413190']

### Create dict from NAICS structure data

In [None]:
# Guess the character encoding of the structure CSV from its first
# 100000 bytes, so that later cells can open the file in text mode.
char_encoding_result = None
with open(naics_structure_filename, 'rb') as rawdata:
    _raw_sample = rawdata.read(100000)
    char_encoding_result = chardet.detect(_raw_sample)

# Stop here if detection produced no result or no encoding name.
assert char_encoding_result
encoding = char_encoding_result['encoding']
assert encoding

# Last expression of the cell: shown as the cell output.
encoding

In [None]:
# Peek at the header row of the structure CSV to confirm its column layout.
# The context manager closes the file once the header has been read;
# newline='' is the setting the csv module expects for files it parses.
with open(naics_structure_filename, newline='', encoding=encoding) as structure_file:
    csvreader = csv.reader(structure_file)
    # Fall back to an empty header when the file has no rows at all.
    header = next(csvreader, [])

# Last expression of the cell: shown as the cell output.
header

In [None]:
# Build structure_dict_arr: one dict per NAICS structure row, keyed by the
# database field names, with year, version and a fresh naics_key added.
with open(naics_structure_filename, newline='', encoding=encoding) as csvfile:
    # map custom field names that match database field names
    field_names = ['level', 'hierarchical_structure', 'code', 'class_title', 'superscript', 'class_definition']
    reader = csv.DictReader(csvfile, fieldnames=field_names)
    structure_dict_arr = []

    try:
        # Skip the header row; the None default tolerates an empty file.
        next(reader, None)
        for row in reader:
            # When generating the test subset, keep only the requested codes.
            if generate_subset_of_naics_data and row['code'] not in codes_to_add:
                continue
            # add custom properties to data row
            row['year'] = naics_year
            row['version'] = naics_version
            row['naics_key'] = str(uuid.uuid4())
            structure_dict_arr.append(row)
        print(structure_dict_arr)
    except csv.Error as e:
        # Report which file and line failed to parse, then stop the cell.
        sys.exit('file {}, line {}: {}'.format(naics_structure_filename, reader.line_num, e))


### Create dict from NAICS element data

In [None]:
# Peek at the header row of the element CSV to confirm its column layout.
# The context manager closes the file once the header has been read;
# newline='' is the setting the csv module expects for files it parses.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used - confirm that this matches the element file.
with open(naics_element_filename, newline='') as structure_file:
    csvreader = csv.reader(structure_file)
    # Fall back to an empty header when the file has no rows at all.
    header = next(csvreader, [])

# Last expression of the cell: shown as the cell output.
header

In [None]:
# Build element_dict_arr: one dict per NAICS element row, keyed by the
# database field names, with year, version and element_type added.
#
# Note: for the print below to work, ~/.jupyter/jupyter_notebook_config.py must set
# c.NotebookApp.iopub_data_rate_limit = 10000000; otherwise an error is thrown
# indicating that the streaming data rate was exceeded.
#
# NOTE(review): no explicit encoding is passed, so the platform default is
# used - confirm that this matches the element file.
with open(naics_element_filename, newline='') as csvfile:
    # map custom field names that match database field names
    field_names = ['level', 'code', 'class_title', 'element_type_label', 'element_description']
    reader = csv.DictReader(csvfile, fieldnames=field_names)
    element_dict_arr = []

    try:
        # Skip the header row; the None default tolerates an empty file.
        next(reader, None)
        for row in reader:
            # When generating the test subset, keep only the requested codes.
            if generate_subset_of_naics_data and row['code'] not in codes_to_add:
                continue
            # add custom properties to data row
            row['year'] = naics_year
            row['version'] = naics_version
            # Replace the label column with the value returned by
            # get_element_type_from_label (not defined in this notebook;
            # presumably loaded by the %run of naics_utils.ipynb - confirm).
            row['element_type'] = get_element_type_from_label(row['element_type_label'])
            del row['element_type_label']
            element_dict_arr.append(row)
        print(element_dict_arr)
    except csv.Error as e:
        # Report which file and line failed to parse, then stop the cell.
        sys.exit('file {}, line {}: {}'.format(naics_element_filename, reader.line_num, e))
