In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import datetime
import hashlib
import json
import os
import sys
import urllib
import urllib.parse
from pprint import pprint

import datapackage
import goodtables
import pandas as pd
import tableschema

from pudl import init, datastore, settings
from pudl.helpers import fix_int_na
import pudl.constants as pc

In [43]:
# Location of the raw MSHA inputs. To read from the remote source instead of
# the local datastore, swap in the commented-out assignment:
#msha_data_path = pc.base_data_urls["mshamines"]
msha_data_path = os.path.join(settings.DATA_DIR, "msha")

# Root directory for the MSHA data package, plus the sub-directory that holds
# the prepared input descriptors.
data_pkgs_dir = os.path.join(settings.PUDL_DIR, "results", "data_pkgs")
msha_data_pkg_dir = os.path.join(data_pkgs_dir, "msha")
input_dir = os.path.join(msha_data_pkg_dir, "input")

# Create the data package from the descriptor found in the input directory.
pkg_descriptor_path = os.path.join(input_dir, "datapackage.json")
msha_pkg = datapackage.Package(pkg_descriptor_path)

# The output directory is named after the data package; make sure its data/
# sub-directory exists (no error if it is already there).
pkg_name = msha_pkg.descriptor['name']
output_dir = os.path.join(msha_data_pkg_dir, pkg_name)
os.makedirs(os.path.join(output_dir, "data"), exist_ok=True)

# MSHA resources to package, keyed by resource name. Each row gives the
# resource name, its data archive, and its field-definition file; both files
# are read from msha_data_path.
_msha_file_names = (
    ("mines",
     "Mines.zip",
     "Mines_Definition_File.txt"),
    ("controller-operator-history",
     "ControllerOperatorHistory.zip",
     "Controller_Operator_History_Definition_File.txt"),
    ("employment-production-quarterly",
     "MinesProdQuarterly.zip",
     "MineSProdQuarterly_Definition_File.txt"),
    # Currently disabled:
    # ("contractor-employment-production-quarterly",
    #  "ContractorProdQuarterly.zip",
    #  "ContractorProdQuarterly_Definition_File.txt"),
)

# Every resource gets its own fresh dict, since more keys are attached to
# these entries later on.
msha_resources = {
    res_name: {"data": data_file, "defs": defs_file}
    for res_name, data_file, defs_file in _msha_file_names
}

for res in msha_resources:
    # Create dataframes from the input data & definition files (local or
    # remote) and store them under "data_df" / "defs_df". Both kinds of file
    # are read as pipe-delimited, ISO-8859-1 encoded text.
    for kind in ['data', 'defs']:
        msha_resources[res][f"{kind}_df"] = pd.read_csv(
            f"{msha_data_path}/{msha_resources[res][kind]}",
            delimiter="|",
            encoding="iso-8859-1",
        )
    # Read the input tabular data resource JSON file we've prepared. The
    # context manager closes the file handle as soon as parsing is done
    # instead of leaving it open until garbage collection.
    with open(os.path.join(input_dir, f"{res}.json")) as res_json_file:
        msha_resources[res]["json"] = json.load(res_json_file)

# The quarterly production data's own column headers are discarded and
# replaced with the COLUMN_NAME values from its definition file.
# NOTE(review): presumably the headers in the published file don't match the
# definitions -- confirm against the raw data.
msha_resources["employment-production-quarterly"]["data_df"].columns = \
    list(msha_resources["employment-production-quarterly"]["defs_df"]['COLUMN_NAME'])

# Chunk size (in bytes) used when hashing the output files. It never changes,
# so it is defined once rather than on every loop iteration.
BLOCKSIZE = 65536

for res in msha_resources:
    # Short aliases for this resource's inputs. These refer to the same
    # objects stored in msha_resources, so the column assignments below are
    # visible through msha_resources[res]["data_df"] as well.
    res_info = msha_resources[res]
    defs_df = res_info["defs_df"]
    data_df = res_info["data_df"]
    schema_fields = res_info["json"]["schema"]["fields"]

    # Convert the definitions to a dictionary of field descriptions
    field_desc = defs_df.set_index('COLUMN_NAME')['FIELD_DESCRIPTION'].to_dict()

    # Report every schema field that has no description at once. Without this
    # the lookup below would stop at the first missing name with a bare
    # KeyError, before the field-consistency asserts further down get a
    # chance to explain the mismatch.
    undescribed = [f['name'] for f in schema_fields if f['name'] not in field_desc]
    if undescribed:
        raise KeyError(
            f"{res}: no FIELD_DESCRIPTION found for schema field(s): {undescribed}"
        )

    # Set the description attribute of the fields in the schema using field descriptions.
    for field in schema_fields:
        field['description'] = field_desc[field['name']]
    resource = datapackage.Resource(descriptor=res_info["json"])
    res_info["resource"] = resource

    # Make sure we didn't miss or re-name any fields accidentally: the JSON
    # schema, the definition file and the data must list the same fields in
    # the same order.
    json_fields = resource.schema.field_names
    defs_fields = list(defs_df['COLUMN_NAME'])
    data_fields = list(data_df.columns)
    assert json_fields == defs_fields, "json vs. defs missing field: {}".format(set(json_fields).symmetric_difference(set(defs_fields)))
    assert data_fields == defs_fields, "data vs. defs missing field: {}".format(set(data_fields).symmetric_difference(set(defs_fields)))
    resource.infer()
    resource.commit()

    # Clean up the data before outputting, driven by each field's schema type:
    #   * integer columns have their NA values fixed up via fix_int_na;
    #   * boolean columns have 'Y'/'N' mapped to canonical True/False.
    # A field has exactly one type and each step only touches its own column,
    # so a single pass over the fields is enough.
    for field in resource.schema.field_names:
        field_type = resource.schema.get_field(field).type
        if field_type == 'integer':
            data_df[field] = fix_int_na(data_df[field])
        elif field_type == 'boolean':
            data_df[field] = data_df[field].replace('Y', True).replace('N', False)

    # the data itself goes in output -- this is what we're packaging up
    output_csv = os.path.join(output_dir, "data", f"{res}.csv")
    data_df.to_csv(output_csv, index=False)

    # calculate some useful information about the output file, and add it to the resource:
    # resource file size:
    resource.descriptor["bytes"] = os.path.getsize(output_csv)

    # resource file hash: SHA-1 of the CSV, read in BLOCKSIZE chunks so the
    # whole file never has to be held in memory at once.
    hasher = hashlib.sha1()
    with open(output_csv, 'rb') as afile:
        for buf in iter(lambda: afile.read(BLOCKSIZE), b''):
            hasher.update(buf)
    resource.descriptor["hash"] = f"sha1:{hasher.hexdigest()}"

    # Check our work...
    # NOTE(review): "bytes" and "hash" are written to the descriptor after the
    # last commit() above -- confirm whether .valid takes them into account.
    if not resource.valid:
        print(f"TABULAR DATA RESOURCE {res} IS NOT VALID.")

    # Add the completed resource to the data package
    msha_pkg.add_resource(descriptor=resource.descriptor)

# Automatically fill in some additional metadata
msha_pkg.infer();

# Timestamp indicating when packaging occurred: UTC, second precision, with a
# trailing 'Z'. datetime.utcnow() is deprecated as of Python 3.12, so take a
# timezone-aware "now" in UTC and format it explicitly; the resulting string
# has the same YYYY-MM-DDTHH:MM:SSZ shape as before.
msha_pkg.descriptor['created'] = (
    datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
)
# Have to set this to 'data-package' rather than 'tabular-data-package' due to a DataHub.io bug
msha_pkg.descriptor['profile'] = 'data-package'
msha_pkg.commit()

# save the datapackage (an invalid package is reported but still written out,
# so the descriptor can be inspected)
if not msha_pkg.valid:
    print("MSHA DATA PACKAGE IS NOT VALID.")
msha_pkg.save(os.path.join(output_dir,'datapackage.json'));

# Validate some of the data: run goodtables against the saved package with a
# row_limit of 10000, and dump the full report only if validation fails.
report = goodtables.validate(os.path.join(output_dir,'datapackage.json'), row_limit=10000)
if not report['valid']:
    print("MSHA DATA TABLES FAILED TO VALIDATE")
    pprint(report)

In [67]:
# Build the URL for a new file name by appending it to the path of the MSHA
# mines base URL.
url_parts = urllib.parse.urlparse(pc.base_data_urls['mshamines'])
# Strip any trailing slash from the base path first, so a base URL that ends
# in "/" does not produce a double slash before the file name.
new_path = url_parts.path.rstrip('/') + '/' + "NewFile.zip"
# Keep the scheme and host, substitute the new path, and blank out the
# params/query/fragment components. The bare last expression displays the
# resulting URL as the cell output.
url_parts._replace(path=new_path, params='', query='', fragment='').geturl()

'https://arlweb.msha.gov/OpenGovernmentData/DataSets/NewFile.zip'