In [2]:
#%pip install "dask[complete]"

In [10]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
import time

### Load Official HS Code Dictionary 

In [11]:
# Official HS code dictionary (6-digit, rolled up).
# Every column is read as text so codes such as '010121' keep their leading zero.
hs_code_path = '/data/common/trade_data/HS/hs_code_2019_final.csv'
hs_code = pd.read_csv(
    hs_code_path,
    dtype='str',
)
print(hs_code.head())

  HS_Code                                 Merged_Description
0  010121  Live horses, asses, mules and hinnies ;Horses ...
1  010129  Live horses, asses, mules and hinnies ;Horses ...
2  010130       Live horses, asses, mules and hinnies ;Asses
3  010190  Live horses, asses, mules and hinnies ;Other ;...
4  010221  Live bovine animals ;Cattle ;Purebred breeding...


In [12]:
def cleanup_hs_code(hscode):
    """Normalise a raw HS code value, rejecting entries that list several codes.

    All space characters are removed from ``hscode``. If the result still
    contains a comma, the field held more than one code and is treated as
    unusable.

    Parameters
    ----------
    hscode : str or None
        Raw HS code value. ``None`` and float ``NaN`` are accepted and
        treated as missing.

    Returns
    -------
    str or None
        The code with spaces removed, or ``None`` when the value is missing
        or contains a comma-separated list of codes.
    """
    # Missing values can arrive as None or as float NaN (NaN != NaN);
    # neither has a .replace method, so short-circuit before touching them.
    if hscode is None or (isinstance(hscode, float) and hscode != hscode):
        return None

    tempvar = hscode.replace(' ', '')
    # A comma means several codes share the field, so no single code applies.
    if ',' in tempvar:
        return None
    return tempvar

In [13]:
# Input directory read by the monthly cleaning loop; the trailing slash matters
# because file names are appended to this string directly.
basePath = '/data/common/trade_data/2019_updated/us_customs_2019_raw_data_FULLDESC/'

In [14]:
# Show the input files; os.listdir returns them in arbitrary (unsorted) order.
os.listdir(basePath)

['US_Imp_Jun_2019_FULL.parq',
 'US_Imp_Jul_2019_FULL.parq',
 'US_Imp_Mar_2019_FULL.parq',
 'US_Imp_Nov_2019_FULL.parq',
 'US_Imp_Sep_2019_FULL.parq',
 'US_Imp_May_2019_FULL.parq',
 'US_Imp_Feb_2019_FULL.parq',
 'US_Imp_Jan_2019_FULL.parq',
 'US_Imp_Apr_2019_FULL.parq',
 'US_Imp_Dec_2019_FULL.parq',
 'US_Imp_Aug_2019_FULL.parq',
 'US_Imp_Oct_2019_FULL.parq']

In [15]:
# Destination directory for the per-month cleaned parquet files.
cleaned_dir = '/data/common/trade_data/2019_updated/us_customs_2019_cleaned_ignore_multiple_hscode_FULLDESC/parquet_by_month/'

# For every file in basePath: clean the HS code column, join it to the official
# HS code list, report match counts, and save only the matched rows.
for filename in os.listdir(basePath):
    print(filename)
    # e.g. 'US_Imp_Jun_2019_FULL.parq' -> 'US_Imp_Jun_2019'
    newfn = filename.split('_FULL')[0]

    # Load data
    import_df = pd.read_parquet(os.path.join(basePath, filename))#, engine='fastparquet')

    # Cleanup whitespace around hs code - set to null if multiple in column.
    # Missing values (None / NaN) are passed through as None without calling the cleaner.
    import_df['Cleaned_HS_Code'] = [
        None if pd.isna(x) else cleanup_hs_code(x)
        for x in import_df['HS Code']
    ]

    # Join to official HS Code List. The left join keeps every import row;
    # rows whose cleaned code is not in the official list get a null HS_Code.
    tempdf = import_df.merge(hs_code, how='left', left_on='Cleaned_HS_Code', right_on='HS_Code')

    # Count unmatched rows once and actually fill the '{}' placeholders
    # (previously the template and the value were printed as two separate arguments).
    null_count = int(tempdf['HS_Code'].isna().sum())
    print("Number of not null hscodes: {}".format(len(tempdf) - null_count))
    print("Number of null hscodes: {}".format(null_count))

    # Keep only rows that matched an official HS code and write them out.
    tempdf = tempdf.dropna(subset=['HS_Code'])
    tempdf.to_parquet(os.path.join(cleaned_dir, newfn + '_ignore_multiple_hscode.parq'))

US_Imp_Jun_2019_FULL.parq
Number of not null hscodes: {} 910817
Number of null hscodes: {} 119292
US_Imp_Jul_2019_FULL.parq
Number of not null hscodes: {} 1007159
Number of null hscodes: {} 136424
US_Imp_Mar_2019_FULL.parq
Number of not null hscodes: {} 825713
Number of null hscodes: {} 104688
US_Imp_Nov_2019_FULL.parq
Number of not null hscodes: {} 866595
Number of null hscodes: {} 114788
US_Imp_Sep_2019_FULL.parq
Number of not null hscodes: {} 928058
Number of null hscodes: {} 127402
US_Imp_May_2019_FULL.parq
Number of not null hscodes: {} 974555
Number of null hscodes: {} 126773
US_Imp_Feb_2019_FULL.parq
Number of not null hscodes: {} 829688
Number of null hscodes: {} 111396
US_Imp_Jan_2019_FULL.parq
Number of not null hscodes: {} 956816
Number of null hscodes: {} 129173
US_Imp_Apr_2019_FULL.parq
Number of not null hscodes: {} 901688
Number of null hscodes: {} 115480
US_Imp_Dec_2019_FULL.parq
Number of not null hscodes: {} 872252
Number of null hscodes: {} 113411
US_Imp_Aug_2019_FUL