In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
import time

### Load Official HS Code Dictionary 

In [2]:
# Official HS code dictionary, rolled up to 6-digit codes.
# Every column is read as text so the codes keep their leading zeros.
hs_code = pd.read_csv(
    '/data/common/trade_data/HS/hs_code_2019_final.csv',
    dtype='str',
)
print(hs_code.head())

  HS_Code                                 Merged_Description
0  010121  Live horses, asses, mules and hinnies ;Horses ...
1  010129  Live horses, asses, mules and hinnies ;Horses ...
2  010130       Live horses, asses, mules and hinnies ;Asses
3  010190  Live horses, asses, mules and hinnies ;Other ;...
4  010221  Live bovine animals ;Cattle ;Purebred breeding...


### Functions

In [3]:
def explode_multiple_hscode(df_to_explode, colname_to_explode='HS Code'):
    """Split comma-separated HS codes into one row per code.

    Spaces are removed from every string in ``colname_to_explode``, the
    string is split on commas, and each resulting code gets its own row with
    the values of all other columns repeated. The number of extra rows
    produced is printed.

    Parameters
    ----------
    df_to_explode : pandas.DataFrame
        Frame holding the column to split. It is left unmodified.
    colname_to_explode : str, default 'HS Code'
        Name of the column holding the comma-separated codes.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index. The exploded column is placed
        last, after the remaining columns in their original order. Rows whose
        code is missing (``None``/``NaN``) are kept as a single row with a
        missing value.
    """
    other_cols = [col for col in df_to_explode.columns if col != colname_to_explode]

    # Strip spaces from real strings only, so None and float NaN pass through
    # instead of raising TypeError inside str.replace.
    cleaned_codes = df_to_explode[colname_to_explode].map(
        lambda code: code.replace(' ', '') if isinstance(code, str) else code
    )

    # Work on a new frame so the caller's DataFrame is never overwritten.
    tempdf = df_to_explode[other_cols].copy()
    tempdf[colname_to_explode] = cleaned_codes.str.split(',')

    # Explode by column name specified; one output row per list element.
    tempdf = tempdf.explode(colname_to_explode).reset_index(drop=True)

    # How many extra rows were added
    print("Additional rows added: {}".format(len(tempdf) - len(df_to_explode)))

    return tempdf

In [4]:
def join_to_hs_code(df_to_join, hscode_list, leftcol = 'HS Code', rightcol = 'HS_Code'):
    """Left-join a frame onto the HS code list and report the match counts.

    Parameters
    ----------
    df_to_join : pandas.DataFrame
        Left frame; every one of its rows is kept.
    hscode_list : pandas.DataFrame
        Right frame holding the HS code reference list.
    leftcol : str, default 'HS Code'
        Join key in ``df_to_join``.
    rightcol : str, default 'HS_Code'
        Join key in ``hscode_list``. After the left join it is null on rows
        that found no match.

    Returns
    -------
    pandas.DataFrame
        The merged frame, matched and unmatched rows together.
    """
    tempdf = df_to_join.merge(hscode_list, how='left', left_on=leftcol, right_on=rightcol)

    # Count the unmatched rows once and reuse the number for both messages.
    null_count = int(tempdf[rightcol].isna().sum())

    # The counts are formatted into the message; previously they were passed as
    # a second print argument, leaving a literal "{}" in the output.
    print("Number of not null hscodes: {}".format(len(tempdf) - null_count))
    print("Number of null hscodes: {}".format(null_count))
    return tempdf

### Data cleanup for Jan-Dec 2019 Full Desc data

In [5]:
#Both 2019 result frames start out unset; the per-file loop fills them in
df_2019_notnull_FULLDESC = df_2019_null_FULLDESC = None

In [6]:
basePath = '/data/common/trade_data/2019_updated/us_customs_2019_raw_data_FULLDESC/'

# Per-file pieces are collected in lists and concatenated once after the loop.
# Concatenating inside the loop re-copies everything accumulated so far on
# every file. Starting from fresh lists also makes this cell safe to re-run:
# a second run no longer appends duplicate rows to the previous results.
null_parts = []
notnull_parts = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the combined frames reproducible between runs.
for filename in sorted(os.listdir(basePath)):
    print(filename)
    #Load data
    import_df = pd.read_parquet(os.path.join(basePath, filename))

    #Explode HS Code
    import_df_exploded = explode_multiple_hscode(import_df, 'HS Code')

    #Join to official HS Code List
    import_df_exploded_joinedHSCode = join_to_hs_code(import_df_exploded, hs_code)

    #Split into rows without / with a match in the official list
    is_unmatched = import_df_exploded_joinedHSCode['HS_Code'].isna()
    null_parts.append(import_df_exploded_joinedHSCode[is_unmatched])
    notnull_parts.append(import_df_exploded_joinedHSCode[~is_unmatched])

# An empty directory leaves both results as None, as before.
df_2019_null_FULLDESC = pd.concat(null_parts, axis=0) if null_parts else None
df_2019_notnull_FULLDESC = pd.concat(notnull_parts, axis=0) if notnull_parts else None

US_Imp_Jun_2019_FULL.parq
Additional rows added: 3809
Number of not null hscodes: {} 916918
Number of null hscodes: {} 117000
US_Imp_Jul_2019_FULL.parq
Additional rows added: 4533
Number of not null hscodes: {} 1014261
Number of null hscodes: {} 133855
US_Imp_Mar_2019_FULL.parq
Additional rows added: 3206
Number of not null hscodes: {} 830749
Number of null hscodes: {} 102858
US_Imp_Nov_2019_FULL.parq
Additional rows added: 3948
Number of not null hscodes: {} 872883
Number of null hscodes: {} 112448
US_Imp_Sep_2019_FULL.parq
Additional rows added: 4248
Number of not null hscodes: {} 934609
Number of null hscodes: {} 125099
US_Imp_May_2019_FULL.parq
Additional rows added: 3873
Number of not null hscodes: {} 980669
Number of null hscodes: {} 124532
US_Imp_Feb_2019_FULL.parq
Additional rows added: 3560
Number of not null hscodes: {} 835353
Number of null hscodes: {} 109291
US_Imp_Jan_2019_FULL.parq
Additional rows added: 3886
Number of not null hscodes: {} 962926
Number of null hscodes: {

```
US_Imp_Jun_2019_FULL.parq
Additional rows added: 3809
Number of not null hscodes: {} 916918
Number of null hscodes: {} 117000
US_Imp_Jul_2019_FULL.parq
Additional rows added: 4533
Number of not null hscodes: {} 1014261
Number of null hscodes: {} 133855
US_Imp_Mar_2019_FULL.parq
Additional rows added: 3206
Number of not null hscodes: {} 830749
Number of null hscodes: {} 102858
US_Imp_Nov_2019_FULL.parq
Additional rows added: 3948
Number of not null hscodes: {} 872883
Number of null hscodes: {} 112448
US_Imp_Sep_2019_FULL.parq
Additional rows added: 4248
Number of not null hscodes: {} 934609
Number of null hscodes: {} 125099
US_Imp_May_2019_FULL.parq
Additional rows added: 3873
Number of not null hscodes: {} 980669
Number of null hscodes: {} 124532
US_Imp_Feb_2019_FULL.parq
Additional rows added: 3560
Number of not null hscodes: {} 835353
Number of null hscodes: {} 109291
US_Imp_Jan_2019_FULL.parq
Additional rows added: 3886
Number of not null hscodes: {} 962926
Number of null hscodes: {} 126949
US_Imp_Apr_2019_FULL.parq
Additional rows added: 3505
Number of not null hscodes: {} 907238
Number of null hscodes: {} 113435
US_Imp_Dec_2019_FULL.parq
Additional rows added: 3689
Number of not null hscodes: {} 878170
Number of null hscodes: {} 111182
US_Imp_Aug_2019_FULL.parq
Additional rows added: 4371
Number of not null hscodes: {} 988178
Number of null hscodes: {} 131536
US_Imp_Oct_2019_FULL.parq
Additional rows added: 4193
Number of not null hscodes: {} 944219
Number of null hscodes: {} 123821
```

In [7]:
#Row counts of the matched and unmatched frames, one per line
print(len(df_2019_notnull_FULLDESC), len(df_2019_null_FULLDESC), sep='\n')

11066173
1432006


```
11066173
1432006

```

In [9]:
#Save to parquet
#df_2019_null_FULLDESC.to_parquet('/data/common/trade_data/2019_updated/us_customs_2019_cleaned_joined_data_FULLDESC/us_customs_2019_FULLDESC_null_hscode.parq')

In [10]:
#Save to parquet
#df_2019_notnull_FULLDESC.to_parquet('/data/common/trade_data/2019_updated/us_customs_2019_cleaned_joined_data_FULLDESC/us_customs_2019_FULLDESC_not_null.parq')