In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
import time

### Load FULL DESC data for 2019 imports

In [2]:
# List the contents of the 2019 FULLDESC data directory
!ls /data/common/trade_data/2019_updated/us_customs_2019_cleaned_joined_data_FULLDESC

us_customs_2019_FULLDESC_not_null.parq
us_customs_2019_FULLDESC_null_hscode.parq


In [3]:
# Lazily open the "not null" parquet dataset of the 2019 FULLDESC data with dask
df_no_null = dd.read_parquet(
    '/data/common/trade_data/2019_updated/us_customs_2019_cleaned_joined_data_FULLDESC/us_customs_2019_FULLDESC_not_null.parq',
    engine='fastparquet',
    chunksize="100MB",
)

In [4]:
# Total number of rows in the dataset without nulls
len(df_no_null)

11066173

### Load HS Codes
Note: we derive the 4-digit heading (HS4) from each HS code here, because we want to select headings 8712 and 8714.

In [7]:
# Load the HS code lookup table; every column is read as a string so that
# leading zeros in the codes (e.g. '010121') are preserved
hs_code = pd.read_csv('/data/common/trade_data/HS/hs_code_2019_final.csv', dtype='str')

# HTS4 = first four characters of the HS code. The vectorized .str accessor
# replaces the Python-level list comprehension and yields a missing value
# (instead of raising TypeError) if an HS_Code entry is missing.
hs_code['HTS4'] = hs_code['HS_Code'].str[:4]
hs_code

Unnamed: 0,HS_Code,Merged_Description,HTS4
0,010121,"Live horses, asses, mules and hinnies ;Horses ...",0101
1,010129,"Live horses, asses, mules and hinnies ;Horses ...",0101
2,010130,"Live horses, asses, mules and hinnies ;Asses",0101
3,010190,"Live horses, asses, mules and hinnies ;Other ;...",0101
4,010221,Live bovine animals ;Cattle ;Purebred breeding...,0102
...,...,...,...
5516,991921,"Goods of Panama, under the terms of general no...",9919
5517,991961,"Goods of Panama, under the terms of general no...",9919
5518,992004,"Goods of Korea, under the terms of general not...",9920
5519,992038,"Goods of Korea, under the terms of general not...",9920


### Find all HS6 codes within subset of HS4 codes and filter full data for 2019

In [8]:
def subset_full_df_by_hts4_code(hscodes, hts4codes_to_filter=None, full_df=None):
    """Filter the full data down to rows whose HS code belongs to the given HTS4 groups.

    Parameters
    ----------
    hscodes : pandas.DataFrame
        HS code lookup table; must contain an 'HS_Code' column and an 'HTS4' column.
    hts4codes_to_filter : list-like of str, optional
        HTS4 codes to keep. ``None`` (the default) or an empty list selects no rows.
    full_df : DataFrame, optional
        Data to filter; must contain an 'HS Code' column. When omitted, the
        notebook-level ``df_no_null`` is used, as before.

    Returns
    -------
    DataFrame
        The rows of ``full_df`` whose 'HS Code' is one of the HS codes found
        under the requested HTS4 groups. Same type as ``full_df``.
    """
    # Avoid a mutable default argument: normalise None to a fresh empty list.
    hts4codes = [] if hts4codes_to_filter is None else list(hts4codes_to_filter)

    # Fall back to the notebook-level dataset only when no frame is passed in.
    if full_df is None:
        full_df = df_no_null

    # Use the lookup table that was passed in; previously the `hscodes`
    # argument was ignored and the global `hs_code` was read instead.
    unique_hs_code = hscodes[hscodes['HTS4'].isin(hts4codes)]['HS_Code'].unique()
    print("There are {} unique hs6 codes in these hts4 code groups.".format(len(unique_hs_code)))

    return full_df[full_df['HS Code'].isin(unique_hs_code)]

In [19]:
# HS4 headings we want to keep
specific_hscodes = ['8712', '8714']

# Distinct HS codes in the lookup table that fall under those HS4 headings
unique_hs_code = hs_code.loc[hs_code['HTS4'].isin(specific_hscodes), 'HS_Code'].unique()

In [9]:
# Filter the full dataset down to the HS codes under the chosen HS4 headings
df_subset = subset_full_df_by_hts4_code(hscodes=hs_code, hts4codes_to_filter=specific_hscodes)

There are 10 unique hs6 codes in these hts4 code groups.


In [10]:
# Preview the first rows of the filtered subset
df_subset.head()

Unnamed: 0,System Identity Id,Estimate Arrival Date,Actual Arrival Date,Bill of Lading,Master Bill of Lading,Bill Type Code,Carrier SASC Code,Vessel Country Code,Vessel Code,Vessel Name,...,Product Desc,Marks & Numbers,HS Code Sure Level,CIF,Indicator of true supplier,Indicator of true buyer,END,HS Code,HS_Code,Merged_Description
57,6003201906260000008284,20190227,20190607,EXDO6680020941,OOLU4047460450,H,"EXDO, EXPEDITORS INTERNATIONAL",FR,9299783,CMA CGM TOSCA,...,ALUMINIUM PARTS<br/>,PO#0550685107<br/>,5,0.0,Y,Y,END,871493,871493,Parts and accessories of vehicles of headings ...
484,6003201906260000018152,20190614,20190607,DMALPKGA08540,MEDUMY642529,H,"DMAL, DANMAR LINES LTD",MH,9290464,SEAMAX NORWALK,...,ALUMINIUM EXTRUSION<br/>,NIL MARKS<br/>,5,0.0,N,Y,END,871493,871493,Parts and accessories of vehicles of headings ...
1407,6003201906260000034628,20190622,20190625,OERT110702I00805,APLUAOC0136618,H,OERT,CY,9356294,CMA CGM THALASSA,...,OF ALUMINIUM<br/>OF ALUMINIUM<br/>,NO MARKS<br/>NO MARKS<br/>,5,0.0,Y,Y,END,871493,871493,Parts and accessories of vehicles of headings ...
1450,6003201906260000035497,20190623,20190625,CHQFTJN90016277C,HDMUXGWB3702917,H,"CHQF, CHINA INTERNATIONAL FREIGHT CO LTD",DE,9290555,SANTA LINEA,...,WHEEL<br/>WHEEL<br/>TYRE<br/>TYRE<br/>,NO MARKS<br/>NO MARKS<br/>NO MARKS<br/>NO MARK...,5,0.0,Y,Y,END,871492,871492,Parts and accessories of vehicles of headings ...
1648,6003201906260000038438,20190610,20190625,ABTB2903320298,COSU4521564060,H,ABTB,DE,VIENNA EXPRESS,VIENNA EXPRESS,...,NAUTICAL ACCESSORIES<br/>,UFLEX USA INC.<br/>,5,0.0,Y,Y,END,871420,871420,Parts and accessories of vehicles of headings ...


In [11]:
# Convert to pandas: .compute() evaluates the lazy dask subset and returns the
# result as an in-memory pandas dataframe
df_subset_pandas = df_subset.compute()

### Sample the subsetted dataframe (up to 1,000 rows per HS code, after dropping duplicate product descriptions)

In [12]:
# Create an empty pandas dataframe for the sample, with the same columns as
# the subsetted data
df_sample = pd.DataFrame(columns = df_subset_pandas.columns)

In [20]:
# Loop through the unique hs code list and sample up to SAMPLE_SIZE rows from each hs code
SAMPLE_SIZE = 1000   # maximum number of rows drawn per HS code
RANDOM_STATE = 99    # fixed seed so the sample is reproducible

# Collect the per-code samples in a list and concatenate once at the end.
# Building df_sample from scratch here (rather than appending to a frame
# created in another cell) makes this cell idempotent: re-running it no longer
# duplicates rows. It also avoids repeated copying by pd.concat inside the
# loop and avoids concatenating onto an empty all-object frame.
samples = []
for code in unique_hs_code:
    rows_for_code = df_subset_pandas[df_subset_pandas['HS Code'] == code]
    print("before:", len(rows_for_code))
    # Keep a single row per distinct product description
    rows_for_code = rows_for_code.drop_duplicates('Product Desc')
    print("after:", len(rows_for_code))
    # Codes with fewer than SAMPLE_SIZE distinct descriptions are taken in full
    n_rows = min(SAMPLE_SIZE, len(rows_for_code))
    samples.append(rows_for_code.sample(n=n_rows, random_state=RANDOM_STATE))

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when there were no HS codes to sample.
if samples:
    df_sample = pd.concat(samples)
else:
    df_sample = pd.DataFrame(columns=df_subset_pandas.columns)

before: 6898
after: 3756
before: 1229
after: 1020
before: 8927
after: 3021
before: 570
after: 337
before: 5618
after: 1951
before: 16839
after: 4609
before: 2047
after: 801
before: 98
after: 81
before: 654
after: 405
before: 3070
after: 1530


In [21]:
# Number of sampled rows per HS code (counts non-null 'System Identity Id' values)
df_sample.groupby('HS Code')['System Identity Id'].count()

HS Code
871200    1000
871410    1000
871420    1000
871491     337
871492    1000
871493    1000
871494     801
871495      81
871496     405
871499    1000
Name: System Identity Id, dtype: int64

In [22]:
# Total number of sampled rows across all HS codes
len(df_sample)

7624

In [23]:
# Write the sample to a parquet file
df_sample.to_parquet('/data/common/trade_data/2019_updated/data_samples/sample_87128714.parq')