In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
import time

### Load FULL DESC data for 2019 imports

In [19]:
# IPython shell escape: list what is stored in the FULLDESC dataset directory.
!ls /data/common/trade_data/2019_updated/us_customs_2019_cleaned_ignore_multiple_hscode_FULLDESC

us_customs_2019_ignore_multiple_hscode_not_null_hscode.parq
us_customs_2019_ignore_multiple_hscode_null_hscode.parq


In [3]:
# Open the 2019 customs records whose HS code is not null as a dask dataframe.
df_no_null = dd.read_parquet(
    '/data/common/trade_data/2019_updated/us_customs_2019_cleaned_ignore_multiple_hscode_FULLDESC/us_customs_2019_ignore_multiple_hscode_not_null_hscode.parq',
    engine='fastparquet',
    chunksize="100MB",
)

In [None]:
# Row count of the not-null dataset.
# NOTE(review): this cell has no execution counter in the saved notebook, so it
# was not run in this session — presumably because counting the full dask
# dataframe is expensive; confirm before relying on it.
len(df_no_null)

### Load HS Codes
Note: we derive the 4-digit HS4 prefix of each HS code here so that the 8712 and 8714 groups can be selected below.

In [4]:
# Load the HS code lookup table. Every column is read as a string so that codes
# such as '010121' keep their leading zero.
hs_code = pd.read_csv('/data/common/trade_data/HS/hs_code_2019_final.csv', dtype='str')

# Derive the 4-digit HS4 prefix with the vectorised string accessor. Unlike a
# Python-level `x[0:4]` loop, this leaves a missing HS_Code as NaN instead of
# raising a TypeError on the float NaN.
hs_code['HTS4'] = hs_code['HS_Code'].str[:4]
hs_code

Unnamed: 0,HS_Code,Merged_Description,HTS4
0,010121,"Live horses, asses, mules and hinnies ;Horses ...",0101
1,010129,"Live horses, asses, mules and hinnies ;Horses ...",0101
2,010130,"Live horses, asses, mules and hinnies ;Asses",0101
3,010190,"Live horses, asses, mules and hinnies ;Other ;...",0101
4,010221,Live bovine animals ;Cattle ;Purebred breeding...,0102
...,...,...,...
5516,991921,"Goods of Panama, under the terms of general no...",9919
5517,991961,"Goods of Panama, under the terms of general no...",9919
5518,992004,"Goods of Korea, under the terms of general not...",9920
5519,992038,"Goods of Korea, under the terms of general not...",9920


### Find all HS6 codes within subset of HS4 codes and filter full data for 2019

In [5]:
def subset_full_df_by_hts4_code(hscodes, hts4codes_to_filter=None, full_df=None):
    """Keep only the rows whose HS6 code belongs to the requested HS4 groups.

    Parameters
    ----------
    hscodes : DataFrame
        HS code lookup table with an 'HS_Code' column (HS6 codes) and an
        'HTS4' column (their 4-digit prefixes).
    hts4codes_to_filter : str or list-like of str, optional
        HS4 prefixes to select. A single string is treated as a one-element
        list. ``None`` (the default) selects no HS4 group, so the result is
        empty.
    full_df : DataFrame, optional
        Frame with a 'Cleaned_HS_Code' column to be filtered. When omitted,
        the notebook-level ``df_no_null`` is used, which keeps existing
        two-argument calls working unchanged.

    Returns
    -------
    DataFrame
        The rows of ``full_df`` whose 'Cleaned_HS_Code' is one of the HS6
        codes found under the requested HS4 groups (same frame type as
        ``full_df``).
    """
    # Avoid a shared mutable default argument.
    if hts4codes_to_filter is None:
        hts4codes_to_filter = []
    elif isinstance(hts4codes_to_filter, str):
        # `isin` rejects a bare string; accept a single HS4 code for convenience.
        hts4codes_to_filter = [hts4codes_to_filter]

    if full_df is None:
        full_df = df_no_null

    # Use the lookup table that was passed in rather than a notebook global.
    in_requested_groups = hscodes['HTS4'].isin(hts4codes_to_filter)
    unique_hs_code = hscodes.loc[in_requested_groups, 'HS_Code'].unique()
    print("There are {} unique hs6 codes in these hts4 code groups.".format(len(unique_hs_code)))

    return full_df[full_df['Cleaned_HS_Code'].isin(unique_hs_code)]

In [6]:
# HS4 groups of interest
specific_hscodes = ['8712', '8714']

# Distinct HS6 codes that fall under those HS4 groups
unique_hs_code = hs_code.loc[hs_code['HTS4'].isin(specific_hscodes), 'HS_Code'].unique()

In [7]:
# Filter the full dataset down to the HS6 codes under the chosen HS4 groups.
df_subset = subset_full_df_by_hts4_code(
    hscodes=hs_code,
    hts4codes_to_filter=specific_hscodes,
)

There are 10 unique hs6 codes in these hts4 code groups.


In [None]:
#df_subset.head()

In [8]:
# Convert the filtered subset to pandas: .compute() evaluates the dask
# dataframe, so the whole subset is held in memory from here on.
df_subset_pandas = df_subset.compute()

### Sample subsetted df

In [9]:
# Start the sample as an empty frame that has the same columns as the subset.
df_sample = pd.DataFrame(columns=df_subset_pandas.columns)

In [10]:
# Sample up to `max_sample_size` rows per HS6 code, after dropping rows that
# repeat a product description within that code.
max_sample_size = 1000
sampled_frames = []
for h in unique_hs_code:
    tempdf = df_subset_pandas[df_subset_pandas['Cleaned_HS_Code'] == h]
    print("before:", len(tempdf))
    tempdf = tempdf.drop_duplicates('Product Desc')
    print("after:", len(tempdf))
    # Take everything when fewer than `max_sample_size` distinct rows remain.
    sample_size = min(max_sample_size, len(tempdf))
    ret_df = tempdf.sample(n=sample_size, random_state=99)
    sampled_frames.append(ret_df)

# Concatenate once, from scratch: re-running this cell no longer appends to an
# already-filled df_sample, the per-iteration copy of the growing frame is
# avoided, and column dtypes are not affected by an empty object-dtype seed frame.
if sampled_frames:
    df_sample = pd.concat(sampled_frames)
else:
    # No HS codes to sample from: keep an empty frame with the expected columns.
    df_sample = pd.DataFrame(columns=df_subset_pandas.columns)

before: 6893
after: 3751
before: 1204
after: 995
before: 8922
after: 3016
before: 567
after: 334
before: 5611
after: 1945
before: 16836
after: 4606
before: 2046
after: 800
before: 95
after: 78
before: 654
after: 405
before: 3063
after: 1524


In [11]:
# Number of sampled rows (non-null 'System Identity Id') per 'HS Code'.
df_sample.groupby('HS Code')['System Identity Id'].count()

HS Code
871200    1000
871410     995
871420    1000
871491     334
871492    1000
871493    1000
871494     800
871495      78
871496     405
871499    1000
Name: System Identity Id, dtype: int64

In [12]:
# Total number of rows in the combined sample across all HS codes.
len(df_sample)

7612

In [16]:
# Persist the combined sample as a parquet file in the shared samples directory.
df_sample.to_parquet('/data/common/trade_data/2019_updated/data_samples_ignore_multiple_hscode/sample_ignore_multiple_hscode_87128714.parq')

In [18]:
# Write the raw and cleaned HS code columns side by side to a CSV in the
# current working directory.
# NOTE(review): presumably a scratch export for comparing 'HS Code' with
# 'Cleaned_HS_Code' — confirm whether temp.csv is still needed.
df_sample[['HS Code', 'Cleaned_HS_Code']].to_csv('temp.csv')