In [15]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
import time

In [16]:
# Directory that holds the monthly US-imports parquet files (one file per month, see listing below).
basePath = '/data/common/trade_data/2019_updated/us_customs_2019_cleaned_ignore_multiple_hscode_FULLDESC/parquet_by_month/'
# Show the directory contents; os.listdir returns names in arbitrary (not chronological) order.
os.listdir(basePath)

['US_Imp_Dec_2019_ignore_multiple_hscode.parq',
 'US_Imp_Aug_2019_ignore_multiple_hscode.parq',
 'US_Imp_Feb_2019_ignore_multiple_hscode.parq',
 'US_Imp_Mar_2019_ignore_multiple_hscode.parq',
 'US_Imp_Jun_2019_ignore_multiple_hscode.parq',
 'US_Imp_Oct_2019_ignore_multiple_hscode.parq',
 'US_Imp_Nov_2019_ignore_multiple_hscode.parq',
 'US_Imp_Jan_2019_ignore_multiple_hscode.parq',
 'US_Imp_Apr_2019_ignore_multiple_hscode.parq',
 'US_Imp_Jul_2019_ignore_multiple_hscode.parq',
 'US_Imp_May_2019_ignore_multiple_hscode.parq',
 'US_Imp_Sep_2019_ignore_multiple_hscode.parq']

### Load HS Code Chapters to Keep

In [17]:
# Chapter table; its 'Keep?' and 'HS code' columns are used to select chapters.
# The path is relative, so it resolves against the notebook's working directory.
df_hscode_chap = pd.read_csv('hs_code_chap_to_keep.csv')

In [18]:
# 'HS code' values of the rows whose 'Keep?' flag is 'Yes', as a plain Python list
hscode_tokeep = df_hscode_chap.loc[df_hscode_chap['Keep?'] == 'Yes', 'HS code'].tolist()

### Load HS Codes

In [19]:
# Read the HS code table with every column as a string, so codes keep their leading zeros
hs_code = pd.read_csv('/data/common/trade_data/HS/hs_code_2019_final.csv', dtype='str')
# Chapter number = first two characters of each code, converted to an integer
hs_code['HS2'] = hs_code['HS_Code'].map(lambda code: int(code[:2]))
hs_code

Unnamed: 0,HS_Code,Merged_Description,HS2
0,010121,"Live horses, asses, mules and hinnies ;Horses ...",1
1,010129,"Live horses, asses, mules and hinnies ;Horses ...",1
2,010130,"Live horses, asses, mules and hinnies ;Asses",1
3,010190,"Live horses, asses, mules and hinnies ;Other ;...",1
4,010221,Live bovine animals ;Cattle ;Purebred breeding...,1
...,...,...,...
5516,991921,"Goods of Panama, under the terms of general no...",99
5517,991961,"Goods of Panama, under the terms of general no...",99
5518,992004,"Goods of Korea, under the terms of general not...",99
5519,992038,"Goods of Korea, under the terms of general not...",99


### Iterate through HS code chapters

In [20]:
def sample_by_chapter(chap, sample_size=1000, random_state=99,
                      output_dir='/data/common/trade_data/2019_updated/data_samples_ignore_multiple_hscode/sample_by_chapter'):
    """Sample import records for one HS chapter and save them as parquet.

    For every HS code of chapter ``chap`` (looked up in the notebook-level
    ``hs_code`` table), the matching rows of all files under the
    notebook-level ``basePath`` directory are collected and de-duplicated on
    'Product Desc', then at most ``sample_size`` rows per code are drawn.
    The per-code samples are concatenated, their total length is printed,
    and the result is written to ``<output_dir>/sample_chap_<chap>.parq``.

    Parameters
    ----------
    chap
        Chapter number; compared against the integer column ``hs_code['HS2']``.
    sample_size : int, default 1000
        Maximum number of rows drawn per HS code. Codes with fewer unique
        product descriptions contribute all of their rows.
    random_state : int, default 99
        Seed passed to ``DataFrame.sample`` for every HS code.
    output_dir : str
        Directory the parquet file is written to.

    Raises
    ------
    FileNotFoundError
        If ``basePath`` contains no files to read.
    """
    # HS codes that belong to the requested chapter
    chapter_codes = hs_code.loc[hs_code['HS2'] == chap, 'HS_Code'].unique()

    # Read every monthly file and keep only the rows of this chapter.
    # os.listdir returns names in arbitrary order; sorting fixes the row order,
    # which both drop_duplicates (keeps the first hit) and the seeded sample
    # depend on. Note the order is alphabetical, not chronological.
    monthly_frames = []
    for fn in sorted(os.listdir(basePath)):
        month_df = pd.read_parquet(os.path.join(basePath, fn))
        monthly_frames.append(month_df[month_df['Cleaned_HS_Code'].isin(chapter_codes)])

    if not monthly_frames:
        raise FileNotFoundError("No parquet files found in {}".format(basePath))

    # One concat instead of growing a frame inside the loop
    df_subset = pd.concat(monthly_frames, axis=0)

    # Dropping duplicate descriptions within each HS code is the same as
    # de-duplicating once on the (code, description) pair, so do it up front.
    df_unique = df_subset.drop_duplicates(['Cleaned_HS_Code', 'Product Desc'])

    # Draw the per-code samples in the order the codes appear in hs_code
    samples = []
    for code in chapter_codes:
        code_df = df_unique[df_unique['Cleaned_HS_Code'] == code]
        if code_df.empty:
            # Code has no records in any month; nothing to sample
            continue
        n_rows = min(sample_size, len(code_df))
        samples.append(code_df.sample(n=n_rows, random_state=random_state))

    if samples:
        df_sample = pd.concat(samples)
    else:
        # Keep the real column dtypes even when nothing was sampled
        df_sample = df_subset.iloc[0:0]

    print("Length of sample for chapter {} is {}".format(chap, len(df_sample)))
    df_sample.to_parquet(os.path.join(output_dir, 'sample_chap_' + str(chap) + '.parq'))

In [24]:
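# Batch 1 of 3: entries 1-24 of hscode_tokeep. Presumably commented out after the
# run whose output is recorded below, so that re-running the notebook does not redo it.
# NOTE(review): the slice starts at 1, so hscode_tokeep[0] is not covered by any of
# the three batch cells -- confirm that chapter was sampled separately.
#for chap in hscode_tokeep[1:25]:
#    sample_by_chapter(chap)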

Length of sample for chapter 10 is 7161
Length of sample for chapter 11 is 12887
Length of sample for chapter 12 is 13914
Length of sample for chapter 13 is 3876
Length of sample for chapter 14 is 2147
Length of sample for chapter 15 is 15361
Length of sample for chapter 16 is 13915
Length of sample for chapter 17 is 11224
Length of sample for chapter 18 is 9595
Length of sample for chapter 19 is 14786
Length of sample for chapter 20 is 32909
Length of sample for chapter 21 is 12925
Length of sample for chapter 22 is 20036
Length of sample for chapter 28 is 41813
Length of sample for chapter 29 is 104627
Length of sample for chapter 30 is 9999
Length of sample for chapter 31 is 5659
Length of sample for chapter 32 is 24678
Length of sample for chapter 33 is 20138
Length of sample for chapter 34 is 14619
Length of sample for chapter 35 is 9373
Length of sample for chapter 36 is 2531
Length of sample for chapter 37 is 6141
Length of sample for chapter 38 is 32576


In [None]:
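# Batch 2 of 3: entries 25-49 of hscode_tokeep (presumably commented out after it was run).
# NOTE(review): this cell has no execution count and the recorded output below lists
# only four chapters -- confirm the batch ran to completion.
#for chap in hscode_tokeep[25:50]:
#    sample_by_chapter(chap)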

Length of sample for chapter 39 is 86885
Length of sample for chapter 40 is 47150
Length of sample for chapter 47 is 6153
Length of sample for chapter 48 is 63559


In [34]:
# Batch 3 of 3: sample every remaining chapter, i.e. hscode_tokeep from index 50 onward.
for chap in hscode_tokeep[50:]:
    sample_by_chapter(chap)

Length of sample for chapter 74 is 23301
Length of sample for chapter 75 is 4690
Length of sample for chapter 76 is 25770
Length of sample for chapter 78 is 1890
Length of sample for chapter 79 is 4530
Length of sample for chapter 80 is 2628
Length of sample for chapter 81 is 4508
Length of sample for chapter 82 is 45087
Length of sample for chapter 83 is 28301
Length of sample for chapter 84 is 283078
Length of sample for chapter 85 is 159238
Length of sample for chapter 86 is 10516
Length of sample for chapter 87 is 54530
Length of sample for chapter 88 is 6285
Length of sample for chapter 89 is 8052
Length of sample for chapter 90 is 61342
Length of sample for chapter 91 is 16889
Length of sample for chapter 92 is 9446
Length of sample for chapter 93 is 7230
Length of sample for chapter 94 is 35393
Length of sample for chapter 95 is 25180
Length of sample for chapter 96 is 37732
