### Cleaning Script for 2016 - 2020 Data

In [1]:
import pandas as pd
import numpy as np
import cleaner as cln

#### Data cleaning
Steps, in the order the pipeline below runs them:
0. feature clean up
1. duplicate handling
2. process units (pending)
3. process nulls
4. process dates

In [2]:
def cleaning_pipeline(file_path):
    """Load a CSV and pass it through every cleaning stage of ``cln`` in turn.

    The stages run in a fixed order: ``process_columns``,
    ``process_duplicated_rows``, ``process_units``, ``cleanup_nulls`` and
    ``process_dates``. A progress message is printed before each one.

    Parameters
    ----------
    file_path
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    The object returned by the last stage (``cln.process_dates``).
    """
    frame = pd.read_csv(file_path)
    print('Begin cleaning...')

    # (progress message, name of the cln function to apply), in execution order.
    stages = (
        ('Feature clean up...', 'process_columns'),
        ('Cleaning duplicated rows...', 'process_duplicated_rows'),
        ('Cleaning units...', 'process_units'),
        ('Cleaning null values...', 'cleanup_nulls'),
        ('Cleaning dates...', 'process_dates'),
    )
    for banner, stage_name in stages:
        print(banner)
        # Look the stage up only when it is about to run, one at a time.
        frame = getattr(cln, stage_name)(frame)

    return frame

In [3]:
# Data directory (relative path) plus the input and output file names.
p = '../data/'
input_name = 'consolidated_data_2016_2020.csv'
output_name = 'cleaned.csv'

# Run the cleaning pipeline on the consolidated input file.
df = cleaning_pipeline(p + input_name)

Begin cleaning...
Feature clean up...
Dropped cols: ['Unnamed: 0', 'subspecies', 'specific_name', 'generic_name']
Cleaning duplicated rows...
Number of duplicated rows 203,373 out of 2,093,505 total rows
Rows remaining 1,890,132
Cleaning units...
Cleaning null values...
Cleaning dates...


In [12]:
# Read only the 'Country' and '2020' columns of the IMF file.
imf_data = "IMF_GDP_per_capita.csv"
imf = pd.read_csv(p + imf_data, usecols=['Country', '2020'])

In [13]:
ctry_key = "key_2016.csv"
# NOTE(review): the dropped row labels are hard-coded and tied to this exact
# version of the key file -- confirm why these rows are excluded.
ctry = (
    pd.read_csv(p + ctry_key, encoding="cp1252")
    .drop([275, 286, 287, 288, 289, 290, 291])
)
# Keep only the rows whose 'field' value is 'country'.
ctry = ctry.loc[ctry['field'] == 'country']

In [14]:
# Build the final frame from the cleaned data, the IMF table and the country key.
# NOTE(review): the join logic lives in cln.add_gdp_values and is not visible here;
# presumably it maps country codes to names and attaches the 2020 GDP -- confirm.
df_final = cln.add_gdp_values(df, imf, ctry)

In [15]:
# Write the final frame to the data directory under output_name, omitting the index.
df_final.to_csv(f'{p}{output_name}',index=False)

In [16]:
# TODO: process_units is hard-coded at the moment.

In [17]:
# Open question: can species name and specific name be combined?
# Open question: how should species_code values with '*' be handled?

In [18]:
# List the column labels of the final frame.
df_final.columns

Index(['control_number', 'species_code', 'genus', 'species', 'wildlf_desc',
       'wildlf_cat', 'cartons', 'qty', 'unit', 'value_x', 'ctry_org',
       'ctry_ie', 'purp', 'src', 'trans_mode', 'act', 'dp_cd', 'disp_date',
       'ship_date', 'i_e', 'pt_cd', 'specific_generic_name', 'unit_NO',
       'unit_KG', 'unit_LT', 'unit_MT', 'unit_M2', 'qty_new', 'ship_date_yyyy',
       'ship_date_mm', 'ctry_ie_name', '2020GDP'],
      dtype='object')

In [19]:
# Preview the first rows of the final frame.
df_final.head()

Unnamed: 0,control_number,species_code,genus,species,wildlf_desc,wildlf_cat,cartons,qty,unit,value_x,...,unit_NO,unit_KG,unit_LT,unit_MT,unit_M2,qty_new,ship_date_yyyy,ship_date_mm,ctry_ie_name,2020GDP
0,2018280858,WDER,ODOCOILEUS,VIRGINIANUS,MEA,MAM,NaN_cartons,1.0,NO,500.0,...,True,False,False,False,False,1.0,2018,6,Mexico,8506.909
1,2018301780,BLBE,URSUS,AMERICANUS,TRO,MAM,NaN_cartons,1.0,NO,0.0,...,True,False,False,False,False,1.0,2018,6,Canada,43306.635
2,2018311221,BAEA,HALIAEETUS,LEUCOCEPHALUS,FEA,EGL,NaN_cartons,2.0,NO,0.0,...,True,False,False,False,False,2.0,2018,6,Mexico,8506.909
3,2018311221,FEHA,BUTEO,REGALIS,FEA,RAP,NaN_cartons,2.0,NO,0.0,...,True,False,False,False,False,2.0,2018,6,Mexico,8506.909
4,2018311221,BAOW,TYTO,ALBA,FEA,RAP,NaN_cartons,1.0,NO,0.0,...,True,False,False,False,False,1.0,2018,6,Mexico,8506.909


In [22]:
# Inspect the Python type of the ship_date_mm value at label 0.
# NOTE(review): the recorded output is `str`, so the month is stored as text --
# confirm this is intended before doing numeric or date operations on it.
type(df_final['ship_date_mm'][0])

str