In [1]:
#Import statements
import pandas as pd
import numpy as np
import re
import joblib
import os

In [2]:
codes_df = pd.read_csv('Lemis Data/key_2016.csv', encoding='ISO-8859-1')

In [3]:
import_path = 'Lemis Data/Imports 05-16/'
export_path = 'Lemis Data/Exports 05-16/'
imports = [f for f in os.listdir(import_path)]
exports = [f for f in os.listdir(export_path)]
lemis_imports_dict = {}
lemis_exports_dict = {}

for file in imports:
    year = re.findall(r'\d+', file)[0]
    import_df = pd.read_csv(f'{import_path}{file}', encoding='ISO-8859-1')
    lemis_imports_dict[file] = import_df
    
for file in exports:
    year = re.findall(r'\d+', file)[0]
    export_df = pd.read_csv(f'{export_path}{file}', encoding='ISO-8859-1')
    lemis_exports_dict[file] = export_df
    
lemis_import = pd.concat(lemis_imports_dict.values(), ignore_index=True)
lemis_export = pd.concat(lemis_exports_dict.values(), ignore_index=True)

In [4]:
#Shape
print(lemis_import.shape)
#Columns
lemis_import.columns

(4590303, 23)


Index(['Unnamed: 0', 'Species\r\r\nCode', 'Genus', 'Species', 'Sub Species',
       'Specific\r\r\nName', 'Generic\r\r\nName', 'Wildlf\r\r\nDesc', 'Unit',
       '# of Cartons', 'Ctry\r\r\nOrg', 'Ctry\r\r\nIE', 'Purp', 'Src', 'Act',
       'Dp\r\r\nCd', 'Disp\r\r\nDate', 'Ship\r\r\nDate', 'IE', 'Pt\r\r\nCd',
       'Trans Mode', 'U.S.Importer/\r\r\nExporter', 'X__1'],
      dtype='object')

In [5]:
#Shape
print(lemis_export.shape)
#Columns
lemis_export.columns

(942690, 23)


Index(['Unnamed: 0', 'Species\r\r\nCode', 'Genus', 'Species', 'Sub Species',
       'Specific\r\r\nName', 'Generic\r\r\nName', 'Wildlf\r\r\nDesc', 'Unit',
       '# of Cartons', 'Ctry\r\r\nOrg', 'Ctry\r\r\nIE', 'Purp', 'Src', 'Act',
       'Dp\r\r\nCd', 'Disp\r\r\nDate', 'Ship\r\r\nDate', 'IE', 'Pt\r\r\nCd',
       'Trans Mode', 'U.S.Importer/\r\r\nExporter', 'X__1'],
      dtype='object')

In [6]:
def CleanDataFrameColumns(dataframe):
    
    df = dataframe.copy()
    column_mapping = {'wildlf_desc': 'description', '# of cartons': 'measure_value', 'ctry_org': 'country_origin',
                      'ctry_ie': 'country_import_export', 'purp': 'purpose', 'src': 'source',
                      'act': 'action', 'dp_cd': 'disposition', 'disp_date': 'disposition_date', 
                      'ship_date': 'shipment_date', 'ie': 'import_export', 'pt_cd': 'port', 
                      'trans mode': 'transport_mode', 'u.s.importer/_exporter': 'us_co', 'sub species': 'sub_species'}
    drop_cols = ['Unnamed: 0', 'X__1']
    df = df.drop(columns=drop_cols)
    df.columns = df.columns.str.replace('\r', ' ')
    df.columns = df.columns.str.replace(r'\n',  '', regex=True)
    df.columns = df.columns.str.replace('  ', '_')
    df.columns = df.columns.str.lower()
    df = df.rename(columns=column_mapping) 
    
    df = df.replace(r'[*]+', np.nan, regex=True)
    df['unit'] = df['unit'].str.upper()    
    df['country_origin'] = df['country_origin'].str.upper()
    df['country_origin'] = df['country_origin'].fillna('XX')
    df['country_import_export'] = df['country_import_export'].str.upper()
    df['country_import_export'] = df['country_import_export'].fillna('XX')
    
    return df

In [7]:
lemis_import_cleaned = CleanDataFrameColumns(lemis_import)
lemis_export_cleaned = CleanDataFrameColumns(lemis_export)

In [8]:
lemis_import_cleaned.head()

Unnamed: 0,species_code,genus,species,sub_species,specific_name,generic_name,description,unit,measure_value,country_origin,...,purpose,source,action,disposition,disposition_date,shipment_date,import_export,port,transport_mode,us_co
0,PCMR,PINCTADA,MARTENSII,,,OYSTER,SHE,NO,12397.0,JP,...,T,W,C,C,2004-12-29,2005-01-02,I,SF,O,KORET OF CALIFORNIA
1,PHAC,PHASIANUS,COLCHICUS,,COMMON,PHEASANT,MEA,KG,350.0,CA,...,T,C,C,C,2004-12-30,2005-01-03,I,PH,T,VAN VOOREN GAME RANCH INC.
2,MACR,MACROPUS,RUFUS,,RED,KANGAROO,SHO,NO,76.0,AU,...,T,W,C,C,2004-12-30,2005-01-02,I,SE,O,"CABELA'S, INC."
3,MACR,MACROPUS,RUFUS,,RED,KANGAROO,SHO,NO,70.0,AU,...,T,W,C,C,2004-12-30,2005-01-02,I,SE,O,"CABELA'S, INC."
4,ANGJ,ANGUILLA,JAPONICA,,JAPANESE,EEL,LIV,KG,7.0,CN,...,T,C,C,C,2004-12-31,2005-01-01,I,SP,A,TIANSHUN LOGISTICS SERVICES CO LTD


In [9]:
lemis_export_cleaned.head()

Unnamed: 0,species_code,genus,species,sub_species,specific_name,generic_name,description,unit,measure_value,country_origin,...,purpose,source,action,disposition,disposition_date,shipment_date,import_export,port,transport_mode,us_co
0,,STASINA,SAETOSA,,,,SPE,NO,1.0,DO,...,S,W,C,C,2004-12-16,2005-01-05,E,BO,A,"Museum of Comparative Zoology, Harvard University"
1,,STASINA,MACLEAYI,,,,SPE,NO,1.0,CU,...,S,W,C,C,2004-12-16,2005-01-05,E,BO,A,"Museum of Comparative Zoology, Harvard University"
2,,STASINA,LUCASI,,,,SPE,NO,1.0,CU,...,S,W,C,C,2004-12-16,2005-01-05,E,BO,A,"Museum of Comparative Zoology, Harvard University"
3,,SPARIANTHUS,SELENOPOIDES,,,,SPE,NO,1.0,PA,...,S,W,C,C,2004-12-16,2005-01-05,E,BO,A,"Museum of Comparative Zoology, Harvard University"
4,,PSEUDOSPARIANTHUS,CUBANA,,,,SPE,NO,1.0,CU,...,S,W,C,C,2004-12-16,2005-01-05,E,BO,A,"Museum of Comparative Zoology, Harvard University"


In [10]:
lemis_import_cleaned['disposition_date'] = lemis_import_cleaned['disposition_date'].replace('NA', '1/1/2999')
lemis_import_cleaned['shipment_date'] = lemis_import_cleaned['shipment_date'].replace('NA', '1/1/2999')
lemis_import_cleaned[lemis_import_cleaned.columns[~lemis_import_cleaned.columns.isin(['country_origin','country_import_export'])]] = lemis_import_cleaned[lemis_import_cleaned.columns[~lemis_import_cleaned.columns.isin(['country_origin','country_import_export'])]].replace('NA', '')  

lemis_export_cleaned['disposition_date'] = lemis_export_cleaned['disposition_date'].replace('NA', '1/1/2999')
lemis_export_cleaned['shipment_date'] = lemis_export_cleaned['shipment_date'].replace('NA', '1/1/2999')
lemis_export_cleaned[lemis_export_cleaned.columns[~lemis_export_cleaned.columns.isin(['country_origin','country_import_export'])]] = lemis_export_cleaned[lemis_export_cleaned.columns[~lemis_export_cleaned.columns.isin(['country_origin','country_import_export'])]].replace('NA', '')  

In [11]:
#Function to create a dataframe for each column in codes to join on
def createCodeDataframe(col, new_name):
    df = codes_df[codes_df['field'] == col].rename(columns={'value': new_name})
    return df

# Looping through the code dataframes to join onto the main lemis import and export df
codes_list = ['action', 'description', 'disposition', 'port', 'purpose', 'source', 'unit', 'transport_mode']
for code in codes_list:
    
    code_df = createCodeDataframe(code, f'{code}_value')
    if code=='unit':
        code_df['code'] = code_df['code'].str.upper()   
    
    lemis_import_cleaned = lemis_import_cleaned.set_index(code)
    lemis_export_cleaned = lemis_export_cleaned.set_index(code)
    
    code_df = code_df.set_index('code').drop(columns='field')
    
    lemis_import_cleaned = lemis_import_cleaned.join(code_df, how='left')
    lemis_export_cleaned = lemis_export_cleaned.join(code_df, how='left')

country_origin_df = createCodeDataframe('country', 'country_origin_value')
country_origin_df['code'] = country_origin_df['code'].str.upper()
country_origin_df = country_origin_df.set_index('code').drop(columns='field')

lemis_import_cleaned = lemis_import_cleaned.set_index('country_origin')
lemis_import_cleaned = lemis_import_cleaned.join(country_origin_df, how='left')
lemis_export_cleaned = lemis_export_cleaned.set_index('country_origin')
lemis_export_cleaned = lemis_export_cleaned.join(country_origin_df, how='left')

country_import_export_df = createCodeDataframe('country', 'country_import_export_value')
country_import_export_df = country_import_export_df.set_index('code').drop(columns='field')

lemis_import_cleaned = lemis_import_cleaned.set_index('country_import_export')
lemis_import_cleaned = lemis_import_cleaned.join(country_import_export_df, how='left')
lemis_export_cleaned = lemis_export_cleaned.set_index('country_import_export')
lemis_export_cleaned = lemis_export_cleaned.join(country_import_export_df, how='left')

lemis_import_cleaned = lemis_import_cleaned.reset_index(drop=True)
lemis_export_cleaned = lemis_export_cleaned.reset_index(drop=True)

In [12]:
lemis_import_cleaned.head()

Unnamed: 0,species_code,genus,species,sub_species,specific_name,generic_name,measure_value,disposition_date,shipment_date,import_export,...,action_value,description_value,disposition_value,port_value,purpose_value,source_value,unit_value,transport_mode_value,country_origin_value,country_import_export_value
0,ELKK,CERVUS,ELAPHUS,,,ELK,1.0,2012-12-11,2012-12-02,I,...,Cleared,Trophy (all the parts of one animal),Cleared,Atlanta,Hunting Trophies,Specimens taken from the wild,Number of Specimens,Air cargo,Argentina,Andorra
1,CEDA,DAMA,DAMA,,FALLOW,DEER,1.0,2012-12-11,2012-12-02,I,...,Cleared,Trophy (all the parts of one animal),Cleared,Atlanta,Hunting Trophies,Specimens taken from the wild,Number of Specimens,Air cargo,Argentina,Andorra
2,OVAR,OVIS,ARIES,,DOMESTIC,SHEEP,1.0,2012-12-11,2012-12-02,I,...,Cleared,Trophy (all the parts of one animal),Cleared,Atlanta,Hunting Trophies,Specimens taken from the wild,Number of Specimens,Air cargo,Argentina,Andorra
3,ANCE,ANTILOPE,CERVICAPRA,,,BLACKBUCK,1.0,2012-12-11,2012-12-02,I,...,Cleared,Trophy (all the parts of one animal),Cleared,Atlanta,Hunting Trophies,Specimens taken from the wild,Number of Specimens,Air cargo,Argentina,Andorra
4,CEAX,AXIS,AXIS,,,CHITAL,1.0,2012-12-11,2012-12-02,I,...,Cleared,Trophy (all the parts of one animal),Cleared,Atlanta,Hunting Trophies,Specimens taken from the wild,Number of Specimens,Air cargo,Argentina,Andorra


In [13]:
lemis_import_cleaned.shape, lemis_export_cleaned.shape

((5553659, 21), (1300882, 21))

In [14]:
lemis = lemis_import_cleaned.append(lemis_export_cleaned, ignore_index=True)

In [15]:
lemis.shape

(6854541, 21)

In [16]:
lemis.columns

Index(['species_code', 'genus', 'species', 'sub_species', 'specific_name',
       'generic_name', 'measure_value', 'disposition_date', 'shipment_date',
       'import_export', 'us_co', 'action_value', 'description_value',
       'disposition_value', 'port_value', 'purpose_value', 'source_value',
       'unit_value', 'transport_mode_value', 'country_origin_value',
       'country_import_export_value'],
      dtype='object')

In [17]:
lemis.dtypes

species_code                    object
genus                           object
species                         object
sub_species                     object
specific_name                   object
generic_name                    object
measure_value                  float64
disposition_date                object
shipment_date                   object
import_export                   object
us_co                           object
action_value                    object
description_value               object
disposition_value               object
port_value                      object
purpose_value                   object
source_value                    object
unit_value                      object
transport_mode_value            object
country_origin_value            object
country_import_export_value     object
dtype: object

In [18]:
lemis.isna().sum()

species_code                     84543
genus                           347313
species                         643761
sub_species                    6614675
specific_name                  2339709
generic_name                    425426
measure_value                  1263580
disposition_date                     0
shipment_date                        0
import_export                        0
us_co                             3342
action_value                         0
description_value                  773
disposition_value                   79
port_value                          75
purpose_value                      986
source_value                      4047
unit_value                       12242
transport_mode_value            473354
country_origin_value              1397
country_import_export_value       9859
dtype: int64

In [20]:
#Creating label for if any of these columns are known or not and fixing nulls
for col in ['genus', 'species_code', 'species', 'sub_species', 'specific_name', 'generic_name', 'us_co', 'action_value',
            'description_value', 'disposition_value', 'port_value', 'purpose_value', 'source_value', 'unit_value', 'transport_mode_value']:
    lemis[col] = np.where(lemis[col].isna(), f'unknown_{col}', lemis[col])
    
lemis['measure_value'] = lemis['measure_value'].fillna(999999)

In [21]:
lemis['disposition_date'] = lemis['disposition_date'].str.replace('/', '-')
lemis['shipment_date'] = lemis['shipment_date'].str.replace('/', '-')
lemis[['disposition_date','shipment_date']] = lemis[['disposition_date','shipment_date']].apply(pd.to_datetime)

In [22]:
lemis.head()

Unnamed: 0,species_code,genus,species,sub_species,specific_name,generic_name,measure_value,disposition_date,shipment_date,import_export,...,action_value,description_value,disposition_value,port_value,purpose_value,source_value,unit_value,transport_mode_value,country_origin_value,country_import_export_value
0,ELKK,CERVUS,ELAPHUS,unknown_sub_species,unknown_specific_name,ELK,1.0,2012-12-11,2012-12-02,I,...,Cleared,Trophy (all the parts of one animal),Cleared,Atlanta,Hunting Trophies,Specimens taken from the wild,Number of Specimens,Air cargo,Argentina,Andorra
1,CEDA,DAMA,DAMA,unknown_sub_species,FALLOW,DEER,1.0,2012-12-11,2012-12-02,I,...,Cleared,Trophy (all the parts of one animal),Cleared,Atlanta,Hunting Trophies,Specimens taken from the wild,Number of Specimens,Air cargo,Argentina,Andorra
2,OVAR,OVIS,ARIES,unknown_sub_species,DOMESTIC,SHEEP,1.0,2012-12-11,2012-12-02,I,...,Cleared,Trophy (all the parts of one animal),Cleared,Atlanta,Hunting Trophies,Specimens taken from the wild,Number of Specimens,Air cargo,Argentina,Andorra
3,ANCE,ANTILOPE,CERVICAPRA,unknown_sub_species,unknown_specific_name,BLACKBUCK,1.0,2012-12-11,2012-12-02,I,...,Cleared,Trophy (all the parts of one animal),Cleared,Atlanta,Hunting Trophies,Specimens taken from the wild,Number of Specimens,Air cargo,Argentina,Andorra
4,CEAX,AXIS,AXIS,unknown_sub_species,unknown_specific_name,CHITAL,1.0,2012-12-11,2012-12-02,I,...,Cleared,Trophy (all the parts of one animal),Cleared,Atlanta,Hunting Trophies,Specimens taken from the wild,Number of Specimens,Air cargo,Argentina,Andorra


In [23]:
lemis.isna().sum()

species_code                      0
genus                             0
species                           0
sub_species                       0
specific_name                     0
generic_name                      0
measure_value                     0
disposition_date                  0
shipment_date                     0
import_export                     0
us_co                             0
action_value                      0
description_value                 0
disposition_value                 0
port_value                        0
purpose_value                     0
source_value                      0
unit_value                        0
transport_mode_value              0
country_origin_value           1397
country_import_export_value    9859
dtype: int64

In [24]:
lemis['unit_value'].value_counts()

Number of Specimens    6421568
Kilograms               331077
Milliliters              48246
Grams                    35900
unknown_unit_value       12242
Square Meters             1742
Milligrams                1356
Liters                    1294
Meters                     553
Centimeters                280
Cubic Centimeters          149
Square Centimeter           74
Cubic Meters                60
Name: unit_value, dtype: int64

In [25]:
lemis['species'].value_counts()

SPECIES               708912
unknown_species       643761
MAXIMA                304179
MISSISSIPPIENSIS      212869
NILOTICUS             170417
                       ...  
ANOLIS GARMANI             1
ANOLIS VALENCIENNI         1
ANOLIS GRAHAMI             1
PRAESIGNIS                 1
HEEMSTRAORUM               1
Name: species, Length: 16740, dtype: int64

In [26]:
lemis['generic_name'].value_counts()

CORAL                   618014
OYSTER                  463247
unknown_generic_name    425426
SHELL                   323163
ALLIGATOR               212878
                         ...  
JEWFISH                      1
OLIVEBACK                    1
MUSTARD                      1
PAINTBRUSH                   1
PIKEPERCH                    1
Name: generic_name, Length: 2253, dtype: int64

In [27]:
cols_index = ['species_code', 'genus', 'species', 'sub_species', 
             'specific_name', 'generic_name', 'disposition_date', 
             'shipment_date', 'import_export', 'transport_mode_value', 
             'us_co', 'action_value', 'description_value', 
             'disposition_value', 'port_value', 'purpose_value', 
             'source_value', 'country_origin_value', 'country_import_export_value']
lemis_pivot = lemis.pivot_table(index=cols_index, columns='unit_value', values='measure_value').reset_index()
lemis_pivot.columns = lemis_pivot.columns.str.replace(' ', '_')
lemis_pivot.columns = lemis_pivot.columns.str.lower()
lemis_pivot.columns.name = None

measure_value_cols = ['centimeters', 'cubic_centimeters', 'cubic_meters', 'grams', 'kilograms', 
                      'liters', 'meters', 'milligrams', 'milliliters', 'number_of_specimens', 
                      'square_centimeter', 'square_meters', 'unknown_unit_value']

lemis_pivot[measure_value_cols] = lemis_pivot[measure_value_cols].replace(999999, np.nan)

In [28]:
lemis_pivot.head(20)

Unnamed: 0,species_code,genus,species,sub_species,specific_name,generic_name,disposition_date,shipment_date,import_export,transport_mode_value,...,grams,kilograms,liters,meters,milligrams,milliliters,number_of_specimens,square_centimeter,square_meters,unknown_unit_value
0,AAA?,ASTRAEA,SPECIES,unknown_sub_species,unknown_specific_name,SHELL,2005-01-13,2005-01-05,I,unknown_transport_mode_value,...,,,,,,,1157.0,,,
1,AAA?,ASTRAEA,SPECIES,unknown_sub_species,unknown_specific_name,SHELL,2005-01-18,2005-01-18,E,Personal vehicle,...,,,,,,,,,,
2,AAA?,ASTRAEA,SPECIES,unknown_sub_species,unknown_specific_name,SHELL,2005-01-26,2005-01-12,I,unknown_transport_mode_value,...,,,,,,,,,,
3,AAA?,ASTRAEA,SPECIES,unknown_sub_species,unknown_specific_name,SHELL,2005-01-26,2005-01-17,I,unknown_transport_mode_value,...,,,,,,,,,,
4,AAA?,ASTRAEA,SPECIES,unknown_sub_species,unknown_specific_name,SHELL,2005-02-15,2005-02-15,E,Personal vehicle,...,,,,,,,,,,
5,AAA?,ASTRAEA,SPECIES,unknown_sub_species,unknown_specific_name,SHELL,2005-03-11,2005-03-11,E,Personal vehicle,...,,,,,,,,,,
6,AAA?,ASTRAEA,SPECIES,unknown_sub_species,unknown_specific_name,SHELL,2005-03-18,2005-03-15,I,unknown_transport_mode_value,...,,,,,,,15.0,,,
7,AAA?,ASTRAEA,SPECIES,unknown_sub_species,unknown_specific_name,SHELL,2005-03-18,2005-03-15,I,unknown_transport_mode_value,...,,,,,,,15.0,,,
8,AAA?,ASTRAEA,SPECIES,unknown_sub_species,unknown_specific_name,SHELL,2005-03-31,2005-03-31,E,Personal vehicle,...,,,,,,,,,,
9,AAA?,ASTRAEA,SPECIES,unknown_sub_species,unknown_specific_name,SHELL,2005-04-12,2005-04-08,E,Mail,...,,,,,,,1.0,,,


In [29]:
lemis_pivot.isna().sum()

species_code                         0
genus                                0
species                              0
sub_species                          0
specific_name                        0
generic_name                         0
disposition_date                     0
shipment_date                        0
import_export                        0
transport_mode_value                 0
us_co                                0
action_value                         0
description_value                    0
disposition_value                    0
port_value                           0
purpose_value                        0
source_value                         0
country_origin_value                 0
country_import_export_value          0
centimeters                    5569385
cubic_centimeters              5569600
cubic_meters                   5569583
grams                          5544479
kilograms                      5328366
liters                         5568655
meters                   

In [30]:
lemis_pivot.shape

(5569636, 32)

In [31]:
lemis_pivot.to_csv('lemis_imports_exports.csv')