In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

## Package imports and settings 

In [1]:
import pandas as pd
import seaborn as sns
import datetime as dt
import cpi
import requests
import re
from unidecode import unidecode

In [2]:
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly

In [3]:
from thefuzz import fuzz
from thefuzz import process


In [4]:
# Right after importing seaborn (could also use 'whitegrid')
sns.set_theme(style='whitegrid', context='talk')

In [5]:
# surpress scientific notation
pd.options.display.float_format = '{:.2f}'.format

# Importing Data

## Lobster Export Data

### Which codes do relate to lobsters?

From : https://www.ic.gc.ca/app/scr/tdst/tdo/crtr.html?grouped=INDIVIDUAL&searchType=KS_CS&naArea=9999&countryList=ALL&toFromCountry=CDN&reportType=TE&timePeriod=5%7CComplete+Years&currency=CDN&productType=HS6&hSelectedCodes=%7C10600%7C30611%7C30612%7C30615%7C30621%7C30622%7C30625%7C30631%7C30632%7C30634%7C30691%7C30692%7C30694%7C160530&runReport=true

Searched for all codes containing word "lobster". Extracted all codes within HS 1605 and HS 0306.

In [6]:
# jlab
path_1605 = 'data/full_data/1605_88Jan_to_23Sep_ATP_PV_Monthly.csv'
df_1605_full = pd.read_csv(path_1605, engine = 'python', skipfooter = 3,index_col=False, header = 1)
df_1605_full.head()

Unnamed: 0,Period,Commodity,Province,Country,State,Value ($),Quantity,Unit of measure
0,2013-04-01,"1605.21.00 - Shrimps and prawns, prepared or p...",Newfoundland and Labrador,United States,California,195111,29802,Weight in kilograms
1,2013-12-01,"1605.21.00 - Shrimps and prawns, prepared or p...",Newfoundland and Labrador,United States,California,285427,35966,Weight in kilograms
2,2017-10-01,"1605.59.00 - Molluscs, prepared or preserved, nes",Newfoundland and Labrador,United States,California,185609,22500,Weight in kilograms
3,2014-08-01,"1605.10.91 - Crab, snow (Queen), prepared or p...",Newfoundland and Labrador,United States,Alabama,191769,14969,Weight in kilograms
4,2013-01-01,"1605.21.00 - Shrimps and prawns, prepared or p...",Newfoundland and Labrador,United States,California,170470,16053,Weight in kilograms


In [7]:
path_0306 = 'data/full_data/0306-Crust-Jan88-to-Sep23-prov.csv'
df_0306_full = pd.read_csv(path_0306, engine = 'python', skipfooter = 3,index_col=False, header = 1)
df_0306_full.head()

Unnamed: 0,Period,Commodity,Province,Country,State,Value ($),Quantity,Unit of measure
0,2014-06-01,"0306.16.10 - Cold-water shrimps and prawns, fr...",Newfoundland and Labrador,Greenland,,220915,66528,Weight in kilograms
1,2023-04-01,"0306.14.10 - Crabs, snow (Queen), frozen",Newfoundland and Labrador,United States,California,1111067,61916,Weight in kilograms
2,2015-01-01,"0306.16.10 - Cold-water shrimps and prawns, fr...",Newfoundland and Labrador,Greenland,,753558,174576,Weight in kilograms
3,2015-06-01,"0306.16.10 - Cold-water shrimps and prawns, fr...",Newfoundland and Labrador,Greenland,,883171,199392,Weight in kilograms
4,2015-07-01,"0306.17.10 - Shrimps and prawns, frozen, in sh...",Newfoundland and Labrador,Saint Pierre and Miquelon,,704,45,Weight in kilograms


## Inflation

In [8]:
# from fred, index 2015, 100
inflation_data = pd.read_csv('data/CPI-data-fred.csv')
inflation_data = inflation_data.rename(columns = {'DATE': 'Period', 'CPALCY01CAM661N':'cpi_val'})
inflation_data['Period'] = pd.to_datetime(inflation_data['Period'])

In [9]:
inflation_data

Unnamed: 0,Period,cpi_val
0,1988-01-01,56.93
1,1988-02-01,57.33
2,1988-03-01,57.49
3,1988-04-01,57.65
4,1988-05-01,57.97
...,...,...
424,2023-05-01,124.51
425,2023-06-01,125.07
426,2023-07-01,125.23
427,2023-08-01,126.03


# 1. Cleaning
---
---

### Merge 1605 and 0306

In [10]:
# concat function
def concat_dfs(df_list):
    df = pd.concat(df_list, axis = 0, ignore_index = True)
    return df

In [11]:
full_data = concat_dfs([df_1605_full,df_0306_full])


## Initial Cleaning Steps

In [12]:
# perform cleaning set on combined data set
def cleaning_steps(df_name):
    # make lower case
    df_name['Commodity'] = df_name['Commodity'].str.lower()
    # keep only commodities that mention 'lobster'
    df_name = df_name[df_name['Commodity'].str.contains('lobster')].reset_index(drop=True)
    # drop extra columns
    df_name = df_name.drop(columns = ['Unit of measure'])
    # rename columns
    new_col_names = {'Value ($)': 'Value', 'Quantity': 'Quantity_kg'}
    df_name = df_name.rename(columns=new_col_names)
    # update period to datetime object
    df_name['Period'] = pd.to_datetime(df_name['Period'])
    return df_name



In [13]:
data_clean = cleaning_steps(full_data)
data_clean

Unnamed: 0,Period,Commodity,Province,Country,State,Value,Quantity_kg
0,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",Newfoundland and Labrador,United States,Massachusetts,2454203,42059
1,2018-06-01,"1605.30.10 - lobster, prepared or preserved, f...",Newfoundland and Labrador,United States,Massachusetts,2199419,48158
2,2014-06-01,"1605.30.11 - lobster meat, frozen, cooked by s...",Newfoundland and Labrador,United States,Massachusetts,1299983,38486
3,2014-10-01,"1605.30.11 - lobster meat, frozen, cooked by s...",Newfoundland and Labrador,United States,Massachusetts,151542,3475
4,2014-11-01,"1605.30.11 - lobster meat, frozen, cooked by s...",Newfoundland and Labrador,United States,Massachusetts,634072,22628
...,...,...,...,...,...,...,...
57380,2006-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",Northwest Territories,Hong Kong,,6000,273
57381,2007-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",Northwest Territories,Japan,,7200,408
57382,2007-03-01,"0306.22.10 - lobsters, (homarus spp), live (te...",Northwest Territories,"Korea, South",,10080,572
57383,2006-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",Nunavut,Czechia,,3000,136


### Adjust value to today's dollars

In [14]:

def inflation_adjustment(df_name, cpi_df):
    ''' adjust monetary features into today's dolars, index 2015 = 100'''
    df_name = df_name.merge(cpi_df, how='inner', on='Period')
    today_cpi = cpi_df.loc[cpi_df['Period'] == cpi_df['Period'].max(), 'cpi_val'].item()
    # create new column to adjust to today's dollars
    # return df_name
    df_name['Value-adjusted'] = (df_name['Value'] / df_name['cpi_val']) * today_cpi
    return df_name
    
    

In [15]:
data_clean = inflation_adjustment(data_clean, inflation_data)
data_clean

Unnamed: 0,Period,Commodity,Province,Country,State,Value,Quantity_kg,cpi_val,Value-adjusted
0,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",Newfoundland and Labrador,United States,Massachusetts,2454203,42059,102.17,3023516.55
1,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",New Brunswick,United States,Massachusetts,20137016,370393,102.17,24808298.71
2,2017-05-01,"1605.30.90 - lobster, prepared or preserved, nes",Prince Edward Island,United States,California,31113,1633,102.17,38330.44
3,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",Quebec,United States,California,98603,2347,102.17,121476.42
4,2017-05-01,"1605.30.90 - lobster, prepared or preserved, nes",Prince Edward Island,United States,New Hampshire,310914,15676,102.17,383038.25
...,...,...,...,...,...,...,...,...,...
57380,1989-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",British Columbia,Hong Kong,,81553,4538,59.65,172082.30
57381,1989-02-01,"0306.12.10 - lobsters in brine, (homarus spp),...",Prince Edward Island,Sweden,,107235,8340,59.65,226273.05
57382,1989-02-01,"0306.22.90 - lobsters,(homarus spp), not froze...",Nova Scotia,United States,Massachusetts,16048,1316,59.65,33862.36
57383,1989-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",Nova Scotia,United States,Unknown states,124924,8381,59.65,263598.02


## Country Name Cleaning

### Clean Country Name in export data first 
before running match

In [16]:
data_clean[["Country", 'Terminated']]= data_clean["Country"].str.split(r'(',expand=True)
data_clean

Unnamed: 0,Period,Commodity,Province,Country,State,Value,Quantity_kg,cpi_val,Value-adjusted,Terminated
0,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",Newfoundland and Labrador,United States,Massachusetts,2454203,42059,102.17,3023516.55,
1,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",New Brunswick,United States,Massachusetts,20137016,370393,102.17,24808298.71,
2,2017-05-01,"1605.30.90 - lobster, prepared or preserved, nes",Prince Edward Island,United States,California,31113,1633,102.17,38330.44,
3,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",Quebec,United States,California,98603,2347,102.17,121476.42,
4,2017-05-01,"1605.30.90 - lobster, prepared or preserved, nes",Prince Edward Island,United States,New Hampshire,310914,15676,102.17,383038.25,
...,...,...,...,...,...,...,...,...,...,...
57380,1989-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",British Columbia,Hong Kong,,81553,4538,59.65,172082.30,
57381,1989-02-01,"0306.12.10 - lobsters in brine, (homarus spp),...",Prince Edward Island,Sweden,,107235,8340,59.65,226273.05,
57382,1989-02-01,"0306.22.90 - lobsters,(homarus spp), not froze...",Nova Scotia,United States,Massachusetts,16048,1316,59.65,33862.36,
57383,1989-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",Nova Scotia,United States,Unknown states,124924,8381,59.65,263598.02,


In [17]:
data_clean["Country"] = data_clean["Country"].str.strip()

In [18]:
export_country_list = pd.Series(data_clean['Country'].unique())
export_country_list = export_country_list.sort_values(ignore_index= True)
export_country_list[10:30]

10                             Barbados
11                              Belarus
12                              Belgium
13                               Belize
14                              Bermuda
15                              Bolivia
16     Bonaire, Sint Eustatius and Saba
17                               Brazil
18                             Bulgaria
19                         Burkina Faso
20                           Cabo Verde
21                             Cambodia
22                       Cayman Islands
23             Central African Republic
24                                Chile
25                                China
26                     Christmas Island
27                                Cocos
28                             Colombia
29    Congo, Democratic Republic of the
dtype: object

In [19]:
export_country_list.to_csv('data/export_countries.csv')

In [20]:
# export clean full data for reuse

data_clean.to_csv('data/data_full_clean.csv')

## Get GDP Data for each year and country GDP Using WBAPI

In [21]:
import wbgapi as wb
import wbdata
import pandas as pd
from countrycode import countrycode
import numpy as np
from currency_converter import CurrencyConverter

In [22]:
# list of countries that need GDP data for 
countries = pd.read_csv('data/export_countries.csv', index_col=0, header = 1, names = ['Country'])

In [23]:
countries[10:30]

Unnamed: 0,Country
11,Belarus
12,Belgium
13,Belize
14,Bermuda
15,Bolivia
16,"Bonaire, Sint Eustatius and Saba"
17,Brazil
18,Bulgaria
19,Burkina Faso
20,Cabo Verde


In [25]:
gdp_indicator = 'NY.GDP.MKTP.CD'
country_code = ['PRK', 'TWN']

In [28]:
country_names = countries.iloc[:, 0].to_list()
country_names[:20]

['Albania',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Bahamas',
 'Bahrain',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Bermuda',
 'Bolivia',
 'Bonaire, Sint Eustatius and Saba',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Cabo Verde']

In [29]:
# using world bank API, match country names. WBAPI will match on several types of country names that reuquire GDP data For
data = wb.economy.coder(country_names)
data.items()

dict_items([('Albania', 'ALB'), ('Antigua and Barbuda', 'ATG'), ('Argentina', 'ARG'), ('Armenia', 'ARM'), ('Aruba', 'ABW'), ('Australia', 'AUS'), ('Austria', 'AUT'), ('Bahamas', 'BHS'), ('Bahrain', 'BHR'), ('Barbados', 'BRB'), ('Belarus', 'BLR'), ('Belgium', 'BEL'), ('Belize', 'BLZ'), ('Bermuda', 'BMU'), ('Bolivia', 'BOL'), ('Bonaire, Sint Eustatius and Saba', None), ('Brazil', 'BRA'), ('Bulgaria', 'BGR'), ('Burkina Faso', 'BFA'), ('Cabo Verde', 'CPV'), ('Cambodia', 'KHM'), ('Cayman Islands', 'CYM'), ('Central African Republic', 'CAF'), ('Chile', 'CHL'), ('China', 'CHN'), ('Christmas Island', None), ('Cocos', None), ('Colombia', 'COL'), ('Congo, Democratic Republic of the', 'COD'), ('Costa Rica', 'CRI'), ('Cuba', 'CUB'), ('Cyprus', 'CYP'), ('Czechia', 'CZE'), ('Czechoslovakia', None), ("Côte d'Ivoire", 'CIV'), ('Denmark', 'DNK'), ('Djibouti', 'DJI'), ('Dominica', 'DMA'), ('Dominican Republic', 'DOM'), ('East Germany', 'DEU'), ('Ecuador', 'ECU'), ('Egypt', 'EGY'), ('Estonia', 'EST'), ('

In [None]:
print(dir(data))

['__class__', '__class_getitem__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__or__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_coder_report', '_repr_html_', 'clear', 'copy', 'fromkeys', 'get', 'items', 'keys', 'pop', 'popitem', 'setdefault', 'update', 'values']


In [33]:
data.keys()

dict_keys(['Albania', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Bahamas', 'Bahrain', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bermuda', 'Bolivia', 'Bonaire, Sint Eustatius and Saba', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Cabo Verde', 'Cambodia', 'Cayman Islands', 'Central African Republic', 'Chile', 'China', 'Christmas Island', 'Cocos', 'Colombia', 'Congo, Democratic Republic of the', 'Costa Rica', 'Cuba', 'Cyprus', 'Czechia', 'Czechoslovakia', "Côte d'Ivoire", 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'East Germany', 'Ecuador', 'Egypt', 'Estonia', 'Eswatini', 'Fiji', 'Finland', 'Former Union of Soviet Socialist Republics', 'France', 'French Polynesia', 'French Southern Antarctic Territories', 'French Southern Territories', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Greenland', 'Guadeloupe', 'Guam', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti', 'Heard Island and McDonald Islands', 'Honduras', 'Hong Kong', '

In [34]:
# extract only values (country codes)
data.values()

dict_values(['ALB', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 'AUT', 'BHS', 'BHR', 'BRB', 'BLR', 'BEL', 'BLZ', 'BMU', 'BOL', None, 'BRA', 'BGR', 'BFA', 'CPV', 'KHM', 'CYM', 'CAF', 'CHL', 'CHN', None, None, 'COL', 'COD', 'CRI', 'CUB', 'CYP', 'CZE', None, 'CIV', 'DNK', 'DJI', 'DMA', 'DOM', 'DEU', 'ECU', 'EGY', 'EST', 'SWZ', 'FJI', 'FIN', None, 'FRA', 'PYF', None, None, 'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GRC', 'GRL', None, 'GUM', 'GTM', 'GIN', 'GNB', 'HTI', None, 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KAZ', 'KEN', 'KIR', 'PRK', 'KOR', 'KWT', 'KGZ', 'LAO', 'LBN', 'LSO', 'LUX', 'MAC', 'MWI', 'MYS', 'MDV', 'MLT', None, 'MUS', 'MEX', 'MDA', 'MNG', 'MAR', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'NLD', 'NZL', 'NER', None, 'NOR', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'POL', 'PRT', 'QAT', 'ROU', 'RUS', None, None, 'SAU', 'SEN', 'SRB', 'SYC', 'SLE', 'SGP', 'SVK', 'SVN', 'ZAF', 'SSD', 'ESP', 'LKA', 'SWE', 'CHE', 'SYR', 'TWN', 'TJK', 'TZA', 'THA', 'TLS', None, 

In [35]:
# store country codes in a list 
country_codes = list(data.values())
# country_codes 

In [36]:
# using country codes, get annual GDP by year for the relevant years in data 
gdp_data = wb.data.DataFrame(gdp_indicator, country_codes, time=range(1988, 2023))

In [37]:
gdp_data

Unnamed: 0_level_0,YR1988,YR1989,YR1990,YR1991,YR1992,YR1993,YR1994,YR1995,YR1996,YR1997,...,YR2013,YR2014,YR2015,YR2016,YR2017,YR2018,YR2019,YR2020,YR2021,YR2022
economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABW,596648044.69,695530726.26,764804469.27,872067039.11,958659217.88,1083240223.46,1245810055.87,1320670391.06,1379888268.16,1531843575.42,...,2727932960.89,2791061452.51,2963128491.62,2983798882.68,3092178770.95,3276187709.50,3395793854.75,2610038938.55,3126019385.47,
ALB,2051236250.00,2253090000.00,2028553750.00,1099559027.78,652174990.84,1185315468.46,1880950857.79,2392764853.42,3199640825.62,2258513974.10,...,12776224537.27,13228147516.12,11386853143.15,11861199830.84,13019726211.74,15156424061.98,15401826080.52,15162734205.25,17930565118.82,18882095517.88
ARE,36275674203.21,41464995913.92,50701443748.30,51552165622.45,54239171887.77,55625170253.34,59305093979.84,65743666575.86,73571233996.19,78839008444.57,...,400218529748.13,414105366752.89,370275469571.14,369255326235.53,390516804029.95,427049432157.93,417989721742.68,349473015330.16,415021590687.54,507534921715.45
ARG,126890235049.09,76629728760.12,141352630146.73,189719984268.48,228778994288.21,236741715015.02,257440000000.00,258031750000.00,272149750000.00,292859000000.00,...,552025140252.25,526319673731.64,594749285413.21,557532317363.45,643628396190.39,524819898586.91,447754686715.08,385540406815.60,487227125385.64,632770284408.51
ARM,,,2256838857.61,2069870129.87,1272835453.43,1201312829.28,1315158636.68,1468317435.20,1596968946.24,1639492444.76,...,11121464429.70,11609513241.08,10553337527.77,10546136240.40,11527458712.62,12457940705.31,13619290545.81,12641698593.79,13861409968.83,19502783987.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
USA,5236438000000.00,5641580000000.00,5963144000000.00,6158129000000.00,6520327000000.00,6858559000000.00,7287236000000.00,7639749000000.00,8073122000000.00,8577554457000.00,...,16843190993000.00,17550680174000.00,18206020741000.00,18695110842000.00,19477336549000.00,20533057312000.00,21380976119000.00,21060473613000.00,23315080560000.00,25462700000000.00
UZB,,,13360607990.68,13677622222.22,12941297376.09,13099013835.51,12899156990.62,13350468917.41,13948892215.57,14744603773.58,...,73180037913.03,80845385816.21,86196264741.94,86138288633.22,62081322741.12,52870108217.56,60283503705.39,60224701295.80,69600614988.60,80391853884.76
VEN,60226413793.10,43536709104.01,48606952194.78,53453444786.63,60416519619.79,60037460783.41,58418666666.67,77389487771.55,70543211119.10,85837678559.25,...,371005379786.57,482359318767.70,,,,,,,,
VNM,25423812648.59,6293304974.59,6471740805.57,9613369520.42,9866990236.44,13180953598.17,16286433533.32,20736164458.95,24657470574.75,26843700441.55,...,213708830776.95,233451484773.97,239258340825.53,257096001174.08,281353626107.24,310106472642.97,334365257920.29,346615750166.55,366137590717.80,408802379068.23


In [38]:
type(gdp_data)

pandas.core.frame.DataFrame

In [39]:
# export gdp data to csv for easier use later
gdp_data.to_csv('data/gdp_data.csv', index = False)

In [40]:
# convert GDP to CAD? 


### WBAPI regions 
(not same as continent.)

In [43]:
economy_data = wb.economy.DataFrame()
economy_data

Unnamed: 0_level_0,name,aggregate,longitude,latitude,region,adminregion,lendingType,incomeLevel,capitalCity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ABW,Aruba,False,-70.02,12.52,LCN,,LNX,HIC,Oranjestad
AFE,Africa Eastern and Southern,True,,,,,,,
AFG,Afghanistan,False,69.18,34.52,SAS,SAS,IDX,LIC,Kabul
AFW,Africa Western and Central,True,,,,,,,
AGO,Angola,False,13.24,-8.81,SSF,SSA,IBD,LMC,Luanda
...,...,...,...,...,...,...,...,...,...
XKX,Kosovo,False,20.93,42.56,ECS,ECA,IDX,UMC,Pristina
YEM,"Yemen, Rep.",False,44.21,15.35,MEA,MNA,IDX,LIC,Sana'a
ZAF,South Africa,False,28.19,-25.75,SSF,SSA,IBD,UMC,Pretoria
ZMB,Zambia,False,28.29,-15.40,SSF,SSA,IDX,LMC,Lusaka


In [44]:
economy_data = economy_data.reset_index(drop=False)
economy_data = economy_data[['id', 'name', 'region']]
economy_data

Unnamed: 0,id,name,region
0,ABW,Aruba,LCN
1,AFE,Africa Eastern and Southern,
2,AFG,Afghanistan,SAS
3,AFW,Africa Western and Central,
4,AGO,Angola,SSF
...,...,...,...
261,XKX,Kosovo,ECS
262,YEM,"Yemen, Rep.",MEA
263,ZAF,South Africa,SSF
264,ZMB,Zambia,SSF


In [45]:
## QA
# all countries and regions in wb api

wb_countries = wb.economy.info()
type(wb_countries)
wb_countries

id,value,region,incomeLevel
ABW,Aruba,LCN,HIC
AFE,Africa Eastern and Southern,,
AFG,Afghanistan,SAS,LIC
AFW,Africa Western and Central,,
AGO,Angola,SSF,LMC
ALB,Albania,ECS,UMC
AND,Andorra,ECS,HIC
ARB,Arab World,,
ARE,United Arab Emirates,MEA,HIC
ARG,Argentina,LCN,UMC


### Mapping Countries - code, continent

In [46]:
# get continent for each country for visuals. 
continent_list = []
for country_code in country_codes:
    if country_code is None:
        continent_list.append(np.nan)
    else:
        continent_value = countrycode(country_code, origin = "iso3c", destination = "continent")
        continent_list.append(continent_value)

In [47]:
continent_list[:10]

['Europe',
 'Americas',
 'Americas',
 'Asia',
 'Americas',
 'Oceania',
 'Europe',
 'Americas',
 'Asia',
 'Americas']

In [48]:
# add contient to countries df
countries['continent'] = continent_list
countries

Unnamed: 0,Country,continent
1,Albania,Europe
2,Antigua and Barbuda,Americas
3,Argentina,Americas
4,Armenia,Asia
5,Aruba,Americas
...,...,...
155,Uzbekistan,Asia
156,Venezuela,Americas
157,Viet Nam,Asia
158,West Germany,Europe


In [49]:
# add country code to contries df for easier mapping
countries['country_code'] = country_codes
countries[:20]

Unnamed: 0,Country,continent,country_code
1,Albania,Europe,ALB
2,Antigua and Barbuda,Americas,ATG
3,Argentina,Americas,ARG
4,Armenia,Asia,ARM
5,Aruba,Americas,ABW
6,Australia,Oceania,AUS
7,Austria,Europe,AUT
8,Bahamas,Americas,BHS
9,Bahrain,Asia,BHR
10,Barbados,Americas,BRB


In [52]:
# check taiwan 
countries[countries['Country'].str.contains('Taiwan|Hong|Macao')]
# hong kong


# north korea

Unnamed: 0,Country,continent,country_code
67,Hong Kong,Asia,HKG
90,Macao,Asia,MAC
138,Taiwan,Asia,TWN


In [50]:
## Cross reference - drop countries if not significant 

no_gdp_countries = countries[countries['country_code'].isna()]

In [51]:
countries.to_csv('data/country_code_mapping.csv', index = False)

In [53]:
no_gdp_countries['Country']

Unnamed: 0,Country,continent,country_code
16,"Bonaire, Sint Eustatius and Saba",,
26,Christmas Island,,
27,Cocos,,
34,Czechoslovakia,,
47,Former Union of Soviet Socialist Republics,,
50,French Southern Antarctic Territories,,
51,French Southern Territories,,
59,Guadeloupe,,
65,Heard Island and McDonald Islands,,
95,Martinique,,


In [85]:
data_clean.loc[data_clean['Country'].isin(no_gdp_countries['Country']), 'Country'].unique()

array(['Christmas Island', 'Saint Pierre and Miquelon',
       'Bonaire, Sint Eustatius and Saba', 'Guadeloupe',
       'Former Union of Soviet Socialist Republics', 'Czechoslovakia',
       'Saint Helena, Ascension and Tristan da Cunha', 'Martinique',
       'French Southern Antarctic Territories',
       'French Southern Territories', 'Tokelau',
       'Union of Soviet Socialist Republics', 'Yugoslavia',
       'Heard Island and McDonald Islands', 'Cocos', 'Norfolk Island'],
      dtype=object)

In [89]:
no_gdp_countries

Unnamed: 0,Country,continent,country_code
16,"Bonaire, Sint Eustatius and Saba",,
26,Christmas Island,,
27,Cocos,,
34,Czechoslovakia,,
47,Former Union of Soviet Socialist Republics,,
50,French Southern Antarctic Territories,,
51,French Southern Territories,,
59,Guadeloupe,,
65,Heard Island and McDonald Islands,,
95,Martinique,,


In [94]:
years_list =  list(set(data_clean['Period'].dt.year.unique()))
# years_list

In [107]:
# percent of total exports by year
result = pd.DataFrame()
for country in no_gdp_countries['Country']:
    subtotal = data_clean.loc[data_clean['Country'].str.contains(country),:]
    subtotal['Year'] = data_clean['Period'].dt.year
    subtotal = pd.DataFrame(subtotal[['Country', 'Value-adjusted', 'Year']].groupby(['Country', 'Year']).sum())
    # print(subtotal)
    pd.concat([result, subtotal])
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subtotal['Year'] = data_clean['Period'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subtotal['Year'] = data_clean['Period'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subtotal['Year'] = data_clean['Period'].dt.year
A value is trying to be set on a copy of a slice from a DataF

In [66]:
data_clean.columns

Index(['Period', 'Commodity', 'Province', 'Country', 'State', 'Value',
       'Quantity_kg', 'cpi_val', 'Value-adjusted', 'Terminated'],
      dtype='object')

In [68]:
data_clean.dtypes

Period            datetime64[ns]
Commodity                 object
Province                  object
Country                   object
State                     object
Value                      int64
Quantity_kg                int64
cpi_val                  float64
Value-adjusted           float64
Terminated                object
dtype: object

In [81]:
spm = data_clean.loc[data_clean['Country'].str.contains('Miquelon|France|Hong'),['Country', 'Value-adjusted']]
spm.groupby(['Country']).sum()

Unnamed: 0_level_0,Value-adjusted
Country,Unnamed: 1_level_1
France,1461566094.12
Hong Kong,844425205.23
Saint Pierre and Miquelon,2748679.05


# bin


### 2. EDA on Full Data Set
---
---

In [42]:
# surpress scientific notation
pd.options.display.float_format = '{:,.2f}'.format

In [43]:
full_data['Province'].value_counts()

Province
Nova Scotia                  37769
New Brunswick                21260
Newfoundland and Labrador    16195
British Columbia             14299
Prince Edward Island         13276
Quebec                       10953
Ontario                       3957
Alberta                        213
Manitoba                       108
Northwest Territories           87
Saskatchewan                    57
Nunavut                         13
Yukon                            1
Name: count, dtype: int64

In [44]:
# null values in state are for countries other than US
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118188 entries, 0 to 118187
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Period           118188 non-null  object
 1   Commodity        118188 non-null  object
 2   Province         118188 non-null  object
 3   Country          118188 non-null  object
 4   State            61407 non-null   object
 5   Value ($)        118188 non-null  int64 
 6   Quantity         118188 non-null  int64 
 7   Unit of measure  118188 non-null  object
dtypes: int64(2), object(6)
memory usage: 7.2+ MB


In [45]:
full_data.loc[(full_data['Country'] == 'United States'), 'State'].value_counts()

State
Massachusetts           15574
California               6084
Maine                    4942
New York                 3295
New Hampshire            3278
Washington               3220
Florida                  3196
New Jersey               2892
Rhode Island             2210
Illinois                 1809
Texas                    1156
Connecticut              1148
Virginia                 1127
Georgia                  1081
Oregon                    966
Pennsylvania              960
Hawaii                    866
Nevada                    786
Michigan                  761
Indiana                   676
Maryland                  668
Kentucky                  441
Ohio                      386
Delaware                  382
Unknown states            343
Missouri                  325
North Carolina            320
Tennessee                 299
Colorado                  274
Arizona                   262
South Carolina            259
Minnesota                 201
Louisiana                 199
Alas

In [46]:
exports = data_clean.groupby('Country').mean('Value-adjusted')
exports.sort_values(by='Quantity_kg', ascending=False).head(20)


Unnamed: 0_level_0,Value,Quantity_kg,cpi_val,Value-adjusted
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
China,2141458.44,110934.39,98.9,2477098.72
United States,1112436.35,45622.47,86.96,1511284.55
"Korea, South",791065.87,37568.93,93.59,976595.98
Belgium,584665.72,34325.64,87.34,813991.73
Japan,585955.01,30087.7,83.27,849206.71
Netherlands,508753.01,29671.4,87.82,709382.99
France,379473.88,28943.85,85.65,578379.93
Norfolk Island,154000.0,25262.0,79.11,245028.34
Spain,420377.82,22720.22,93.81,515795.37
Viet Nam,348923.81,22699.46,101.56,426214.71


In [47]:
# store in a list
top_countries = list(exports.sort_values(by='Quantity_kg', ascending=False).head(20).index)
top_countries

['China',
 'United States',
 'Korea, South',
 'Belgium',
 'Japan',
 'Netherlands',
 'France',
 'Norfolk Island',
 'Spain',
 'Viet Nam',
 'Hong Kong',
 'United Kingdom',
 'Italy',
 'Pakistan',
 'Germany',
 'Sierra Leone',
 'Taiwan',
 'French Southern Territories',
 'Denmark',
 'Sweden']

In [48]:
data_clean.groupby('Country').mean('Quantity_kg').sort_values(by='Quantity_kg', ascending = False)

Unnamed: 0_level_0,Value,Quantity_kg,cpi_val,Value-adjusted
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
China,2141458.44,110934.39,98.90,2477098.72
United States,1112436.35,45622.47,86.96,1511284.55
"Korea, South",791065.87,37568.93,93.59,976595.98
Belgium,584665.72,34325.64,87.34,813991.73
Japan,585955.01,30087.70,83.27,849206.71
...,...,...,...,...
Cocos,2100.00,95.00,85.03,3108.47
Myanmar,855.00,57.00,106.73,1008.30
Tokelau,641.00,48.00,60.53,1332.87
Costa Rica,628.00,30.50,89.00,887.21


In [49]:
pd.DataFrame(data_clean['Country'].value_counts())

Unnamed: 0_level_0,count
Country,Unnamed: 1_level_1
United States,27364
France,2527
Japan,2245
Hong Kong,1841
Belgium,1828
...,...
"Moldova,",1
"Congo, Democratic",1
Mauritius,1
"Tanzania, United",1


In [50]:
pd.DataFrame(data_clean['Commodity'].value_counts())

Unnamed: 0_level_0,count
Commodity,Unnamed: 1_level_1
"0306.22.10 - lobsters, (homarus spp), live (terminated 2016-12)",17998
"0306.12.90 - lobsters, (homarus spp), nes, frozen, in shell, including boiled in shell",6824
"0306.12.90 - lobsters, nes, frozen",6379
"0306.32.10 - lobsters, live",5110
"1605.30.11 - lobster meat, frozen, simply boiled in water (terminated 2016-12)",4141
"0306.22.10 - lobsters, live (terminated 2016-12)",4132
"0306.12.10 - lobsters in brine, (homarus spp), frozen, in shell, including boiled in shell",3591
"0306.12.10 - lobsters, in brine, frozen",1676
"1605.30.10 - lobster, prepared or preserved, frozen",1441
"1605.30.11 - lobster meat, frozen, cooked by steaming or boiling in water (terminated 2016-12)",1016


In [51]:
filtered_df = data_clean.loc[data_clean['Country'].isin(top_countries),:]
filtered_df

Unnamed: 0,Period,Commodity,Province,Country,State,Value,Quantity_kg,cpi_val,Value-adjusted,Terminated
0,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",Newfoundland and Labrador,United States,Massachusetts,2454203,42059,102.17,3023516.55,
1,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",New Brunswick,United States,Massachusetts,20137016,370393,102.17,24808298.71,
2,2017-05-01,"1605.30.90 - lobster, prepared or preserved, nes",Prince Edward Island,United States,California,31113,1633,102.17,38330.44,
3,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",Quebec,United States,California,98603,2347,102.17,121476.42,
4,2017-05-01,"1605.30.90 - lobster, prepared or preserved, nes",Prince Edward Island,United States,New Hampshire,310914,15676,102.17,383038.25,
...,...,...,...,...,...,...,...,...,...,...
57380,1989-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",British Columbia,Hong Kong,,81553,4538,59.65,172082.30,
57381,1989-02-01,"0306.12.10 - lobsters in brine, (homarus spp),...",Prince Edward Island,Sweden,,107235,8340,59.65,226273.05,
57382,1989-02-01,"0306.22.90 - lobsters,(homarus spp), not froze...",Nova Scotia,United States,Massachusetts,16048,1316,59.65,33862.36,
57383,1989-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",Nova Scotia,United States,Unknown states,124924,8381,59.65,263598.02,


In [52]:
filtered_df

Unnamed: 0,Period,Commodity,Province,Country,State,Value,Quantity_kg,cpi_val,Value-adjusted,Terminated
0,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",Newfoundland and Labrador,United States,Massachusetts,2454203,42059,102.17,3023516.55,
1,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",New Brunswick,United States,Massachusetts,20137016,370393,102.17,24808298.71,
2,2017-05-01,"1605.30.90 - lobster, prepared or preserved, nes",Prince Edward Island,United States,California,31113,1633,102.17,38330.44,
3,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",Quebec,United States,California,98603,2347,102.17,121476.42,
4,2017-05-01,"1605.30.90 - lobster, prepared or preserved, nes",Prince Edward Island,United States,New Hampshire,310914,15676,102.17,383038.25,
...,...,...,...,...,...,...,...,...,...,...
57380,1989-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",British Columbia,Hong Kong,,81553,4538,59.65,172082.30,
57381,1989-02-01,"0306.12.10 - lobsters in brine, (homarus spp),...",Prince Edward Island,Sweden,,107235,8340,59.65,226273.05,
57382,1989-02-01,"0306.22.90 - lobsters,(homarus spp), not froze...",Nova Scotia,United States,Massachusetts,16048,1316,59.65,33862.36,
57383,1989-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",Nova Scotia,United States,Unknown states,124924,8381,59.65,263598.02,


In [53]:
cols = ['Period', 'Country', 'Value($)']
filtered_df = filtered_df[['Period','Commodity', 'Country', 'Quantity', 'Value ($)']]


KeyError: "['Quantity', 'Value ($)'] not in index"

In [None]:
data

In [None]:
sns.scatterplot(x=data['Period'], y=data['Value ($)'], alpha = 0.2)


dont need... got relevant HS codes another way

### Match country names in gdp list to export country list - BIN???

In [27]:
# filter GDP data by countries in export country list:
gdp_data_filtered = gdp_data[gdp_data['Country Name'].isin(export_country_list)].reset_index(drop=True)
gdp_data_filtered

Unnamed: 0,Country Name,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,,,,,,,,,,...,2791061452.51,2963128491.62,2983798882.68,3092178770.95,3276187709.50,3395793854.75,2610038938.55,3126019385.47,,
1,Afghanistan,537777811.11,548888895.56,546666677.78,751111191.11,800000044.44,1006666637.78,1399999966.67,1673333417.78,1373333366.67,...,20550582746.84,19998143635.87,18019554403.45,18896353155.88,18418860354.42,18904502222.21,20143451705.75,14583135236.57,,
2,Albania,,,,,,,,,,...,13228147516.12,11386853143.15,11861199830.84,13019726211.74,15156424061.98,15401826080.52,15162734205.25,17930565118.82,18882095517.88,
3,United Arab Emirates,,,,,,,,,,...,414105366752.89,370275469571.14,369255326235.53,390516804029.95,427049432157.93,417989721742.68,349473015330.16,415021590687.54,507534921715.45,
4,Argentina,,,24450604877.61,18272123664.47,25605249381.76,28344705966.64,28630474727.90,24256667553.26,26436857247.50,...,526319673731.64,594749285413.21,557532317363.45,643628396190.39,524819898586.91,447754686715.08,385540406815.60,487227125385.64,632770284408.51,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,Ukraine,,,,,,,,,,...,133503867181.97,91030968761.42,93355868977.66,112090503817.40,130891086689.52,153883045524.93,156617719786.25,199765856764.50,160502739236.10,
116,Uruguay,1242289239.20,1547388781.43,1710004407.23,1539681490.78,1975701816.47,1890767155.53,1809185093.55,1597713469.10,1593674184.56,...,61496186973.90,57680328702.24,57480788380.31,65006047680.32,65203071110.60,62048585618.50,53666908479.56,61412268248.95,71177146197.50,
117,United States,543300000000.00,563300000000.00,605100000000.00,638600000000.00,685800000000.00,743700000000.00,815000000000.00,861700000000.00,942500000000.00,...,17550680174000.00,18206020741000.00,18695110842000.00,19477336549000.00,20533057312000.00,21380976119000.00,21060473613000.00,23315080560000.00,25462700000000.00,
118,Uzbekistan,,,,,,,,,,...,80845385816.21,86196264741.94,86138288633.22,62081322741.12,52870108217.56,60283503705.39,60224701295.80,69600614988.60,80391853884.76,


In [54]:
# unmatched countries 
unmatched = list(data_clean.loc[~data_clean['Country'].isin(gdp_data['Country Name']), 'Country'].unique())
unmatched = pd.DataFrame(data=unmatched, columns=['unmatched_export_countries'])

In [55]:
unmatched

Unnamed: 0,unmatched_export_countries
0,Taiwan
1,Macao
2,Christmas Island
3,United States Minor Outlying Islands
4,"South Africa,"
5,"Moldova,"
6,Central African
7,Saint Pierre And Miquelon
8,Dominican
9,Bahamas


In [30]:
pd.concat([unmatched, gdp_country_list], axis = 1)

Unnamed: 0,unmatched_export_countries,gdp_country
0,Hong Kong,Afghanistan
1,"Korea, South",Africa Eastern and Southern
2,Taiwan,Africa Western and Central
3,Gambia,Albania
4,Macao,Algeria
...,...,...
261,,West Bank and Gaza
262,,World
263,,"Yemen, Rep."
264,,Zambia


In [31]:
# def country_match(countries_to_match, country_options):
result_df = pd.DataFrame(columns = ['country_to_match','best_match', 'all_matches'] )
for country_index in range(len(unmatched)):
        country_to_match = unmatched.iloc[country_index,0]
        collection = gdp_country_list.iloc[:,0].to_list()
        best_matches = process.extract(country_to_match, collection, scorer=fuzz.ratio)
        new_row = {'country_to_match': country_to_match, 'best_match':best_matches[0][0], 'all_matches':str(best_matches)}
        # print(pd.DataFrame(data = [new_row]))
        result_df = pd.concat([result_df, pd.DataFrame([new_row])], ignore_index=True)

        # result_df = result_df.append(country_to_match, best_matches[0], best_matches, ignore_index = True)
result_df

Unnamed: 0,country_to_match,best_match,all_matches
0,Hong Kong,"Hong Kong SAR, China","[('Hong Kong SAR, China', 62), ('Mongolia', 47..."
1,"Korea, South","Korea, Rep.","[('Korea, Rep.', 64), ('Lesotho', 53), ('Afric..."
2,Taiwan,Thailand,"[('Thailand', 71), ('Tajikistan', 62), ('Botsw..."
3,Gambia,Zambia,"[('Zambia', 83), ('Namibia', 77), ('Cambodia',..."
4,Macao,Monaco,"[('Monaco', 73), ('Curacao', 67), ('Jamaica', ..."
5,Christmas Island,Cayman Islands,"[('Cayman Islands', 67), ('Channel Islands', 6..."
6,Türkiye,Turkiye,"[('Turkiye', 86), ('Austria', 43), ('Eritrea',..."
7,United States Minor Outlying Islands,British Virgin Islands,"[('British Virgin Islands', 55), ('Northern Ma..."
8,"South Africa, Republic of",Central African Republic,"[('Central African Republic', 69), ('South Afr..."
9,"Moldova, Republic of",Slovak Republic,"[('Slovak Republic', 74), ('Dominican Republic..."


In [32]:

bannedWords = ['republic', 'rep', 'of', 'the', ',']

def RemoveBannedWords(toPrint, banned_words):
    # statement = toPrint
    pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, banned_words)) + r')\b', re.I)
    return pattern.sub("", toPrint)

# # toPrint = unmatched.loc[unmatched['unmatched_export_countries'].str.contains('tanzania'), 'unmatched_export_countries'].items()
# print(type(toPrint))
# print(toPrint)
# print(RemoveBannedWords(toPrint, bannedWords))

In [33]:
def country_cleaning(df_name, df_column):
    # print(df_name)
    # print(df_name[df_column])
    # df_name.loc[:,df_column] = df_name[df_column].lower()
    df_name[df_column] = df_name[df_column].apply(lambda x: x.lower())
    # df_name[df_column] = df_name[df_column].apply(lambda x: x.replace('of', ''))
    df_name[df_column] = df_name[df_column].apply(lambda x: RemoveBannedWords(x, bannedWords))
    df_name[df_column] = df_name[df_column].apply(lambda x: unidecode(x))
    df_name[df_column] = df_name[df_column].apply(lambda x: x.title())
    return df_name


In [34]:
country_cleaning(data_clean,'Country')


Unnamed: 0,Period,Commodity,Province,Country,State,Value,Quantity_kg,cpi_val,Value-adjusted,Terminated
0,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",Newfoundland and Labrador,United States,Massachusetts,2454203,42059,102.17,3023516.55,
1,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",New Brunswick,United States,Massachusetts,20137016,370393,102.17,24808298.71,
2,2017-05-01,"1605.30.90 - lobster, prepared or preserved, nes",Prince Edward Island,United States,California,31113,1633,102.17,38330.44,
3,2017-05-01,"1605.30.10 - lobster, prepared or preserved, f...",Quebec,United States,California,98603,2347,102.17,121476.42,
4,2017-05-01,"1605.30.90 - lobster, prepared or preserved, nes",Prince Edward Island,United States,New Hampshire,310914,15676,102.17,383038.25,
...,...,...,...,...,...,...,...,...,...,...
57380,1989-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",British Columbia,Hong Kong,,81553,4538,59.65,172082.30,
57381,1989-02-01,"0306.12.10 - lobsters in brine, (homarus spp),...",Prince Edward Island,Sweden,,107235,8340,59.65,226273.05,
57382,1989-02-01,"0306.22.90 - lobsters,(homarus spp), not froze...",Nova Scotia,United States,Massachusetts,16048,1316,59.65,33862.36,
57383,1989-02-01,"0306.22.10 - lobsters, (homarus spp), live (te...",Nova Scotia,United States,Unknown states,124924,8381,59.65,263598.02,


In [68]:
# manually update important countries for mapping
gdp_data.loc[gdp_data['Country Name'].str.contains('Korea, Rep.'), 'Country Name'] = 'Korea, South'
gdp_data.loc[gdp_data['Country Name'].str.contains('Hong'), 'Country Name'] = 'Hong Kong'
gdp_data.loc[gdp_data['Country Name'].str.contains('Gambia'), 'Country Name'] = 'Gambia'
gdp_data.loc[gdp_data['Country Name'].str.contains('Egypt'), 'Country Name'] = 'Egypt'
gdp_data.loc[gdp_data['Country Name'].str.contains('Egypt'), 'Country Name'] = 'Macao'
# collapse east /west germany into germany in data clean
data_clean.loc[data_clean['Country'].str.contains('Germany'), 'Country'] = 'Germany'

In [69]:
# def country_match(countries_to_match, country_options):
result_df = pd.DataFrame(columns = ['country_to_match','best_match', 'all_matches'] )
for country_index in range(len(unmatched)):
        country_to_match = unmatched.iloc[country_index,0].title()
        collection = gdp_country_list.iloc[:,0].to_list()
        best_matches = process.extract(country_to_match, collection, scorer=fuzz.ratio)[0]
        new_row = {'country_to_match': country_to_match, 'best_match':best_matches[0], 'all_matches':str(best_matches)}
        # print(pd.DataFrame(data = [new_row]))
        result_df = pd.concat([result_df, pd.DataFrame([new_row])], ignore_index=True)

        # result_df = result_df.append(country_to_match, best_matches[0], best_matches, ignore_index = True)
result_df

Unnamed: 0,country_to_match,best_match,all_matches
0,Taiwan,Thailand,"('Thailand', 71)"
1,Macao,Monaco,"('Monaco', 73)"
2,Christmas Island,Cayman Islands,"('Cayman Islands', 67)"
3,United States Minor Outlying Islands,British Virgin Islands,"('British Virgin Islands', 55)"
4,"South Africa,",South Africa,"('South Africa', 100)"
5,"Moldova,",Moldova,"('Moldova', 100)"
6,Central African,Central African Republic,"('Central African Republic', 77)"
7,Saint Pierre And Miquelon,Sierra Leone,"('Sierra Leone', 54)"
8,Dominican,Dominica,"('Dominica', 94)"
9,Bahamas,"Bahamas, The","('Bahamas, The', 74)"


In [71]:
# QA checks:
# data_clean.loc[data_clean['Country'].str.contains('Gambia')]
# gdp_data.loc[gdp_data['Country Name'].str.contains('Germany')]
# gdp_data.loc[gdp_data['Country Name'].str.contains('Hong')]
gdp_data.loc[gdp_data['Country Name'].str.contains('Macao')]
# data_clean.loc[data_clean['Country'].str.contains('Germany'), 'Country'].unique()

Unnamed: 0,Country Name,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
67,Macao,,,,,,4948667540.41,5278005611.91,5605484298.98,5932242990.65,...,305595408895.27,329366576819.41,332441717791.41,248362771739.13,262588632526.73,318678815489.75,383817841547.1,424671765455.7,476747720364.74,
146,"Macao SAR, China",,,,,,,,,,...,54902831793.5,45048188247.65,45070807279.63,50440935654.01,55284309690.27,55204960999.59,25459685019.55,30123795336.89,21979475560.62,


In [67]:
unmatched['unmatched_export_countries'] = unmatched['unmatched_export_countries'].str.title()
unmatched

Unnamed: 0,unmatched_export_countries
0,Taiwan
1,Macao
2,Christmas Island
3,United States Minor Outlying Islands
4,"South Africa,"
5,"Moldova,"
6,Central African
7,Saint Pierre And Miquelon
8,Dominican
9,Bahamas


In [39]:
data_clean.loc[~data_clean['Country'].isin(gdp_data['Country Name']), 'Country'].value_counts()

Country
Taiwan                                          816
Saint Pierre And Miquelon                       222
Korea, North                                     65
Macao                                            36
Guadeloupe                                       26
South Africa,                                    20
Christmas Island                                 11
Martinique                                       10
Czechoslovakia                                   10
Former Union  Soviet Socialist Republics         10
Netherlands Antilles                              9
United States Minor Outlying Islands              9
Trinidad And Tobago                               5
Dominican                                         4
Union  Soviet Socialist Republics                 4
Bahamas                                           3
Iran                                              3
Saint Helena, Ascension And Tristan Da Cunha      3
Bonaire, Sint Eustatius And Saba                  2
Kyrg

In [40]:
# unmatched countries again
unmatched = list(data_clean.loc[~data_clean['Country'].isin(gdp_data_filtered['Country Name']), 'Country'].unique())
unmatched = pd.DataFrame(data=unmatched, columns=['unmatched_export_countries'])
unmatched

Unnamed: 0,unmatched_export_countries
0,Hong Kong
1,"Korea, South"
2,Taiwan
3,Gambia
4,Macao
5,Christmas Island
6,Turkiye
7,United States Minor Outlying Islands
8,"South Africa,"
9,"Moldova,"


In [41]:
df = data_clean.loc[data_clean['Country'].isin(unmatched['unmatched_export_countries']),:]
df[['Country', 'Value']].groupby('Country').sum().sort_values(by='Value', ascending = False)

Unnamed: 0_level_0,Value
Country,Unnamed: 1_level_1
"Korea, South",1075849587
Hong Kong,644390181
Taiwan,164067902
"Korea, North",5324601
Gambia,2130278
Turkiye,1551305
Saint Pierre And Miquelon,1512152
Macao,1084879
"South Africa,",802516
Christmas Island,723587


## Country GDP first attempt

In [8]:
gdp_data = pd.read_csv('data/gdp-data/gdp_data.csv',engine= 'python', header = 2)
gdp_data

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,2791061452.51,2963128491.62,2983798882.68,3092178770.95,3276187709.50,3395793854.75,2610038938.55,3126019385.47,,
1,Africa Eastern and Southern,AFE,GDP (current US$),NY.GDP.MKTP.CD,21125015452.20,21616228139.04,23506279899.97,28048360188.28,25920665259.89,29472103269.74,...,1006526390112.83,927348469903.11,885176429223.51,1021043200767.01,1007196197587.41,1000834077088.55,927593321647.66,1081998139192.47,1169483734191.46,
2,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,537777811.11,548888895.56,546666677.78,751111191.11,800000044.44,1006666637.78,...,20550582746.84,19998143635.87,18019554403.45,18896353155.88,18418860354.42,18904502222.21,20143451705.75,14583135236.57,,
3,Africa Western and Central,AFW,GDP (current US$),NY.GDP.MKTP.CD,10447637852.92,11173212080.39,11990534017.87,12727688165.45,13898109284.02,14929792387.52,...,894322482190.28,768644740597.12,691363412188.46,684898755570.72,767025741475.04,822538393591.45,786460035395.03,844459722152.99,877863284874.07,
4,Angola,AGO,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,137244439121.37,87219300384.51,49840491178.15,68972769395.63,77792944471.95,69309110145.77,50241368243.63,65685435100.50,106713618735.43,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,7074392884.05,6295845465.36,6682674062.96,7180768692.26,7878762812.20,7899741278.94,7717143395.26,9412034299.23,9429156201.96,
262,"Yemen, Rep.",YEM,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,43228585321.33,42444489460.94,31317824906.44,26842231204.80,21606161066.21,,,,,
263,South Africa,ZAF,GDP (current US$),NY.GDP.MKTP.CD,8748596504.06,9225996313.29,9813996078.33,10854195662.66,11955995222.38,13068994777.63,...,381198869776.11,346709790458.56,323585509674.48,381448814653.46,404158882459.02,388531226582.35,337619569570.51,419015636064.71,405869718462.34,
264,Zambia,ZMB,GDP (current US$),NY.GDP.MKTP.CD,713000000.00,696285714.29,693142857.14,718714285.71,839428571.43,1082857142.86,...,27141023558.08,21251216798.78,20958412538.31,25873601260.84,26311506435.06,23308667781.23,18110638619.37,22147649568.61,29784454055.94,


In [9]:
gdp_data.drop(columns= ['Country Code', 'Indicator Name', 'Indicator Code'], inplace=True)

In [10]:
gdp_data

Unnamed: 0,Country Name,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,,,,,,,,,,...,2791061452.51,2963128491.62,2983798882.68,3092178770.95,3276187709.50,3395793854.75,2610038938.55,3126019385.47,,
1,Africa Eastern and Southern,21125015452.20,21616228139.04,23506279899.97,28048360188.28,25920665259.89,29472103269.74,32014368121.37,33269509510.46,36327785494.93,...,1006526390112.83,927348469903.11,885176429223.51,1021043200767.01,1007196197587.41,1000834077088.55,927593321647.66,1081998139192.47,1169483734191.46,
2,Afghanistan,537777811.11,548888895.56,546666677.78,751111191.11,800000044.44,1006666637.78,1399999966.67,1673333417.78,1373333366.67,...,20550582746.84,19998143635.87,18019554403.45,18896353155.88,18418860354.42,18904502222.21,20143451705.75,14583135236.57,,
3,Africa Western and Central,10447637852.92,11173212080.39,11990534017.87,12727688165.45,13898109284.02,14929792387.52,15910837741.97,14510579888.72,14968235781.96,...,894322482190.28,768644740597.12,691363412188.46,684898755570.72,767025741475.04,822538393591.45,786460035395.03,844459722152.99,877863284874.07,
4,Angola,,,,,,,,,,...,137244439121.37,87219300384.51,49840491178.15,68972769395.63,77792944471.95,69309110145.77,50241368243.63,65685435100.50,106713618735.43,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,,,,,,,,,,...,7074392884.05,6295845465.36,6682674062.96,7180768692.26,7878762812.20,7899741278.94,7717143395.26,9412034299.23,9429156201.96,
262,"Yemen, Rep.",,,,,,,,,,...,43228585321.33,42444489460.94,31317824906.44,26842231204.80,21606161066.21,,,,,
263,South Africa,8748596504.06,9225996313.29,9813996078.33,10854195662.66,11955995222.38,13068994777.63,14211394321.13,15821393677.77,17124793156.93,...,381198869776.11,346709790458.56,323585509674.48,381448814653.46,404158882459.02,388531226582.35,337619569570.51,419015636064.71,405869718462.34,
264,Zambia,713000000.00,696285714.29,693142857.14,718714285.71,839428571.43,1082857142.86,1264285714.29,1368000000.00,1605857142.86,...,27141023558.08,21251216798.78,20958412538.31,25873601260.84,26311506435.06,23308667781.23,18110638619.37,22147649568.61,29784454055.94,


In [11]:
gdp_country_list = pd.DataFrame(data=gdp_data['Country Name'].unique(), columns = ['gdp_country'])
gdp_country_list = gdp_country_list.sort_values(by='gdp_country', ignore_index= True)
gdp_country_list

Unnamed: 0,gdp_country
0,Afghanistan
1,Africa Eastern and Southern
2,Africa Western and Central
3,Albania
4,Algeria
...,...
261,West Bank and Gaza
262,World
263,"Yemen, Rep."
264,Zambia


In [None]:
hs_8 = pd.read_csv('data/codes_hs8.csv', sep = ',',skiprows = 1, engine='python', skipfooter = 5, index_col=False)
# hs_8 = pd.read_csv('codes_hs8.csv', sep = ',', skipfooter = 5)


In [None]:
pd.set_option('display.max_colwidth', None)
hs_8['Commodity'] = hs_8['Commodity'].str.lower()

In [None]:
hs_lobster = hs_8.loc[hs_8['Commodity'].str.contains('lobster'),:]

In [None]:
hs_lobster['Commodity'].unique()

### Testing univariate ARIMA

With only data from 1 country, for 1 commodity, using Quantity.


In [None]:
# find country with most complete data accross time. most countries have great data since 2017

In [None]:
data

In [None]:
codes = ['0306.22.10', '0306.32.10']



In [None]:
arima_test = filtered_df[filtered_df['Commodity'].str[:10].isin(codes)].sort_values(by='Period')
arima_test.groupby('Country').count().sort_values(by='Period', ascending = False)
# arima_test

In [None]:
# filter for one country --- belgium as example
arima_test = arima_test.loc[arima_test['Country'] == 'Belgium']

In [None]:
# drop cextra columns
arima_test.drop(columns=['Country', 'Value ($)'], inplace = True)
arima_test

In [None]:
# count records by year, check that there is no overlap
arima_test.groupby(arima_test['Period'].dt.year).count()

In [None]:
# arima_test = arima_test.drop(columns=['Commodity'])
arima_test = arima_test.reset_index(drop=True)
arima_test

In [None]:
import seaborn as sns

In [None]:
sns.lineplot(x = arima_test['Period'], y=arima_test['Quantity'])

In [None]:
from pandas import datetime
from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot

def parser(x):
 return datetime.strptime('190'+x, '%Y-%m')

series = read_csv('shampoo-sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
autocorrelation_plot(series)
pyplot.show()

In [None]:
japan_data = japan_data[['Period','Quantity']]
japan_data

In [None]:

japan_data = japan_data.groupby('Period').sum(['Quantity', 'Value ($)']).reset_index(drop=False)
japan_data['Value per kg'] = japan_data['Value ($)'] / japan_data['Quantity']
japan_data

In [None]:
japan_data['Year'] = japan_data['Period'].dt.year
japan_data



### other

In [None]:
japan_data_recent = japan_data.loc[japan_data['Period'] > '2020', :].reset_index(drop = True)
japan_data_recent

In [None]:
sns.lineplot(x=japan_data_recent['Year'], y=japan_data_recent['Quantity'])

In [None]:
data

In [None]:
sns.lineplot(x=data['Period'], y=data['Value ($)'])