In [12]:
# Import libraries

import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET

#pd.options.display.max_columns = None
#pd.options.display.max_rows = None

In [13]:
# Load the World Bank international tourism arrivals CSV export.
# header=2 tells pandas which parsed row holds the column names; the rows
# before it are not treated as data.

arrivals_csv_path = '../data/raw/API_ST.INT.ARVL_DS2_en_csv_v2_1740745.csv'
df_arrivals_original = pd.read_csv(arrivals_csv_path, header=2)

# Keep the parsed frame untouched and do all further work on a copy
df_arrivals = df_arrivals_original.copy()

df_arrivals.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Aruba,ABW,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,...,904000.0,979000.0,1072000.0,1225000.0,1102000.0,1070500.0,1082000.0,,,
1,Afghanistan,AFG,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,...,,,,,,,,,,
2,Angola,AGO,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,...,528000.0,650000.0,595000.0,592000.0,397000.0,261000.0,218000.0,,,
3,Albania,ALB,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,...,3156000.0,2857000.0,3341000.0,3784000.0,4070000.0,4643000.0,5340000.0,,,
4,Andorra,AND,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,...,2238000.0,2328000.0,2363000.0,2663000.0,2819000.0,3003000.0,3042000.0,,,


In [14]:
# Filter Tourism Arrival to the identifier columns plus the analysis years
# (2012 through 2018 inclusive)

years_keep = [str(year) for year in range(2012, 2019)]  # year columns are labelled with strings
cols_keep = ['Country Name', 'Country Code'] + years_keep

# Start from the untouched raw frame instead of from df_arrivals itself so this
# cell can be re-run safely: after the melt below df_arrivals no longer has the
# wide year columns, and selecting them again would raise a KeyError.
# Countries (rows) with a missing value in any kept column are dropped.

df_arrivals = df_arrivals_original[cols_keep].dropna()

# Convert from wide to long format: one row per (country, year).
# The melted columns keep pandas' default names 'variable' (the year label)
# and 'value' (the arrivals figure).

df_arrivals = pd.melt(df_arrivals, id_vars=['Country Name', 'Country Code'], value_vars=years_keep)

df_arrivals.head()

Unnamed: 0,Country Name,Country Code,variable,value
0,Aruba,ABW,2012,904000.0
1,Angola,AGO,2012,528000.0
2,Albania,ALB,2012,3156000.0
3,Andorra,AND,2012,2238000.0
4,Arab World,ARB,2012,84844590.0


In [15]:
# Import World Bank Population XML file

# XML parsing code adapted from:
# https://stackabuse.com/reading-and-writing-xml-files-in-python-with-pandas/

# Read the file as bytes inside a context manager: the handle is closed as soon
# as the read finishes, and the XML parser decodes the document itself instead
# of relying on the platform's default text encoding.
with open('../data/raw/API_SP.POP.TOTL_DS2_en_xml_v2_1741834.xml', 'rb') as xml_file:
    xml_data = xml_file.read()

root = ET.XML(xml_data)  # Parse XML

# One row per child element of root[0]; each row holds the text of that
# element's sub-elements (a sub-element without text contributes None).
data = [[subchild.text for subchild in child] for child in root[0]]

df_population_original = pd.DataFrame(data)  # Write in DF
df_population = df_population_original.copy()

# Assumes every record has exactly these four fields in this order; the
# assignment raises if the number of columns differs.
df_population.columns = ['Country Name', 'Indicator Name', 'Year', 'Population']
df_population = df_population[['Country Name', 'Year', 'Population']]

# Long -> wide: one row per country, one column per year.
# Values come straight from the XML text, so they are still strings here.
df_population = df_population.pivot(index='Country Name', columns='Year', values='Population').reset_index()

df_population.head()

## NOTE: These results are alphabetized by country name, not by code like the
## tourism arrival CSV file above

Year,Country Name,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Afghanistan,8996973,9169410,9351441,9543205,9744781,9956320,10174836,10399926,10637063,...,30117413,31161376,32269589,33370794,34413603,35383128,36296400,37172386,38041754,
1,Albania,1608800,1659800,1711319,1762621,1814135,1864791,1914573,1965598,2022272,...,2905195,2900401,2895092,2889104,2880703,2876101,2873457,2866376,2854191,
2,Algeria,11057863,11336339,11619828,11912803,12221675,12550885,12902627,13275026,13663583,...,36661444,37383887,38140132,38923687,39728025,40551404,41389198,42228429,43053054,
3,American Samoa,20123,20602,21253,22034,22854,23672,24462,25248,25989,...,55759,55667,55713,55791,55812,55741,55620,55465,55312,
4,Andorra,13411,14375,15370,16412,17469,18549,19647,20758,21890,...,83747,82427,80774,79213,78011,77297,77001,77006,77142,


In [16]:
# Filter Population to the country name plus the analysis years

cols_keep = ['Country Name'] + years_keep
df_population = df_population[cols_keep]

# Drop countries (rows) with any missing values in relevant years

df_population = df_population.dropna()

# Convert from wide to long format: one row per (country, year).
# var_name is given explicitly because the join with the arrivals data uses a
# 'Year' column as its key; without it the column name would depend on whatever
# name the frame's column index happens to carry.

df_population = pd.melt(df_population, id_vars=['Country Name'], value_vars=years_keep, var_name='Year')

df_population.head()

Unnamed: 0,Country Name,Year,value
0,Afghanistan,2012,31161376
1,Albania,2012,2900401
2,Algeria,2012,37383887
3,American Samoa,2012,55667
4,Andorra,2012,82427


In [17]:
# Attach population figures to every (country, year) arrivals row.
# Left join: all arrivals rows are kept; rows without a matching population
# record end up with missing population values.

merged_arrivals_population = df_arrivals.merge(
    df_population,
    how='left',
    left_on=['Country Name', 'variable'],
    right_on=['Country Name', 'Year'],
)

# Rename to the output schema and keep only those columns.
# 'value_x' / 'value_y' are the suffixed 'value' columns of the left (arrivals)
# and right (population) inputs respectively.

column_renames = {
    "Country Name": "country_txt_wb",
    "Country Code": "country_code",
    "variable": "year_wb",
    "value_x": "arrivals",
    "value_y": "population",
}
output_columns = ["country_txt_wb", "country_code", "year_wb", "arrivals", "population"]

tourism_pop_df = merged_arrivals_population.rename(columns=column_renames)[output_columns]

tourism_pop_df.head()

Unnamed: 0,country_txt_wb,country_code,year_wb,arrivals,population
0,Aruba,ABW,2012,904000.0,102560
1,Angola,AGO,2012,528000.0,25107931
2,Albania,ALB,2012,3156000.0,2900401
3,Andorra,AND,2012,2238000.0,82427
4,Arab World,ARB,2012,84844590.0,371443547


In [18]:
# Load the lookup table that maps GTD country IDs to World Bank country codes

mapping_csv_path = '../data/raw/gtd_to_wb_country_mapping.csv'
df_country_mapping_original = pd.read_csv(mapping_csv_path, header=0)

# Work on a copy with the first data row removed.
# NOTE(review): why the first row is discarded is not visible from this
# notebook -- presumably it is a placeholder / non-country row; confirm
# against the CSV.
first_row_label = df_country_mapping_original.index[0]
df_country_mapping = df_country_mapping_original.copy().drop(first_row_label)

df_country_mapping.head()

Unnamed: 0,country_id,gtd_name,wb_country_code,wb_name
1,4.0,Afghanistan,AFG,Afghanistan
2,5.0,Albania,ALB,Albania
3,6.0,Algeria,DZA,Algeria
4,7.0,Andorra,AND,Andorra
5,8.0,Angola,AGO,Angola


In [19]:
# Attach GTD country IDs to the tourism + population frame
# (left join on the World Bank country code)

tourism_pop_df = tourism_pop_df.merge(
    df_country_mapping,
    how='left',
    left_on=['country_code'],
    right_on=['wb_country_code'],
)

# Discard the mapping's helper columns, then put the remaining columns in
# their output order

mapping_only_columns = ['gtd_name', 'wb_country_code', 'wb_name']
ordered_columns = ['year_wb', 'country_id', 'country_txt_wb', 'country_code', 'arrivals', 'population']
tourism_pop_df = tourism_pop_df.drop(columns=mapping_only_columns)[ordered_columns]

# Drop rows with a missing value in ANY remaining column, then renumber the
# index. This removes WB entries that have no GTD country_id, and likewise any
# row left without a population value by the earlier left join.

tourism_pop_df = tourism_pop_df.dropna().reset_index(drop=True)

# Cast the numeric columns to integers, going through float first
# (presumably because some of these columns hold numeric strings or values
# such as 4.0 that a direct int cast would not accept -- TODO confirm)

for column_name in ('year_wb', 'country_id', 'arrivals', 'population'):
    tourism_pop_df[column_name] = tourism_pop_df[column_name].astype(float).astype(int)

tourism_pop_df.head()

Unnamed: 0,year_wb,country_id,country_txt_wb,country_code,arrivals,population
0,2012,8,Angola,AGO,528000,25107931
1,2012,5,Albania,ALB,3156000,2900401
2,2012,7,Andorra,AND,2238000,82427
3,2012,11,Argentina,ARG,6497000,41733271
4,2012,12,Armenia,ARM,963000,2884229


In [20]:
# Add country rank of Tourism, from largest to smallest

# First create rank df from the 2012 rows only.
# The .copy() is taken AFTER filtering so that adding the rank column below
# writes to an independent frame; assigning a column to a filtered slice can
# trigger pandas' SettingWithCopyWarning.

country_rank_df = tourism_pop_df[tourism_pop_df['year_wb'] == 2012].copy()

# method='min' gives tied countries the same whole-number rank. The default
# method ('average') produces fractional ranks for ties (e.g. 3.5), which the
# integer cast would silently truncate.
country_rank_df['tourism_rank_2012'] = (
    country_rank_df['arrivals'].rank(method='min', ascending=False).astype(int)
)
country_rank_df = country_rank_df[['country_txt_wb', 'tourism_rank_2012']]
country_rank_df = country_rank_df.set_index('country_txt_wb')

# Then create rank dictionary for mapping rank: {country name -> 2012 rank}

country_rank_dict = country_rank_df['tourism_rank_2012'].to_dict()

In [21]:
# Look up every row's country name in the 2012 rank dictionary and store the
# result as a new column, so each year's row of a country carries that
# country's 2012 rank

rank_lookup = tourism_pop_df['country_txt_wb'].map(country_rank_dict)
tourism_pop_df['tourism_rank_2012'] = rank_lookup

tourism_pop_df.head()

Unnamed: 0,year_wb,country_id,country_txt_wb,country_code,arrivals,population,tourism_rank_2012
0,2012,8,Angola,AGO,528000,25107931,113
1,2012,5,Albania,ALB,3156000,2900401,57
2,2012,7,Andorra,AND,2238000,82427,73
3,2012,11,Argentina,ARG,6497000,41733271,43
4,2012,12,Armenia,ARM,963000,2884229,97


In [22]:
# Write final output of WB Tourism and Population data (with a matched GTD
# country_id) to a CSV file. The frame's index is written as the first column
# because to_csv is called with its default settings.

output_csv_path = '../data/derived/tourism_pop.csv'
tourism_pop_df.to_csv(output_csv_path)