In [338]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import datetime
import xml.etree.ElementTree as ET

# Show every column and every row when a DataFrame is displayed (no truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [339]:
# Load the World Bank international tourism arrivals export (CSV)

# header=2: row 2 (0-based) of the file supplies the column names
arrivals_csv = 'API_ST.INT.ARVL_DS2_en_csv_v2_1740745.csv'
df_arrivals_original = pd.read_csv(arrivals_csv, header=2)

# Work on a copy so the frame as loaded stays available unchanged
df_arrivals = df_arrivals_original.copy()

df_arrivals.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Aruba,ABW,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,619000.0,641000.0,650000.0,647000.0,683000.0,721000.0,691000.0,643000.0,642000.0,728000.0,733000.0,694000.0,772000.0,827000.0,813000.0,824000.0,869000.0,904000.0,979000.0,1072000.0,1225000.0,1102000.0,1070500.0,1082000.0,,,
1,Afghanistan,AFG,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Angola,AGO,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9000.0,21000.0,45000.0,52000.0,45000.0,51000.0,67000.0,91000.0,107000.0,194000.0,210000.0,121000.0,195000.0,294000.0,366000.0,425000.0,481000.0,528000.0,650000.0,595000.0,592000.0,397000.0,261000.0,218000.0,,,
3,Albania,ALB,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1062000.0,1247000.0,1711000.0,2191000.0,2469000.0,3156000.0,2857000.0,3341000.0,3784000.0,4070000.0,4643000.0,5340000.0,,,
4,Andorra,AND,"International tourism, number of arrivals",ST.INT.ARVL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2347000.0,2949000.0,3516000.0,3387000.0,3138000.0,2791000.0,2418000.0,2227000.0,2189000.0,2059000.0,1830000.0,1808000.0,2242000.0,2238000.0,2328000.0,2363000.0,2663000.0,2819000.0,3003000.0,3042000.0,,,


In [340]:
# Restrict the arrivals table to the identifier columns plus the years of interest

years_keep = [str(year) for year in range(2012, 2019)]  # '2012' .. '2018' (column labels are strings)
cols_keep = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'] + years_keep

# Keep only those columns, drop every row with a missing value in any of them,
# then reshape from wide (one column per year) to long (one row per country and year).
# melt names the new columns 'variable' (the year label) and 'value' (the arrivals figure).

df_arrivals = (
    df_arrivals[cols_keep]
    .dropna()
    .melt(id_vars=['Country Name', 'Country Code'], value_vars=years_keep)
)

df_arrivals.head()

Unnamed: 0,Country Name,Country Code,variable,value
0,Aruba,ABW,2012,904000.0
1,Angola,AGO,2012,528000.0
2,Albania,ALB,2012,3156000.0
3,Andorra,AND,2012,2238000.0
4,Arab World,ARB,2012,84844590.0


In [341]:
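#### TODO: use this cell to import the population data from the XML file once the XML parsing error is resolved; until then the population data is read from the CSV file in the next cell ####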

# Import World Bank Population XML file

#------------

# XML parsing code adapted from:
# https://stackabuse.com/reading-and-writing-xml-files-in-python-with-pandas/

# xml_data = open('API_SP.POP.TOTL_DS2_en_xml_v2_1741834.xml', 'r').read()  # Read file
# root = ET.XML(xml_data)  # Parse XML
# data = []
# xml_cols = []
# for i, child in enumerate(root):
#     data.append([subchild.text for subchild in child])
#     xml_cols.append(child.tag)

# df_population_original = pd.DataFrame(data).T  # Write in DF and transpose it
# df_population_original.columns = xml_cols  # Update column names
# df_population = df_population_original.copy()

# df_population.head()

#------------

# from lxml import objectify
# import pandas as pd

# xml_data = objectify.parse('test1.xml')  # Parse XML data
# root = xml_data.getroot()  # Root element

# data = []
# cols = []
# for i in range(len(root.getchildren())):
#     child = root.getchildren()[i]
#     data.append([subchild.text for subchild in child.getchildren()])
#     cols.append(child.tag)

# df = pd.DataFrame(data).T  # Create DataFrame and transpose it
# df.columns = cols  # Update column names
# print(df)

#-----------

# # import required modules 
# from bs4 import BeautifulSoup 
  
# # reading content 
# file = open("test1.xml", "r") 
# contents = file.read() 
  
# # parsing 
# soup = BeautifulSoup(contents, 'xml') 
# titles = soup.find_all('title') 
  
# # display content 
# for data in titles: 
#     print(data.get_text()) 

#-------------

# API_ST.INT.ARVL_DS2_en_csv_v2_1740745.csv
# API_SP.POP.TOTL_DS2_en_xml_v2_1741834.xml

#  df_population.head(300)

In [342]:
# Load the World Bank total population export (CSV)

# header=2: row 2 (0-based) of the file supplies the column names
population_csv = 'API_SP.POP.TOTL_DS2_en_csv_v2_1835328.csv'
df_population_original = pd.read_csv(population_csv, header=2)

# Work on a copy so the frame as loaded stays available unchanged
df_population = df_population_original.copy()

df_population.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Aruba,ABW,"Population, total",SP.POP.TOTL,54211.0,55438.0,56225.0,56695.0,57032.0,57360.0,57715.0,58055.0,58386.0,58726.0,59063.0,59440.0,59840.0,60243.0,60528.0,60657.0,60586.0,60366.0,60103.0,59980.0,60096.0,60567.0,61345.0,62201.0,62836.0,63026.0,62644.0,61833.0,61079.0,61032.0,62149.0,64622.0,68235.0,72504.0,76700.0,80324.0,83200.0,85451.0,87277.0,89005.0,90853.0,92898.0,94992.0,97017.0,98737.0,100031.0,100834.0,101222.0,101358.0,101455.0,101669.0,102046.0,102560.0,103159.0,103774.0,104341.0,104872.0,105366.0,105845.0,106314.0,,
1,Afghanistan,AFG,"Population, total",SP.POP.TOTL,8996973.0,9169410.0,9351441.0,9543205.0,9744781.0,9956320.0,10174836.0,10399926.0,10637063.0,10893776.0,11173642.0,11475445.0,11791215.0,12108963.0,12412950.0,12689160.0,12943093.0,13171306.0,13341198.0,13411056.0,13356511.0,13171673.0,12882528.0,12537730.0,12204292.0,11938208.0,11736179.0,11604534.0,11618005.0,11868877.0,12412308.0,13299017.0,14485546.0,15816603.0,17075727.0,18110657.0,18853437.0,19357126.0,19737765.0,20170844.0,20779953.0,21606988.0,22600770.0,23680871.0,24726684.0,25654277.0,26433049.0,27100536.0,27722276.0,28394813.0,29185507.0,30117413.0,31161376.0,32269589.0,33370794.0,34413603.0,35383128.0,36296400.0,37172386.0,38041754.0,,
2,Angola,AGO,"Population, total",SP.POP.TOTL,5454933.0,5531472.0,5608539.0,5679458.0,5735044.0,5770570.0,5781214.0,5774243.0,5771652.0,5803254.0,5890365.0,6040777.0,6248552.0,6496962.0,6761380.0,7024000.0,7279509.0,7533735.0,7790707.0,8058067.0,8341289.0,8640446.0,8952950.0,9278096.0,9614754.0,9961997.0,10320111.0,10689250.0,11068050.0,11454777.0,11848386.0,12248901.0,12657366.0,13075049.0,13503747.0,13945206.0,14400719.0,14871570.0,15359601.0,15866869.0,16395473.0,16945753.0,17519417.0,18121479.0,18758145.0,19433602.0,20149901.0,20905363.0,21695634.0,22514281.0,23356246.0,24220661.0,25107931.0,26015780.0,26941779.0,27884381.0,28842484.0,29816748.0,30809762.0,31825295.0,,
3,Albania,ALB,"Population, total",SP.POP.TOTL,1608800.0,1659800.0,1711319.0,1762621.0,1814135.0,1864791.0,1914573.0,1965598.0,2022272.0,2081695.0,2135479.0,2187853.0,2243126.0,2296752.0,2350124.0,2404831.0,2458526.0,2513546.0,2566266.0,2617832.0,2671997.0,2726056.0,2784278.0,2843960.0,2904429.0,2964762.0,3022635.0,3083605.0,3142336.0,3227943.0,3286542.0,3266790.0,3247039.0,3227287.0,3207536.0,3187784.0,3168033.0,3148281.0,3128530.0,3108778.0,3089027.0,3060173.0,3051010.0,3039616.0,3026939.0,3011487.0,2992547.0,2970017.0,2947314.0,2927519.0,2913021.0,2905195.0,2900401.0,2895092.0,2889104.0,2880703.0,2876101.0,2873457.0,2866376.0,2854191.0,,
4,Andorra,AND,"Population, total",SP.POP.TOTL,13411.0,14375.0,15370.0,16412.0,17469.0,18549.0,19647.0,20758.0,21890.0,23058.0,24276.0,25559.0,26892.0,28232.0,29520.0,30705.0,31777.0,32771.0,33737.0,34818.0,36067.0,37500.0,39114.0,40867.0,42706.0,44600.0,46517.0,48455.0,50434.0,52448.0,54509.0,56671.0,58888.0,60971.0,62677.0,63850.0,64360.0,64327.0,64142.0,64370.0,65390.0,67341.0,70049.0,73182.0,76244.0,78867.0,80993.0,82684.0,83862.0,84463.0,84449.0,83747.0,82427.0,80774.0,79213.0,78011.0,77297.0,77001.0,77006.0,77142.0,,


In [343]:
# Restrict the population table to the identifier columns plus the years of interest

years_keep = [str(year) for year in range(2012, 2019)]  # '2012' .. '2018' (column labels are strings)
cols_keep = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'] + years_keep

# Keep only those columns, drop every row with a missing value in any of them,
# then reshape from wide (one column per year) to long (one row per country and year).
# melt names the new columns 'variable' (the year label) and 'value' (the population figure).

df_population = (
    df_population[cols_keep]
    .dropna()
    .melt(id_vars=['Country Name', 'Country Code'], value_vars=years_keep)
)

df_population.head()

Unnamed: 0,Country Name,Country Code,variable,value
0,Aruba,ABW,2012,102560.0
1,Afghanistan,AFG,2012,31161376.0
2,Angola,AGO,2012,25107931.0
3,Albania,ALB,2012,2900401.0
4,Andorra,AND,2012,82427.0


In [344]:
# Combine arrivals with population on (country code, year).
# Left join: every arrivals row is kept; population is NaN where no match exists.

merge_keys = ['Country Code', 'variable']

# Columns present in both frames get pandas' default '_x' (left) / '_y' (right) suffixes,
# which is where 'Country Name_x', 'value_x' and 'value_y' come from.
column_renames = {
    "Country Name_x": "country_txt_wb",
    "Country Code": "country_code",
    "variable": "year_wb",
    "value_x": "arrivals",
    "value_y": "population",
}
output_columns = ["country_txt_wb", "country_code", "year_wb", "arrivals", "population"]

tourism_pop_df = (
    df_arrivals
    .merge(df_population, left_on=merge_keys, right_on=merge_keys, how='left')
    .rename(columns=column_renames)
)
tourism_pop_df = tourism_pop_df[output_columns]

tourism_pop_df.head()

Unnamed: 0,country_txt_wb,country_code,year_wb,arrivals,population
0,Aruba,ABW,2012,904000.0,102560.0
1,Angola,AGO,2012,528000.0,25107931.0
2,Albania,ALB,2012,3156000.0,2900401.0
3,Andorra,AND,2012,2238000.0,82427.0
4,Arab World,ARB,2012,84844590.0,371443547.0


In [345]:
# Load the lookup table that maps GTD country IDs to World Bank country codes (CSV)

df_country_mapping_original = pd.read_csv('gtd_to_wb_country_mapping.csv', header=0)

# Work on a copy and discard its first data row.
# NOTE(review): the reason for dropping the first row is not visible here;
# presumably it is a non-country/placeholder entry in the mapping file - confirm.
first_row_label = df_country_mapping_original.index[0]
df_country_mapping = df_country_mapping_original.copy().drop(index=first_row_label)

df_country_mapping.head()

Unnamed: 0,country_id,gtd_name,wb_country_code,wb_name
1,4.0,Afghanistan,AFG,Afghanistan
2,5.0,Albania,ALB,Albania
3,6.0,Algeria,DZA,Algeria
4,7.0,Andorra,AND,Andorra
5,8.0,Angola,AGO,Angola


In [346]:
# Attach the GTD country_id to each tourism + population row via the WB country code

final_columns = ['year_wb', 'country_id', 'country_txt_wb', 'country_code', 'arrivals', 'population']
integer_columns = ['year_wb', 'country_id', 'arrivals', 'population']

# Left join against the mapping, then discard the mapping's helper columns
tourism_pop_df = (
    tourism_pop_df
    .merge(df_country_mapping, left_on=['country_code'], right_on=['wb_country_code'], how='left')
    .drop(['gtd_name', 'wb_country_code', 'wb_name'], axis=1)
)

# Reorder the columns, drop every row that has a missing value in any column
# (e.g. WB country codes without a matching GTD country_id), renumber the index,
# and cast the numeric columns to integers (possible only once the NaNs are gone)
tourism_pop_df = (
    tourism_pop_df[final_columns]
    .dropna()
    .reset_index(drop=True)
    .astype({column: int for column in integer_columns})
)

tourism_pop_df.head(5000)

Unnamed: 0,year_wb,country_id,country_txt_wb,country_code,arrivals,population
0,2012,8,Angola,AGO,528000,25107931
1,2012,5,Albania,ALB,3156000,2900401
2,2012,7,Andorra,AND,2238000,82427
3,2012,11,Argentina,ARG,6497000,41733271
4,2012,12,Armenia,ARM,963000,2884229
5,2012,10,Antigua and Barbuda,ATG,247000,90409
6,2012,14,Australia,AUS,6032000,22733465
7,2012,15,Austria,AUT,24151000,8429991
8,2012,16,Azerbaijan,AZE,1986000,9295784
9,2012,21,Belgium,BEL,7560000,11106932


In [347]:
# Write the final WB tourism and population data (with a matched GTD country_id) to a CSV file

output_csv = 'tourism_pop.csv'
tourism_pop_df.to_csv(output_csv)  # default index=True: the DataFrame index is written as the first column