In [60]:
#import libraries

import pandas as pd
import re

I think the best way to clean the HORRIFIC raw download of the CIA World Factbook population-size table is to go through it line by line: replace the irregular whitespace with a tab delimiter, convert the country name to snake_case (maybe, or leave that as a second step?), and write each line to a new TSV file.

In [61]:
# define functions
def camelcase_to_snakecase(name):
    """Normalize a name into the lowercase, underscore-separated key form.

    An underscore is inserted before each capitalized word and at every
    lowercase/digit-to-uppercase boundary. The result is lowercased, and
    then spaces, hyphens and any ',_the' fragment (e.g. the one produced
    by 'Bahamas, The') are deleted.

    Spaces are deleted rather than converted, so a word that starts with a
    lowercase letter fuses onto the preceding word:
    'Bosnia and Herzegovina' -> 'bosniaand_herzegovina'.

    Parameters
    ----------
    name : str
        The raw name to normalize.

    Returns
    -------
    str
        The normalized name.
    """
    # Break before capitalized words: 'UnitedStates' -> 'United_States'.
    marked = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Break at the remaining lower/digit -> upper transitions: 'getHTTP' -> 'get_HTTP'.
    marked = re.sub('([a-z0-9])([A-Z])', r'\1_\2', marked)
    snake = marked.lower()
    # Strip leftovers in this exact order: ',_the' only appears once the
    # space between the comma and the underscore has been removed.
    for junk in (' ', '-', ',_the'):
        snake = snake.replace(junk, '')
    return snake

In [62]:
# Load the three raw input tables.

# Suitability scores: column names are supplied via `names`, so the first
# line of the file is read as data rather than as a header.
suitability_df = pd.read_csv(
    '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/raw-data/americas-zika-suitability.tsv',
    sep='\t',
    names=['Country','suitability_score'],
)

# Americas-to-Americas traffic workbook (Excel).
pax_traffic_df = pd.read_excel(
    '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/raw-data/Americas-to-Americas-2012-2016_KK_DB.xlsx'
)

# Urban-population proportions; parsed as tab-delimited even though the
# file carries a .csv extension.
urban_pop_df = pd.read_csv(
    '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/raw-data/proportion-urban-population-worldbank.csv',
    sep='\t',
)

In [63]:
##### Standardize Country Naming #####

# Step 1: switch every country-name column to snake_case with
# camelcase_to_snakecase. This fixes most country names, but not all.
pax_traffic_df['originCountryName'] = pax_traffic_df['originCountryName'].apply(camelcase_to_snakecase)
pax_traffic_df['destinationCountryName'] = pax_traffic_df['destinationCountryName'].apply(camelcase_to_snakecase)
urban_pop_df['Country Name'] = urban_pop_df['Country Name'].apply(camelcase_to_snakecase)
suitability_df['Country'] = suitability_df['Country'].apply(camelcase_to_snakecase)

# Step 2: the remaining, more problematic names are corrected with a name-fix
# TSV file. It is read into a dictionary whose key is the name in its
# messed-up form and whose value is the standardized name.
# (dict(...values) requires the file to have exactly two columns.)
name_fix_df = pd.read_csv('/Users/alliblk/Desktop/gitrepos/zika-usvi/scripts/name_fix.tsv', sep='\t')
name_fix_dict = dict(name_fix_df.values)


def _fix_country_name(name):
    """Return the standardized spelling of `name`, or `name` unchanged if no fix is listed.

    Names of countries that are not part of the analysis are deliberately not
    in the fix table (e.g. bosniaand_herzegovina), so they pass through as is.
    """
    return name_fix_dict.get(name, name)


# Step 3: store the standardized names in new columns. The lookup only needs
# the name column itself, so it is mapped element-wise on that Series instead
# of looping over whole rows with DataFrame.apply(axis=1); this also yields a
# Series (not a DataFrame) when a frame happens to be empty.
urban_pop_df['country'] = urban_pop_df['Country Name'].map(_fix_country_name)
pax_traffic_df['fixedOriginName'] = pax_traffic_df['originCountryName'].map(_fix_country_name)
pax_traffic_df['fixedDestinName'] = pax_traffic_df['destinationCountryName'].map(_fix_country_name)
suitability_df['country'] = suitability_df['Country'].map(_fix_country_name)

In [64]:
##### Urban population: rescale the 2016 column from percent (0 to 100 scale)
##### to fraction (0 to 1 scale); this is the year of data we'll use.
urban_pop_df['2016_urban_pop_fraction'] = urban_pop_df['2016'].div(100.0)

In [65]:
# The raw data frames are now cleaned up. Write each one to its own TSV file,
# without the index and restricted to the listed columns.
for std_frame, out_path, kept_columns in [
    (
        suitability_df,
        '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/standardized-data/std-zika-suitability.tsv',
        ['country', 'suitability_score'],
    ),
    (
        pax_traffic_df,
        '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/standardized-data/std-pax-volume.tsv',
        ['fixedOriginName', 'fixedDestinName' , 'PAXVolume'],
    ),
    (
        urban_pop_df,
        '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/standardized-data/std-urban-pop-fraction.tsv',
        ['country','2016_urban_pop_fraction'],
    ),
]:
    std_frame.to_csv(out_path, index=False, columns=kept_columns, sep='\t')