In [1]:
import pandas as pd
import numpy as np

## README: Nonpairwise predictors

Some of the predictors within the GLM are not inherently pairwise: they describe a single country rather than an origin-destination pair. As such, each predictor (e.g. a country's population size) will have one tsv file that gives the predictor value at the origin and another tsv file that gives that value at the destination.

#### Examples:
An origin tsv file (value of predictor comes from the origin country):

origin    | destination    |   value
----------|----------------|----------
canada    | mexico         |  200
canada    | united_states  |  200
canada    | guatemala      |  200

A destination tsv file (value of predictor comes from the destination country):

origin    | destination    |   value
----------|----------------|----------
canada    | mexico         |  125
canada    | united_states  |  265
canada    | guatemala      |  95

In [18]:
# Locations of the input tables that get merged further down.
# The indexed countries file is the 45-country version, i.e. the one with
# countries of < 50,000 inhabitants already dropped.
glm_data_dir = "/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm"
standardized_data_dir = glm_data_dir + "/standardized-data"

indexed_countries_path = glm_data_dir + "/indexed-countries-45.tsv"
pop_sizes_path = standardized_data_dir + "/std-population-size.tsv"
urban_pop_path = standardized_data_dir + "/std-urban-pop-fraction.tsv"
suitability_path = standardized_data_dir + "/std-zika-suitability.tsv"

In [19]:
# Load the predictor tables. These files cover more countries than the set we
# care about, so they are only used as the right-hand side of the merges below.
pop_size_df = pd.read_table(pop_sizes_path)        # country population sizes
urban_pop_df = pd.read_table(urban_pop_path)       # proportion of the population living in an urban area
suitability_df = pd.read_table(suitability_path)   # zika environmental suitability values

# Start from the indexed countries table and left-merge each predictor onto it
# by 'country', so that only records for the countries of interest are kept.
master_df = (
    pd.read_table(indexed_countries_path)
    .merge(pop_size_df, on='country', how='left')
    .merge(urban_pop_df, on='country', how='left')
    .merge(suitability_df, on='country', how='left')
)

In [20]:
# Rows that still have no population size after the merge.
# The CIA World Factbook lacks population sizes for these countries, and the
# World Bank lacks the urban population proportion for the same ones, so both
# values have to be looked up elsewhere (e.g. UN population size data) and
# entered manually.
master_df[master_df['population_size'].isna()]

Unnamed: 0,country,region,population_size,2016_urban_pop_fraction,suitability_score
22,guadeloupe,caribbean,,,0.80957
24,martinique,caribbean,,,0.869583
32,french_guiana,south_america,,,0.893181


In [21]:
# Rows that have no urban population fraction after the merge.
master_df[master_df['2016_urban_pop_fraction'].isna()]

Unnamed: 0,country,region,population_size,2016_urban_pop_fraction,suitability_score
22,guadeloupe,caribbean,,,0.80957
24,martinique,caribbean,,,0.869583
32,french_guiana,south_america,,,0.893181


In [22]:
# Manually fill in the population sizes that weren't in the CIA factbook, using census data.
manual_population_sizes = {
    'guadeloupe': 402119.0,     # source: https://www.insee.fr/fr/statistiques/1895182
    'martinique': 385551.0,     # source: https://www.insee.fr/fr/statistiques/1895162
    'french_guiana': 252338.0,  # source: https://www.insee.fr/fr/statistiques/1405599?geo=REG-03
}

# Intuition for the .loc indexing: the first argument is a boolean condition
# selecting the row(s), the second is the column being edited, and the
# right-hand side is the value written into those cells.
for country_name, population in manual_population_sizes.items():
    master_df.loc[master_df['country'] == country_name, 'population_size'] = population

In [23]:
# Manually fill in the proportion of the population living in urban areas, using UN data.
# Note that these estimates are actually from 2015 (not 2016 like the World Bank estimates).
# The estimate for Guadeloupe also includes Saint-Barthélemy and Saint-Martin (French part).
manual_urban_fractions = {
    'guadeloupe': 0.984,     # source: http://data.un.org/CountryProfile.aspx?crName=Guadeloupe
    'martinique': 0.889,     # source: http://data.un.org/CountryProfile.aspx?crName=Martinique
    'french_guiana': 0.844,  # source: http://data.un.org/CountryProfile.aspx?crName=French%20Guiana
}

for country_name, urban_fraction in manual_urban_fractions.items():
    master_df.loc[master_df['country'] == country_name, '2016_urban_pop_fraction'] = urban_fraction

In [24]:
# Display the merged predictor table (population size, urban fraction and
# suitability score per country) to inspect it after the manual fills.
master_df

Unnamed: 0,country,region,population_size,2016_urban_pop_fraction,suitability_score
0,canada,north_america,35623680.0,0.82006,0.0
1,united_states,north_america,326625791.0,0.81788,0.208427
2,bermuda,north_america,70864.0,1.0,0.099756
3,mexico,north_america,124574795.0,0.79517,0.306216
4,belize,central_america,360346.0,0.43845,0.804038
5,guatemala,central_america,15460732.0,0.5203,0.450047
6,honduras,central_america,9038741.0,0.55315,0.863965
7,el_salvador,central_america,6172011.0,0.67189,0.950706
8,nicaragua,central_america,6025951.0,0.59107,0.892162
9,costa_rica,central_america,4930258.0,0.77675,0.864093


In [26]:
# Check that all countries with pop sizes < 50,000 have been removed:
# this selection should come back empty.
master_df.loc[master_df['population_size'].lt(50000)]

Unnamed: 0,country,region,population_size,2016_urban_pop_fraction,suitability_score


In [28]:
# Write out origin/destination formatted tsv files.
# For every predictor two files are written: one where the value column holds
# the origin country's value and one where it holds the destination country's.


def write_origin_destination_tsv(df, value_column, value_header, outfile, value_from):
    """Write one tsv row per ordered (origin, destination) pair of countries.

    The file starts with the header ``origin<TAB>destination<TAB>value_header``.
    Every pair of entries in ``df['country']`` is then written in frame order,
    skipping pairs where origin and destination are equal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'country' column and the column named by `value_column`.
    value_column : str
        Column of `df` that the written value is read from.
    value_header : str
        Name written in the header line for the third column.
    outfile : str
        Path of the tsv file to create (overwritten if it already exists).
    value_from : str
        'origin' to write the origin country's value on each row,
        'destination' to write the destination country's value.

    Raises
    ------
    ValueError
        If `value_from` is neither 'origin' nor 'destination'.
    """
    if value_from not in ('origin', 'destination'):
        raise ValueError("value_from must be 'origin' or 'destination', got {!r}".format(value_from))

    # Look each country's value up once, instead of re-filtering the whole
    # frame for every (origin, destination) pair. setdefault keeps the first
    # row's value if a country appears more than once, matching a
    # "first matching row" lookup.
    value_by_country = {}
    for country, value in zip(df['country'].values, df[value_column].values):
        value_by_country.setdefault(country, value)

    with open(outfile, 'w') as handle:
        handle.write('{}\t{}\t{}\n'.format('origin', 'destination', value_header))
        for origin in df['country']:
            for destination in df['country']:
                if origin == destination:
                    continue  # a country is never paired with itself
                value_country = origin if value_from == 'origin' else destination
                handle.write('{}\t{}\t{}\n'.format(origin, destination, value_by_country[value_country]))


# population size
origin_popsize_outfile = '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/origin-destin-formatted-data/origin-population-size.tsv'
destin_popsize_outfile = '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/origin-destin-formatted-data/destination-population-size.tsv'

write_origin_destination_tsv(master_df, 'population_size', 'origin_population_size', origin_popsize_outfile, 'origin')
write_origin_destination_tsv(master_df, 'population_size', 'destination_population_size', destin_popsize_outfile, 'destination')

# proportion of the population living in urban areas
origin_urban_outfile = '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/origin-destin-formatted-data/origin-urban-proportion.tsv'
destin_urban_outfile = '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/origin-destin-formatted-data/destination-urban-proportion.tsv'

write_origin_destination_tsv(master_df, '2016_urban_pop_fraction', 'origin_urban_proportion', origin_urban_outfile, 'origin')
write_origin_destination_tsv(master_df, '2016_urban_pop_fraction', 'destination_urban_proportion', destin_urban_outfile, 'destination')

# zika environmental suitability
origin_suitability_outfile = '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/origin-destin-formatted-data/origin-suitability.tsv'
destin_suitability_outfile = '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/glm/origin-destin-formatted-data/destination-suitability.tsv'

write_origin_destination_tsv(master_df, 'suitability_score', 'origin_zika_suitability', origin_suitability_outfile, 'origin')
write_origin_destination_tsv(master_df, 'suitability_score', 'destination_zika_suitability', destin_suitability_outfile, 'destination')
          