In [1]:
# Three-country subset used to sanity-check the GLM indexing on a small matrix.
test_countries = [
    'united_states',
    'canada',
    'mexico',
]

In [49]:
import math
import numpy as np
import glob

Once linearized, the 1D predictor matrix should contain n × (n − 1) entries, where n is the number of countries (the side length of the square matrix). This is because the diagonal values (e.g. canada to canada) are removed from the linearized matrix before it is put in the xml. Thus, if you have 45 countries in your GLM, the linearized matrix should be 45 × 44 = 1980 entries long. 

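Note that indexing for GLM matrices in BEAST is also a little different. All entries above the diagonal are filled in first, row by row. Then the entries below the diagonal are filled in as the mirror image of the upper triangle: the cell in row j, column i (below the diagonal) gets the index of the cell in row i, column j (above the diagonal) plus n(n − 1)/2, which is what the indexing code in this notebook computes. For a 3 × 3 matrix this is the same as reading the lower triangle row by row; for larger matrices it corresponds to reading the lower triangle column by column.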

XXXXXXX| usa |canada| mexico
-------|-----|-------|-------
usa    |XXXXX|  0    | 1
canada |3    |XXXXXXX| 2
mexico |4    |  5    | XXXXXX

When linearized for the xml, the above predictor matrix takes the form of `[0, 1, 2, 3, 4, 5]`.

If we have a dictionary, where the index is the key, and the value is a tuple, in form `(origin, destination)`, then the dictionary should look like this:

`{0: ('usa', 'canada'), 1: ('usa', 'mexico'), 2: ('canada', 'mexico'), 3: ('canada', 'usa'), 4: ('mexico', 'usa'), 5: ('mexico', 'canada')}`

Below, I'm formatting things a little differently. I'm actually going to use a nested dictionary structure where the key is still the index, and the value is a dictionary that holds the (origin, destination) tuple under `'country_pair'` plus the predictor values as pulled from a tsv, keyed by predictor name.

In [143]:
# specify functions

def logMatrix(matrix):
    """Return a new list holding the natural log (np.log) of each entry of `matrix`.

    Entries are transformed one at a time and kept in their original order,
    so the output has the same length as the input.
    """
    transformed_matrix = []
    for value in matrix:
        transformed_matrix.append(np.log(value))
    return transformed_matrix

def standardizeMatrix(matrix):
    """Standardize `matrix` to zero mean and unit standard deviation.

    The mean and standard deviation are computed with np.nanmean / np.nanstd,
    i.e. NaN entries are ignored when estimating them.

    NaN entries are deliberately kept in the output (they simply stay NaN):
    the position of each value in the linearized matrix is its GLM index, so
    dropping entries would shift every later value onto the wrong country pair.
    The previous `value != float('nan')` condition was always True (NaN never
    compares equal to anything) and therefore never filtered; it has been
    removed so the code says what it does.

    Returns a list with the same length and order as `matrix`.
    """
    mean = np.nanmean(matrix)
    stdev = np.nanstd(matrix)
    return [(value - mean) / stdev for value in matrix]

def make_GLMmatrix(predictor_dict, predictor_name_string, log_transform=True):
    """Linearize one predictor in GLM index order, then transform it.

    predictor_dict: mapping of index -> dict of values for that country pair;
        indices 0 .. len(predictor_dict) - 1 are read in order.
    predictor_name_string: key of the predictor to pull from each entry.
    log_transform: when True (the default, and the original behaviour) the
        values are passed through logMatrix before being standardized. Pass
        False for predictors that can be zero or negative (for example an
        indicator coded as 1 / -1), since np.log of such values is -inf / nan
        and would wipe out the predictor.

    An index, or a predictor missing from an entry, is recorded as NaN so the
    remaining values keep their positions.

    Returns the standardized (and, by default, logged) values as a list.
    """
    linearized_predictor = []
    for i in range(len(predictor_dict)):
        try:
            linearized_predictor.append(predictor_dict[i][predictor_name_string])
        except KeyError:
            # no value for this pair: keep the slot so later indices stay aligned
            linearized_predictor.append(float('nan'))
    if log_transform:
        linearized_predictor = logMatrix(linearized_predictor)
    return standardizeMatrix(linearized_predictor)


### Initially try everything out on a test dataset.

I've made a little test tsv file, and have a small list of three countries that I want to set up predictors for. Just making sure here that everything is working the way I think it should on a dataset that's small enough that I can spot errors easily.

In [4]:
# code to make the indexing dictionaries, adapted from Gytis Dudas' EBOV iPython notebook.
# nothing crazy, just annoying math to deal with the bizarre GLM indexing.

# Build {index: {'country_pair': (origin, destination)}} for the test countries.
test_predictor_dict = {}
matrix_length = len(test_countries)

# number of cells on one side of the diagonal; n*(n-1) is always even
half_size = matrix_length * (matrix_length - 1) // 2

for i, origin in enumerate(test_countries):
    remaining = matrix_length - i
    # index of the first above-diagonal cell in row i
    row_start = half_size - remaining * (remaining - 1) // 2
    # only walk the countries after `origin`, so each unordered pair is visited once
    for step, destination in enumerate(test_countries[i + 1:]):
        index_1 = row_start + step        # above-diagonal cell (origin -> destination)
        index_2 = index_1 + half_size     # mirrored below-diagonal cell (destination -> origin)

        test_predictor_dict[index_1] = {'country_pair': (origin, destination)}
        test_predictor_dict[index_2] = {'country_pair': (destination, origin)}


In [6]:
# try importing test predictor set, and assigning great circle distances

test_infile = '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/predictors/predictor-tsv/test-predictors.tsv'

# Reverse lookup (origin, destination) -> GLM index, built once so every tsv row
# is matched with a single dict lookup instead of a scan over all indices.
test_pair_to_index = dict(
    (entry['country_pair'], index) for index, entry in test_predictor_dict.items()
)

# 'rU' = universal-newline read mode (Python 2)
with open(test_infile, 'rU') as predictor_file:
    for line in predictor_file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        fields = line.strip().split('\t')
        if line.startswith('origin'):  # this line is the header of the tsv
            predictor = fields[2]  # predictor name in the tsv is what it will be called in dict
        else:
            # columns are origin, destination, value
            index = test_pair_to_index.get((fields[0], fields[1]))
            if index is not None:  # 0 is a valid index, so compare against None explicitly
                test_predictor_dict[index][predictor] = float(fields[2])

In [7]:
# Show the populated test dictionary so it can be compared against the tsv by eye.
print(test_predictor_dict)

{0: {'country_pair': ('united_states', 'canada'), 'great_circle_dist_km': 1077.10699075}, 1: {'country_pair': ('united_states', 'mexico'), 'great_circle_dist_km': 1922.06469663}, 2: {'country_pair': ('canada', 'mexico'), 'great_circle_dist_km': 2997.40071088}, 3: {'country_pair': ('canada', 'united_states'), 'great_circle_dist_km': 1077.10699075}, 4: {'country_pair': ('mexico', 'united_states'), 'great_circle_dist_km': 1922.06469663}, 5: {'country_pair': ('mexico', 'canada'), 'great_circle_dist_km': 2997.40071088}}


In [139]:
# CHECK TO MAKE SURE EVERYTHING IS WORKING RIGHT!!! Should match up with indexing of markdown table above.

# Walk the indices in order, echoing each country pair while collecting the
# matching great circle distance into the linearized list.
test_linearized_predictor = []
n_test_entries = len(test_predictor_dict)

for index in range(n_test_entries):
    entry = test_predictor_dict[index]
    print(entry['country_pair'])
    test_linearized_predictor += [entry['great_circle_dist_km']]

print(test_linearized_predictor)

('united_states', 'canada')
('united_states', 'mexico')
('canada', 'mexico')
('canada', 'united_states')
('mexico', 'united_states')
('mexico', 'canada')
[1077.10699075, 1922.06469663, 2997.40071088, 1077.10699075, 1922.06469663, 2997.40071088]


In [140]:
#try out log transforming matrix

# Log-transform the linearized test predictor, then report its mean and
# standard deviation.
testLog_predictor = logMatrix(test_linearized_predictor)
print(testLog_predictor)  # yes, this looks right
for summarize in (np.mean, np.std):
    print(summarize(testLog_predictor))

[6.9820340136801056, 7.5611552500615309, 8.0055007623764105, 6.9820340136801056, 7.5611552500615309, 8.0055007623764105]
7.51623000871
0.41903440941


In [141]:
# Standardize the logged test predictor and display the result.
std_matrix = standardizeMatrix(testLog_predictor)
print(std_matrix)

[-1.2748260835608844, 0.10721134194857139, 1.1676147416123193, -1.2748260835608844, 0.10721134194857139, 1.1676147416123193]


In [142]:
#test that function that includes logging and standardizing does the same thing:
func_test_matrix = make_GLMmatrix(test_predictor_dict, 'great_circle_dist_km')
print(func_test_matrix)

# Compare EVERY entry of the two matrices. The matrices hold n*(n-1) values, so
# looping over len(test_countries) would only check the first half of them.
if len(std_matrix) != len(func_test_matrix):
    print('Houston we have a problem')
for i in range(min(len(std_matrix), len(func_test_matrix))):
    if std_matrix[i] != func_test_matrix[i]:
        print(i)
        print('Houston we have a problem')

[-1.2748260835608844, 0.10721134194857139, 1.1676147416123193, -1.2748260835608844, 0.10721134194857139, 1.1676147416123193]


### Testing looks good. Now we'll do it for the full set of 45 countries to include in the GLM. 

1) Import from indexed-countries-45.tsv to get the full list of countries that will be used.
2) Start by importing all countryXcountry great circle distances.

In [19]:
# Read the first column of the country index file, skipping the header row and
# any blank lines (a blank line would otherwise be recorded as an empty-string
# country and inflate the count).
with open('/Users/alliblk/Desktop/gitrepos/zika-usvi/data/indexed-countries-45.tsv') as countries_file:
    countries_list = []
    for line in countries_file:
        if not line.strip() or line.startswith('country'):
            continue
        countries_list.append(line.strip().split('\t')[0])

print(countries_list)  # this should be ordered north to south

print('\n {} countries included in analysis'.format(len(countries_list)))  # this should be 45

['canada', 'united_states', 'bermuda', 'mexico', 'belize', 'guatemala', 'honduras', 'el_salvador', 'nicaragua', 'costa_rica', 'panama', 'bahamas', 'cuba', 'turks_caicos_islands', 'cayman_islands', 'jamaica', 'haiti', 'dominican_republic', 'puerto_rico', 'united_states_virgin_islands', 'saint_kitts_nevis', 'antigua_barbuda', 'guadeloupe', 'dominica', 'martinique', 'saint_lucia', 'saint_vincent_grenadines', 'barbados', 'grenada', 'trinidad_tobago', 'curacao', 'aruba', 'french_guiana', 'suriname', 'guyana', 'venezuela', 'colombia', 'ecuador', 'peru', 'bolivia', 'brazil', 'paraguay', 'uruguay', 'argentina', 'chile']

 45 countries included in analysis


In [41]:
#make predictor dictionary. Index is key, value is dict with origin,destination tuple.

# Build {index: {'country_pair': (origin, destination)}} for the full country list.
predictor_dict = {}

matrix_length = len(countries_list)

# number of cells on one side of the diagonal; n*(n-1) is always even
half_size = matrix_length * (matrix_length - 1) // 2

for i, origin in enumerate(countries_list):
    remaining = matrix_length - i
    # index of the first above-diagonal cell in row i
    row_start = half_size - remaining * (remaining - 1) // 2
    # only walk the countries after `origin`, so each unordered pair is visited once
    for step, destination in enumerate(countries_list[i + 1:]):
        index_1 = row_start + step        # above-diagonal cell (origin -> destination)
        index_2 = index_1 + half_size     # mirrored below-diagonal cell (destination -> origin)

        predictor_dict[index_1] = {'country_pair': (origin, destination)}
        predictor_dict[index_2] = {'country_pair': (destination, origin)}

# every off-diagonal cell must have received exactly one index
assert len(predictor_dict) == len(countries_list)**2 - len(countries_list), 'predictor dictionary malformed'

#### As a reference, I'm going to write the index for each country pair to a tsv file.

In [78]:
# Write one "index<TAB>country pair" row per GLM index, in index order,
# under a header row.
mapping_row = '{}\t{}\n'
with open('/Users/alliblk/Desktop/gitrepos/zika-usvi/data/index-countrypair-mapping.tsv','w') as file:
    file.write(mapping_row.format('index', 'country_pair'))
    for index in range(len(predictor_dict)):
        pair = predictor_dict[index]['country_pair']
        file.write(mapping_row.format(index, pair))

#### Add predictor values to the dictionary based on key matching of the country pair tuple.

In [54]:
path = "/Users/alliblk/Desktop/gitrepos/zika-usvi/data/predictors/predictor-tsv/*.tsv"
test_tsv = '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/predictors/predictor-tsv/test-predictors.tsv'

# Reverse lookup (origin, destination) -> GLM index, built once so every tsv row
# is matched with a single dict lookup instead of a scan over all indices.
pair_to_index = dict(
    (entry['country_pair'], index) for index, entry in predictor_dict.items()
)

for fname in glob.glob(path):
    if fname == test_tsv:
        continue  # skip testing tsv
    # 'rU' = universal-newline read mode (Python 2)
    with open(fname, 'rU') as predictor_file:
        print('importing predictor from {}'.format(fname))
        for line in predictor_file:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline at end of file)
            fields = line.strip().split('\t')
            if line.startswith('origin'):  # this line is the header of the tsv
                predictor = fields[2]  # predictor name in the tsv is what it will be called in dict
            else:
                # columns are origin, destination, value
                index = pair_to_index.get((fields[0], fields[1]))
                if index is not None:  # 0 is a valid index, so compare against None explicitly
                    predictor_dict[index][predictor] = float(fields[2])

importing predictor from /Users/alliblk/Desktop/gitrepos/zika-usvi/data/predictors/predictor-tsv/destination-popsize.tsv
importing predictor from /Users/alliblk/Desktop/gitrepos/zika-usvi/data/predictors/predictor-tsv/great-circle-dists.tsv
importing predictor from /Users/alliblk/Desktop/gitrepos/zika-usvi/data/predictors/predictor-tsv/north-south-indicator.tsv
importing predictor from /Users/alliblk/Desktop/gitrepos/zika-usvi/data/predictors/predictor-tsv/origin-popsize.tsv
importing predictor from /Users/alliblk/Desktop/gitrepos/zika-usvi/data/predictors/predictor-tsv/pax-volume-snakecase.tsv


#### Do some manual checking to make sure that import went smoothly. Print output and look up values in tsv.

##### Note to self:
For the passenger volume data, if there was no volume between two locations, there was no entry in the tsv file. Just need to be careful then that when we write out the matrices we state `NaN` if `pax_volume` does not exist for a country pair. I'm going to make a tsv with these country pairs and their indices to keep track of them more easily.

In [80]:
#Check a few of the entries manually to make sure everything worked.
print(len(predictor_dict))
# spot-check a handful of indices, separated by blank lines for readability
for index in (0, 500, 1300, 274):
    print('\n')
    print(predictor_dict[index])
print('\n')

#example of entry with no pax_volume between the origin and the destination.
#use KeyError as condition.
# Print the whole entry rather than indexing ['pax_volume'] directly: that key
# is absent here, so the direct lookup raises KeyError and aborts the cell.
print(predictor_dict[1700])

1980


{'north_south_indicator': 1.0, 'destination_pop_size': 323995528.0, 'country_pair': ('canada', 'united_states'), 'great_circle_dist_km': 1077.10699075, 'origin_pop_size': 35362905.0, 'pax_volume': 12361348.0}


{'north_south_indicator': 1.0, 'destination_pop_size': 52329.0, 'country_pair': ('turks_caicos_islands', 'saint_kitts_nevis'), 'great_circle_dist_km': 1884.07804402, 'origin_pop_size': 51430.0, 'pax_volume': 45.0}


{'north_south_indicator': -1.0, 'destination_pop_size': 6156670.0, 'country_pair': ('aruba', 'el_salvador'), 'great_circle_dist_km': 2075.47088424, 'origin_pop_size': 113648.0, 'pax_volume': 125.0}


{'destination_pop_size': 252338.0, 'country_pair': ('honduras', 'french_guiana'), 'great_circle_dist_km': 3939.8419472, 'north_south_indicator': 1.0, 'origin_pop_size': 8893259.0}


{'destination_pop_size': 52329.0, 'country_pair': ('paraguay', 'saint_kitts_nevis'), 'great_circle_dist_km': 4749.19993604, 'north_south_indicator': -1.0, 'origin_pop_size': 6862812.0}

In [149]:
# make note of which country pairs do not have any travel between them, and their indices!

# List every index (and its country pair) whose entry has no 'pax_volume' value.
with open('/Users/alliblk/Desktop/gitrepos/zika-usvi/data/predictors/countries-without-pax-volume.tsv','w') as file:
    file.write('{}\t{}\n'.format('index', 'country_pair_without_pax_volume'))
    for index in range(len(predictor_dict)):
        entry = predictor_dict[index]
        if 'pax_volume' in entry:
            continue  # this pair has passenger volume data; nothing to record
        file.write('{}\t{}\n'.format(index, entry['country_pair']))

### Dictionary looks good, and values in dict match what I look up manually in the tsv files. Now to export the matrices for each predictor, log transform, and standardize.

For Tuesday: need to figure out what is breaking north_south_matrix transform.

In [155]:
# Linearize, log-transform and standardize each predictor.
gcd_transformed = make_GLMmatrix(predictor_dict,'great_circle_dist_km')
originPopSz_transformed = make_GLMmatrix(predictor_dict,'origin_pop_size')
destinPopSz_transformed = make_GLMmatrix(predictor_dict,'destination_pop_size')
pax_volume_transformed = make_GLMmatrix(predictor_dict, 'pax_volume')

# Export one file per predictor into the output directory. The previous
# `with open(<directory>, 'w'):` had no body and pointed at a directory rather
# than a file, so it could not run.
# NOTE(review): the file naming (<predictor>.txt) and the single space-separated
# line are choices made here; adjust to whatever the xml-building step expects.
output_dir = '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/predictors/transformed-linearized-matrices/'
transformed_matrices = [
    ('great_circle_dist_km', gcd_transformed),
    ('origin_pop_size', originPopSz_transformed),
    ('destination_pop_size', destinPopSz_transformed),
    ('pax_volume', pax_volume_transformed),
]
for predictor_name, transformed_matrix in transformed_matrices:
    with open(output_dir + predictor_name + '.txt', 'w') as matrix_file:
        # missing values are written explicitly as NaN so every index keeps its slot
        matrix_file.write(' '.join(
            'NaN' if np.isnan(value) else repr(float(value))
            for value in transformed_matrix
        ) + '\n')

[-0.84505511502939379, 0.21548402909924946, 0.40775586743969749, 0.50634161354634444, 0.61696738620070901, 0.61225131986801407, 0.65474130853847223, 0.70567340993535066, 0.79202988503799654, 0.84348082810185299, 0.21146112772662665, 0.35049625982852151, 0.47202716859598348, 0.45297175878924129, 0.54250357800709514, 0.57000157707956056, 0.60092770337070589, 0.68039655791555653, 0.70651915037242341, 0.90884992525427299, 0.78882317049990314, 0.81692298793805507, 0.84448791753816477, 0.87052455974249499, 0.89002911123151385, 0.9068348508582782, 0.93091709993708793, 0.92711936538944462, 0.97395261058058924, 0.8360548868841009, 0.81290401269595935, 1.2046086988231124, 1.1588830285499836, 1.1095597845519656, 0.90908626070686904, 0.96031577171400606, 1.1357015125660033, 1.3591318743202783, 1.5281629502527887, 1.6643479088215185]


In [154]:
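# Disabled for now: north_south_indicator takes the values 1.0 and -1.0 (see the
# entries printed above), and make_GLMmatrix log-transforms its input with np.log.
# The log of -1.0 is NaN, so the transformed predictor comes out as NaN.
#north_south_transformed = make_GLMmatrix(predictor_dict,'north_south_indicator')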


