In [1]:
# Locations used for the small worked example; the order of this list is the
# order in which the locations are indexed further down.
test_countries = [
    'united_states',
    'canada',
    'mexico',
]

In [24]:
import math
import numpy as np

Once linearized, the 1D predictor matrix should be n × (n − 1) entries long, where n is the number of locations (the side length of the square matrix). This is because the diagonal values (e.g. canada to canada) are removed from the linearized matrix before it is put in the xml. Thus, if you have 45 countries in your GLM, the linearized matrix should be 45 × 44 = 1980 entries long.

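Note that indexing for GLM matrices in BEAST is also a little different. All entries above the diagonal are filled in first, row by row. Then the entries below the diagonal are filled in as the mirror image of that order (i.e. column by column), so each `(destination, origin)` entry sits exactly n × (n − 1) / 2 positions after its `(origin, destination)` counterpart. With only three locations, as in the table below, this is indistinguishable from filling the lower triangle row by row, but the two orders differ for larger matrices.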

XXXXXXX| usa |canada| mexico
-------|-----|-------|-------
usa    |XXXXX|  0    | 1
canada |3    |XXXXXXX| 2
mexico |4    |  5    | XXXXXX

When linearized for the xml, the above predictor matrix takes the form of `[0, 1, 2, 3, 4, 5]`.

If we have a dictionary, where the index is the key, and the value is a tuple, in form `(origin, destination)`, then the dictionary should look like this:

`{0: ('usa', 'canada'), 1: ('usa', 'mexico'), 2: ('canada', 'mexico'), 3: ('canada', 'usa'), 4: ('mexico', 'usa'), 5: ('mexico', 'canada')}`

Below, I'm formatting things a little differently. I'm going to use a nested dictionary structure where the key is the linearized index and the value is a dictionary that holds the `(origin, destination)` tuple under `'country_pair'`, plus the predictor values pulled from a tsv, each stored under its predictor name.

In [None]:
# specify functions

def logMatrix(matrix):
    """Natural-log transform every entry of a linearized predictor.

    matrix: iterable of numbers; each entry is passed to math.log, which
        raises ValueError for zero or negative values.
    Returns a new list of the transformed values, in the same order.
    """
    return list(map(math.log, matrix))

def standardizeMatrix(matrix):
    """Centre and scale a linearized predictor to z-scores.

    matrix: sequence of numbers (it is traversed more than once, so it must
        not be a one-shot iterator).
    Returns a new list holding (value - mean) / standard deviation for every
    entry, in the same order. The standard deviation comes from np.std with
    its default settings (population form). If every entry is identical the
    standard deviation is 0 and the division is by zero.
    """
    center = np.mean(matrix)
    spread = np.std(matrix)
    z_scores = []
    for entry in matrix:
        z_scores.append((entry - center) / spread)
    return z_scores

In [17]:
# code to make the indexing dictionaries, adapted from Gytis Dudas' EBOV iPython notebook.
# nothing crazy, just annoying math to deal with the bizarre GLM indexing.

# Map every linearized GLM index to its (origin, destination) pair.
# Pairs above the diagonal take indices 0 .. n(n-1)/2 - 1 in row order; the
# mirrored (destination, origin) pair sits exactly n(n-1)/2 slots later.
test_predictor_dict = {}
matrix_length = len(test_countries)
half_size = matrix_length * (matrix_length - 1) // 2  # entries per triangle

for i, origin in enumerate(test_countries):
    # Number of upper-triangle entries that belong to rows before row i.
    remaining = matrix_length - i
    row_start = half_size - remaining * (remaining - 1) // 2
    for j in range(i + 1, matrix_length):  # only columns right of the diagonal
        destination = test_countries[j]
        index_1 = row_start + (j - i - 1)
        index_2 = half_size + index_1

        test_predictor_dict[index_1] = {'country_pair': (origin, destination)}
        test_predictor_dict[index_2] = {'country_pair': (destination, origin)}


In [30]:
# try importing test predictor set, and assigning great circle distances

# Read the tab-separated predictor file (columns: origin, destination, value;
# the header row starts with 'origin') and attach each value to the matching
# country-pair entry of test_predictor_dict under the predictor's column name.
# NOTE(review): absolute, machine-specific path kept as-is; a path relative to
# the repository would make the notebook runnable elsewhere.
test_infile = '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/predictors/test-predictors.tsv'

# Reverse lookup (origin, destination) -> [index, ...], built once so each data
# row is matched directly instead of rescanning the whole dictionary.
pair_to_indices = {}
for key, entry in test_predictor_dict.items():
    pair_to_indices.setdefault(entry['country_pair'], []).append(key)

# Reset so a value left over from an earlier run of this cell is never reused.
predictor = None
# Plain 'r': the 'U' flag is deprecated in Python 3 and rejected from 3.11 on.
with open(test_infile, 'r') as infile:
    for line in infile:
        if not line.strip():
            continue  # tolerate blank lines, e.g. an empty last line
        fields = line.rstrip('\r\n').split('\t')
        if line.startswith('origin'):  # this line is the header of the tsv
            # predictor name in the tsv is what it will be called in the dict
            predictor = fields[2].strip()
            continue
        if predictor is None:
            raise ValueError('data row found before the header row in ' + test_infile)
        country_tuple = (fields[0], fields[1])  # (origin, destination)
        matching_keys = pair_to_indices.get(country_tuple)
        if matching_keys:
            # Rows for pairs that are not in the dictionary are ignored.
            value = float(fields[2])
            for key in matching_keys:
                test_predictor_dict[key][predictor] = value

In [31]:
# Show the populated dictionary. The parenthesised form runs on Python 2 and 3;
# the bare print statement is a syntax error on Python 3.
print(test_predictor_dict)

{0: {'country_pair': ('united_states', 'canada'), 'great_circle_dist_km': 1077.10699075}, 1: {'country_pair': ('united_states', 'mexico'), 'great_circle_dist_km': 1922.06469663}, 2: {'country_pair': ('canada', 'mexico'), 'great_circle_dist_km': 2997.40071088}, 3: {'country_pair': ('canada', 'united_states'), 'great_circle_dist_km': 1077.10699075}, 4: {'country_pair': ('mexico', 'united_states'), 'great_circle_dist_km': 1922.06469663}, 5: {'country_pair': ('mexico', 'canada'), 'great_circle_dist_km': 2997.40071088}}


In [32]:
# CHECK TO MAKE SURE EVERYTHING IS WORKING RIGHT!!! Should match up with indexing of markdown table above.

# Walk the indices 0 .. len-1 in order so the list position of every value is
# its linearized GLM index. A pair that never received a value from the tsv
# raises KeyError here rather than being skipped silently.
test_linearized_predictor = [
    test_predictor_dict[index]['great_circle_dist_km']
    for index in range(len(test_predictor_dict))
]

# Parenthesised print runs on both Python 2 and Python 3.
print(test_linearized_predictor)

[1077.10699075, 1922.06469663, 2997.40071088, 1077.10699075, 1922.06469663, 2997.40071088]


In [39]:
#try out log transforming matrix

# Log-transform the linearized predictor, then print its mean and standard
# deviation. Parenthesised print runs on both Python 2 and Python 3.
testLog_predictor = logMatrix(test_linearized_predictor)
print(testLog_predictor)  # yes, this looks right
print(np.mean(testLog_predictor))
print(np.std(testLog_predictor))

[6.982034013680106, 7.561155250061531, 8.00550076237641, 6.982034013680106, 7.561155250061531, 8.00550076237641]
7.51623000871
0.41903440941


In [43]:
# Standardize the log-transformed predictor.
# Parenthesised print runs on both Python 2 and Python 3.
std_matrix = standardizeMatrix(testLog_predictor)
print(std_matrix)

[-1.2748260835608844, 0.10721134194857139, 1.1676147416123193, -1.2748260835608844, 0.10721134194857139, 1.1676147416123193]
