In [1]:
import pandas as pd
import numpy as np

In [2]:
# File paths for the data that will need to be merged.
# Every input lives under one data directory, so pointing the notebook at a
# different checkout only requires changing `data_dir`.
data_dir = "/Users/alliblk/Desktop/gitrepos/zika-usvi/data"

indexed_countries_path = data_dir + "/indexed-countries.tsv"
pop_sizes_path = data_dir + "/predictors/cia-world-fact-book-popsizes.txt"
paho_case_counts_path = data_dir + "/metadata/paho-cumul-case-2017-07-13.xls"
genome_counts_path = data_dir + "/all-zika-fauna-nextstrain-counts-2017-07-05.tsv"

In [3]:
# Load the tab-separated list of indexed countries; every later merge is
# done onto this frame.
master_df = pd.read_csv(indexed_countries_path, sep='\t')

In [4]:
# The files read here hold information we want, but for more countries than
# the set we care about. Each one is left-merged onto the indexed-countries
# frame so that only records from countries of interest are kept.
# NOTE: this cell reassigns master_df, so re-running it without first
# re-running the cell that loads master_df merges the same data a second time.

# country population sizes
pop_size_df = pd.read_table(pop_sizes_path)
del pop_size_df['rank']  # doesn't matter, part of world fact book export

# numbers of genomes available, either on fauna or on nextstrain
# (nextstrain sometimes has fewer than fauna)
genome_count_df = pd.read_table(genome_counts_path)

# merging
# Remember which columns were present before the merge so that the columns
# contributed by the genome counts file can be identified afterwards.
columns_before_merge = set(master_df.columns)
master_df = pd.merge(master_df, genome_count_df, on='country', how='left')

# Countries with no row in the genome counts file come out of the left merge
# as NaN; replace those with zeros. Only the columns added by the merge are
# filled, so a missing value in any pre-existing column is left as missing
# instead of being silently turned into 0.0.
genome_count_cols = [col for col in master_df.columns if col not in columns_before_merge]
master_df[genome_count_cols] = master_df[genome_count_cols].fillna(0.0)

master_df = pd.merge(master_df, pop_size_df, on='country', how='left')


In [5]:
# Show the merged frame and the dtype of each column.
# print is called with parentheses on both lines so the cell is valid on
# Python 3 as well as Python 2 (the output on Python 2 is unchanged).
print(master_df)
print(master_df.dtypes)

                             country           region  fauna_count  \
0                      united_states    north_america         41.0   
1                            bermuda    north_america          0.0   
2                             mexico    north_america         17.0   
3                             belize  central_america          0.0   
4                          guatemala  central_america          3.0   
5                           honduras  central_america         23.0   
6                        el_salvador  central_america          1.0   
7                          nicaragua  central_america          8.0   
8                         costa_rica  central_america          0.0   
9                             panama  central_america          4.0   
10                           bahamas        caribbean          0.0   
11                              cuba        caribbean          1.0   
12                  turks_and_caicos        caribbean          0.0   
13                  

In [6]:
# Display the rows whose pop_size is still missing after the merge: the CIA
# world fact book has no population size for these countries in our dataset.
# They will need to be looked up somewhere else (e.g. UN population size data)
# and entered manually.
master_df.loc[pd.isnull(master_df['pop_size'])]

Unnamed: 0,country,region,fauna_count,nextstrain_count,n_seqs_unpublished,pop_size
21,guadeloupe,caribbean,1.0,1.0,1.0,
23,martinique,caribbean,3.0,3.0,0.0,
31,french_guiana,south_america,1.0,1.0,0.0,


In [7]:
# Manually fill in the population sizes that weren't in the CIA factbook,
# using census data.
# Intuition for the pandas indexing below: .loc takes a row selector (here a
# boolean condition on the country name) and the column being edited;
# assigning to it sets that column for the selected rows only.
census_pop_sizes = [
    ('guadeloupe', 402119.0),     # source: https://www.insee.fr/fr/statistiques/1895182
    ('martinique', 385551.0),     # source: https://www.insee.fr/fr/statistiques/1895162
    ('french_guiana', 252338.0),  # source: https://www.insee.fr/fr/statistiques/1405599?geo=REG-03
]

for country_name, census_pop in census_pop_sizes:
    master_df.loc[master_df['country'] == country_name, 'pop_size'] = census_pop

In [7]:
# Write the genome count columns out as a tab-separated file.
# Only the columns listed below are written; pop_size is not among them.
genome_counts_out_path = '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/genome_counts.tsv'
genome_counts_out_columns = [
    'country',
    'region',
    'fauna_count',
    'nextstrain_count',
    'n_seqs_unpublished',
]
master_df.to_csv(
    path_or_buf=genome_counts_out_path,
    sep='\t',
    columns=genome_counts_out_columns,
)