In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# File paths for the input tables that are merged further down.
#
# DATA_DIR defaults to the original checkout location, so existing behaviour is
# unchanged; set the ZIKA_USVI_DATA_DIR environment variable to point the
# notebook at a data directory on another machine.
DATA_DIR = os.environ.get(
    "ZIKA_USVI_DATA_DIR",
    "/Users/alliblk/Desktop/gitrepos/zika-usvi/data",
)

indexed_countries_path = DATA_DIR + "/indexed-countries.tsv"
pop_sizes_path = DATA_DIR + "/predictors/cia-world-fact-book-popsizes.txt"
paho_case_counts_path = DATA_DIR + "/metadata/paho-cumul-case-2017-07-13.xls"
genome_counts_path = DATA_DIR + "/all-zika-fauna-nextstrain-counts-2017-07-05.tsv"

In [3]:
# Load the tab-separated table of indexed countries; every later merge is
# joined onto this frame.
master_df = pd.read_csv(indexed_countries_path, sep='\t')

In [4]:
# The auxiliary tables read here cover more countries than the set we care about.
# Left-merging each of them onto the indexed-countries frame keeps only the
# records for the countries of interest.
#
# NOTE: this cell reassigns master_df. Re-running it without first re-running
# the cell that loads master_df merges the same columns a second time.

# country population sizes
pop_size_df = pd.read_table(pop_sizes_path)
del pop_size_df['rank']  # doesn't matter, part of world fact book export

# numbers of genomes available, either on fauna, or on nextstrain
# (sometimes less than what's on fauna)
genome_count_df = pd.read_table(genome_counts_path)

# merge the genome counts onto the countries of interest
master_df = pd.merge(master_df, genome_count_df, on='country', how='left')

# Countries with no row in the genome-count table come out of the left merge
# as NaN; those mean "zero genomes". Only the genome-count columns are filled,
# so a missing value in any other column is not silently turned into 0.0.
genome_count_cols = [col for col in genome_count_df.columns if col != 'country']
master_df = master_df.fillna({col: 0.0 for col in genome_count_cols})

# merge the population sizes; countries missing from the fact book stay NaN
master_df = pd.merge(master_df, pop_size_df, on='country', how='left')


In [5]:
# Show the merged table and the dtype of each column.
# The call form of print works on both Python 2 and Python 3, and matches the
# second line, which already used it.
print(master_df)
print(master_df.dtypes)

    xml_index                           country           region  fauna_count  \
0           1                     united_states    north_america         41.0   
1           2                           bermuda    north_america          0.0   
2           3                            mexico    north_america         17.0   
3           4                            belize  central_america          0.0   
4           5                         guatemala  central_america          3.0   
5           6                          honduras  central_america         23.0   
6           7                       el_salvador  central_america          1.0   
7           8                         nicaragua  central_america          8.0   
8           9                        costa_rica  central_america          0.0   
9          10                            panama  central_america          4.0   
10         11                           bahamas        caribbean          0.0   
11         12               

In [8]:
# Rows whose pop_size is still missing after the merge, i.e. countries in our
# dataset that the CIA world fact book export has no population size for.
# These need to be looked up elsewhere (e.g. UN population size data) and
# entered manually.
master_df.loc[pd.isnull(master_df['pop_size'])]

Unnamed: 0,xml_index,country,region,pop_size,fauna_count,nextstrain_count,n_seqs_unpublished
24,25,bonaire_sint_eustatius_saba,caribbean,,,,
28,29,guadeloupe,caribbean,,1.0,1.0,1.0
30,31,martinique,caribbean,,3.0,3.0,0.0
38,39,french_guiana,south_america,,1.0,1.0,0.0


In [6]:
# Population sizes that weren't in the CIA factbook, entered by hand from
# census data. Each entry is (country, pop_size), with its source noted above it.
manual_pop_sizes = [
    # source: http://statline.cbs.nl/StatWeb/publication/?DM=SLNL&PA=80539ned&D1=0-1,9-10&D2=a&D3=a&HDR=T&STB=G1,G2&CHARTTYPE=1&VW=T
    ('bonaire_sint_eustatius_saba', 24593.0),
    # source: https://www.insee.fr/fr/statistiques/1895182
    ('guadeloupe', 402119.0),
    # source: https://www.insee.fr/fr/statistiques/1895162
    ('martinique', 385551.0),
    # source: https://www.insee.fr/fr/statistiques/1405599?geo=REG-03
    ('french_guiana', 252338.0),
]

# .loc[row_mask, column] = value: the boolean mask picks the row(s) for the
# country, the second argument names the column being overwritten.
for country_name, census_pop in manual_pop_sizes:
    master_df.loc[master_df['country'] == country_name, 'pop_size'] = census_pop

In [8]:
# Keep values only for countries with fewer than 50,000 residents (candidates
# for dropping from the analysis); every other row is masked out to NaN.
master_df.where(master_df['pop_size'].lt(50000))

Unnamed: 0,xml_index,country,region,fauna_count,nextstrain_count,n_seqs_unpublished,pop_size
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,,,,
6,,,,,,,
7,,,,,,,
8,,,,,,,
9,,,,,,,


In [15]:
# Print the rows for the two countries looked up by name here.
# The call form of print is valid on both Python 2 and Python 3.
print(master_df.loc[master_df['country'] == 'saint_kitts_and_nevis'])
print(master_df.loc[master_df['country'] == 'turks_and_caicos'])

    xml_index                country     region  fauna_count  \
25         26  saint_kitts_and_nevis  caribbean          0.0   

    nextstrain_count  n_seqs_unpublished  pop_size  
25               0.0                 0.0   52329.0  
    xml_index           country     region  fauna_count  nextstrain_count  \
12         13  turks_and_caicos  caribbean          0.0               0.0   

    n_seqs_unpublished  pop_size  
12                 0.0   51430.0  


In [7]:
# Export the per-country genome counts as a tab-separated file.
# Only the listed columns are written.
master_df.to_csv(
    '/Users/alliblk/Desktop/gitrepos/zika-usvi/data/genome_counts.tsv',
    sep='\t',
    columns=[
        'country',
        'region',
        'fauna_count',
        'nextstrain_count',
        'n_seqs_unpublished',
    ],
)