In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from pprint import pprint

In [22]:
# Genotype frequency table: the index is used as the date axis further down,
# and every column is one genotype whose name starts with its serotype.
genotype_frequencies = pd.read_csv('../data/frequencies/south_america_frequencies.csv', index_col=0)
genotypes = genotype_frequencies.columns.values
serotypes = ['DENV1', 'DENV2', 'DENV3', 'DENV4']

# serotype -> names of the genotype columns that belong to it (prefix match).
constituent_genotypes = {
    serotype: [name for name in genotypes if name.startswith(serotype)]
    for serotype in serotypes
}

# serotype -> sub-table holding only that serotype's genotype columns.
tmp_serotype_frequencies = {
    serotype: genotype_frequencies[constituent_genotypes[serotype]]
    for serotype in serotypes
}

# A serotype's frequency at each date is the row-wise sum of its genotypes.
serotype_frequencies = {}
for serotype, constituent_frequencies in tmp_serotype_frequencies.items():
    row_totals = constituent_frequencies.sum(axis=1)
    serotype_frequencies[serotype] = row_totals
serotype_frequencies = pd.DataFrame(serotype_frequencies)

In [50]:
# Seasons as half-open [start, end) intervals in decimal years, mid-year to mid-year.
seasons = [(2004.5,2005.5), (2005.5,2006.5), (2006.5,2007.5), (2007.5,2008.5),(2008.5,2009.5),(2009.5,2010.5)]
def bin_seasons(df, seasons=seasons):
    """Average the rows of ``df`` within each season.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index values are compared numerically against the season
        bounds. The caller's frame is left untouched; all work is done on a copy.
    seasons : iterable of (start, end) pairs
        Half-open intervals ``start <= date < end``. Defaults to the
        module-level ``seasons`` list as it was when this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per season that received at least one input row, indexed by
        the ``(start, end)`` tuple, holding the column-wise mean of its rows.

    Notes
    -----
    Rows whose index falls in no season are discarded. Because the filter is
    ``dropna(how='any')``, rows with a missing value in any data column are
    discarded as well.
    """
    def find_season(date, seasons=seasons):
        """Return the first ``(start, end)`` season containing ``date``, else NaN."""
        return next(
            ((start, end) for (start, end) in seasons if start <= date < end),
            np.nan,
        )

    # Work on a copy so the helper column and the row filter never leak back
    # into the DataFrame the caller passed in.
    binned = df.copy()
    binned['season'] = [find_season(date, seasons) for date in binned.index.values]
    binned = binned.dropna(how='any')
    return binned.groupby('season').agg('mean')

# Overwrite both per-date frequency tables with their per-season means.
serotype_frequencies = bin_seasons(df=serotype_frequencies, seasons=seasons)
genotype_frequencies = bin_seasons(df=genotype_frequencies, seasons=seasons)

In [84]:
# Tab-separated incidence table (Gordon 2013, going by the file name);
# any row with a missing value is discarded.
incidence = pd.read_csv('../data/frequencies/nicaragua_incidence_gordon2013.txt', sep='\t', index_col=0)
incidence.dropna(inplace=True)
# Relabel the remaining rows with the season tuples. This assumes the rows are
# already in the same order as `seasons`; only the length is enforced by pandas.
incidence.index = seasons

# Infections per 1000 person-years at risk, for primary and secondary infections.
incidence['Primary Incidence per 1000 PY'] = incidence['Primary DENV infections'].div(
    incidence['Primary PY at Risk'].div(1000.)
)
incidence['Secondary Incidence per 1000 PY'] = incidence['Secondary DENV Infections'].div(
    incidence['Secondary PY at Risk'].div(1000.)
)
# DHF counts per 100 infections, pooling primary and secondary infections.
incidence['DHF Incidence per 100 Infections'] = incidence['DHF counts'].div(
    incidence['Primary DENV infections'].add(incidence['Secondary DENV Infections']).div(100)
)

In [96]:
# Apportion each season's primary infections among the serotypes in proportion
# to the serotype frequencies, keeping the person-time denominator alongside.
serotype_primary_counts = (
    serotype_frequencies
    .mul(incidence['Primary DENV infections'], axis=0)
    .assign(**{'PY at risk': incidence['Primary PY at Risk']})
)
# Per-serotype primary infections per person-year at risk.
serotype_primary_incidence = (
    serotype_primary_counts[serotypes]
    .div(serotype_primary_counts['PY at risk'], axis=0)
    .assign(**{'PY at risk': incidence['Primary PY at Risk']})
)

In [97]:
# Apportion each season's secondary infections among the serotypes in proportion
# to the serotype frequencies, keeping the person-time denominator alongside.
serotype_secondary_counts = (
    serotype_frequencies
    .mul(incidence['Secondary DENV Infections'], axis=0)
    .assign(**{'PY at risk': incidence['Secondary PY at Risk']})
)
# Per-serotype secondary infections per person-year at risk.
serotype_secondary_incidence = (
    serotype_secondary_counts[serotypes]
    .div(serotype_secondary_counts['PY at risk'], axis=0)
    .assign(**{'PY at risk': incidence['Secondary PY at Risk']})
)

In [99]:
def calc_cumulative_incidence(case_counts, PY_at_risk, first_season, last_season):
    """Return cumulative cases per person-year over a run of seasons.

    Parameters
    ----------
    case_counts : pandas.Series or dict
        Case counts keyed by season. Each key is indexed with ``[0]`` to get
        the season's start, e.g. a ``(start, end)`` tuple.
    PY_at_risk : number or pandas.Series
        Person-years at risk. A single number is used as the denominator as-is.
        Anything else is treated as a set of per-season values: they must be
        stable (see Raises) and their mean becomes the denominator.
    first_season, last_season : (start, end) tuples
        The window runs from ``first_season[0]`` (inclusive) to
        ``last_season[1]`` (exclusive); a season counts when its start falls
        inside that window.

    Returns
    -------
    float
        Sum of the case counts in the window divided by the person-years at risk.

    Raises
    ------
    AssertionError
        If ``PY_at_risk`` holds several values and the smallest is below 80%
        of the largest, so averaging them is not a fair approximation.
    """
    window_start = first_season[0]
    window_end = last_season[1]

    # `.items()` works for both pandas Series and plain dicts;
    # `Series.iteritems()` no longer exists in pandas >= 2.0.
    cumulative_cases = 0.
    for season, count in case_counts.items():
        if window_start <= season[0] < window_end:
            cumulative_cases += float(count)

    # Any plain number (ints included) is already the denominator.
    if isinstance(PY_at_risk, (int, float, np.integer, np.floating)):
        return cumulative_cases / PY_at_risk

    # This isn't an exactly closed population, but if the population size
    # varies by <= 20%, go ahead and use the average person-time at risk.
    # NOTE(review): the check and the mean use every value passed in, not only
    # the seasons inside the window -- confirm callers pre-slice if needed.
    if not (max(PY_at_risk) * .8 <= min(PY_at_risk)):
        # AssertionError is kept on purpose so existing callers see the same
        # exception type; raising explicitly means `python -O` cannot skip it.
        raise AssertionError(
            'PY_at_risk varies by more than 20%; cannot use its mean as the denominator'
        )
    return cumulative_cases / float(PY_at_risk.mean())