In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import font_manager as fm
fm.findfont('Helvetica')

'/System/Library/Fonts/Helvetica.ttc'

In [17]:
linelist = pd.read_csv("../data/epi-data/epi-surveillance-data-cases-only-abridged.csv")
linelist

Unnamed: 0,StateID,County,SymptomOnsetDate,Serum1Collected,FirstSerumCollectionDate,DateOfDiagResults,CollectionToResultLag
0,16VI0001,Saint Croix,2016-01-03,True,2016-01-08,2016-01-27,19.0
1,16VI0002,Saint Croix,2016-01-06,False,,,
2,16VI0017,Saint Croix,2016-01-10,True,2016-01-26,2016-02-22,27.0
3,16VI1167,Saint Croix,2016-01-10,True,2016-07-15,2016-08-22,38.0
4,16VI0024,Saint Croix,2016-01-24,True,2016-01-29,2016-02-18,20.0
...,...,...,...,...,...,...,...
1308,17VI26305,Saint Thomas,2017-04-25,False,,,
1309,17VI26382,Saint Thomas,2017-04-25,False,,,
1310,17VI26353,Saint Thomas,2017-05-16,False,,,
1311,17VI26359,Saint Thomas,2017-05-18,False,,,


In [18]:
#make an index that goes day by day from the first reported symptom onset date to the last one.
#these will be our "bins" for counting up the number of new cases per day.
day_index = pd.date_range(linelist["SymptomOnsetDate"].min(), linelist["SymptomOnsetDate"].max(), freq='d')

print(("There are {} days between the first case and last case in our dataset.").format(len(day_index)))
print(("The first case was recorded on {}.").format(min(day_index)))
print(("The last case was recorded on {}.").format(max(day_index)))

There are 507 days between the first case and last case in our dataset.
The first case was recorded on 2016-01-03 00:00:00.
The last case was recorded on 2017-05-23 00:00:00.


In [35]:
#to make the epi curve we need to transform the linelist into counts of number of cases that happened on a day
#the df below has a record for every date that had a recorded case, but DOES NOT have a record for a date where no
#cases were detected. Which means that this df doesn't contain every date in the range the way the index df above does.
case_counts_per_day = pd.DataFrame(linelist["SymptomOnsetDate"].value_counts().sort_index())

In [42]:
#then, make those above case counts contiguous with the full range of dates between the first and last case.
#this in essence fits the daily case counts into the date range index, filling in 0's for dates that had no cases.

empty_curve = case_counts_per_day.reindex(day_index).fillna(0) #make df for every day in range, set case counts at 0
all_islands_epi_curve_daily = empty_curve.add(case_counts_per_day).fillna(0) #add case counts from actual data if date has cases
all_islands_epi_curve_daily = all_islands_epi_curve_daily.rename(columns={"SymptomOnsetDate":"caseCount"})
print(all_islands_epi_curve_daily)


            caseCount
2016-01-03        1.0
2016-01-04        0.0
2016-01-05        0.0
2016-01-06        1.0
2016-01-07        0.0
...               ...
2017-05-19        1.0
2017-05-20        0.0
2017-05-21        0.0
2017-05-22        0.0
2017-05-23        1.0

[507 rows x 1 columns]


In [43]:
#sometimes rather than having case counts per day we want case counts per week,
#so here I've just grouped the case counts from days into weeks.
#the date represents the END of the week. So for the case count represented by 2016-01-10,
#that case count represents all cases that happened between 2016-01-04 up to end of day 2016-01-10.
#in this case there's 3, because there was a case on 2016-01-06 and two on 2016-01-10.
all_islands_epi_curve_weekly = all_islands_epi_curve_daily.resample('W', closed="right").sum()
all_islands_epi_curve_weekly

Unnamed: 0,caseCount
2016-01-03,1.0
2016-01-10,3.0
2016-01-17,0.0
2016-01-24,2.0
2016-01-31,3.0
...,...
2017-04-30,4.0
2017-05-07,3.0
2017-05-14,2.0
2017-05-21,4.0


In [46]:
all_islands_epi_curve_daily.to_csv("../data/epi-data/daily_incidence_all_islands.csv", index_label="date")
all_islands_epi_curve_weekly.to_csv("../data/epi-data/weekly_incidence_all_islands.csv", index_label="EndOfWeekDate")

In [41]:
# Going to put all of the above processes into a function 
# since I need to do this again for each of the three islands on their own.

def linelist_to_epicurve(linelist, date_column_name, frequency):
    #get index object for every day between first and last cases in linelist
    day_index = pd.date_range(linelist[date_column_name].min(), linelist[date_column_name].max(), freq="d")
    #get total case counts for each day that has one or more reported cases
    case_counts_per_day = pd.DataFrame(linelist[date_column_name].value_counts().sort_index())
    #make a epi curve based off of the index object. Have a column for every day in the range, set cases at 0.
    empty_curve = case_counts_per_day.reindex(day_index).fillna(0)
    #then add the case counts to that df
    epi_curve = empty_curve.add(case_counts_per_day).fillna(0).sum()
    
    #don't change a thing if we want daily counts, but if we want weekly, then group the counts by week
    if frequency == "w":
        epi_curve = epi_curve.resample('W', closed="right").sum()
    print(epi_curve)
    return epi_curve

In [42]:
all_islands_linelist = pd.read_csv("../data/epi-data/epi-surveillance-data-cases-only-abridged.csv")

trial = linelist_to_epicurve(all_islands_linelist, "SymptomOnsetDate", "w")
trial

            SymptomOnsetDate
2016-01-03               1.0
2016-01-10               2.0
2016-01-17               0.0
2016-01-24               1.0
2016-01-31               0.0
...                      ...
2017-04-30               0.0
2017-05-07               0.0
2017-05-14               1.0
2017-05-21               0.0
2017-05-28               0.0

[74 rows x 1 columns]


Unnamed: 0,SymptomOnsetDate
2016-01-03,1.0
2016-01-10,2.0
2016-01-17,0.0
2016-01-24,1.0
2016-01-31,0.0
...,...
2017-04-30,0.0
2017-05-07,0.0
2017-05-14,1.0
2017-05-21,0.0
