# Setup

In [352]:
from __future__ import print_function, division
import pandas as pd
from datetime import datetime

# Data import

In [214]:
# Load the raw Ebola indicator table from the local CSV file.
df = pd.read_csv("data/ebola_data.csv")

In [215]:
# Spot-check three random rows of the freshly loaded data.
df.sample(3)

Unnamed: 0,Indicator,Country,Date,value
17473,"Cumulative number of confirmed, probable and s...",Nigeria,2015-12-29,20.0
6035,Cumulative number of probable Ebola cases,Sierra Leone,2015-04-09,287.0
3668,"Cumulative number of confirmed, probable and s...",Nigeria,2015-01-29,8.0


# Cleaning

## Split up Indicator

In [216]:
# Classify every row by what its Indicator text measures:
# 'CFR', 'death' (text mentions "deaths") or 'cases'.
# The checks are ordered; the first matching keyword wins.
outcome = []
for i in range(len(df.value)):
    indicator = df.Indicator[i]
    if 'CFR' in indicator:
        outcome.append('CFR')
    elif 'deaths' in indicator:
        outcome.append('death')
    elif 'cases' in indicator:
        outcome.append('cases')
    else:
        # The fallback used to reference an undefined name and died with a
        # NameError. Fail with a message that identifies the offending row;
        # silently skipping it would leave `outcome` shorter than `df`.
        raise ValueError('Unrecognised Indicator at row %d: %r' % (i, indicator))

In [217]:
# Ordered (substring, label) rules for the case-status part of Indicator.
# The first substring found wins; rows matching none are labelled 'suspected'.
_status_rules = (
    ('of confirmed Ebola', 'confirmed'),
    ('of probable Ebola', 'probable'),
    ('of confirmed, probable and suspected Ebola', 'all'),
)
status = [
    next((label for needle, label in _status_rules if needle in df.Indicator[row]),
         'suspected')
    for row in range(len(df.value))
]

In [218]:
# Extract the day window named in the Indicator text: 21, 7, or 0 when
# neither "21 days" nor "7 days" appears. "21 days" is tested first.
days = []
for row in range(len(df.value)):
    text = df.Indicator[row]
    window = 21 if '21 days' in text else (7 if '7 days' in text else 0)
    days.append(window)

In [219]:
# Attach the three lists derived from Indicator as new columns.
for column_name, column_values in (('outcome', outcome),
                                   ('status', status),
                                   ('days', days)):
    df[column_name] = column_values

In [220]:
# Indicator has been split into outcome/status/days, so remove it.
df = df.drop(['Indicator'], axis='columns')

In [221]:
# Spot-check three random rows after replacing Indicator with outcome/status/days.
df.sample(3)

Unnamed: 0,Country,Date,value,outcome,status,days
10044,Nigeria,2015-07-06,1.0,cases,probable,0
1990,Sierra Leone,2014-12-10,35.0,death,suspected,0
452,Guinea,2014-09-16,162.0,cases,probable,0


## Date

In [222]:
# Parse each 'YYYY-MM-DD' string in Date into a datetime object.
date = [datetime.strptime(df.Date[row], '%Y-%m-%d')
        for row in range(len(df.value))]

In [223]:
# Store the parsed datetime objects in a new lower-case 'date' column.
df['date'] = date

In [224]:
# Remove the original string-valued Date column.
df = df.drop(['Date'], axis='columns')

In [225]:
# Spot-check three random rows after the date conversion.
df.sample(3)

Unnamed: 0,Country,value,outcome,status,days,date
1144,Guinea,8.0,cases,confirmed,7,2014-10-22
12525,United Kingdom,1.0,cases,confirmed,0,2015-08-21
10582,Senegal,0.0,death,probable,0,2015-07-15


## Value

In [226]:
# Convert the value column to integers.
df['value'] = df.value.astype(int)

In [227]:
# Spot-check three random rows after casting value to int.
df.sample(3)

Unnamed: 0,Country,value,outcome,status,days,date
17527,Liberia 2,3,death,all,0,2016-03-23
15643,Guinea,3350,cases,confirmed,0,2015-10-27
2482,Guinea,2707,cases,all,0,2014-12-31


## Country

In [228]:
# Lower-case the Country column name to match the other column names.
df = df.rename(columns={'Country': 'country'})

In [229]:
# Spot-check three random rows after renaming Country to country.
df.sample(3)

Unnamed: 0,country,value,outcome,status,days,date
9285,Senegal,1,cases,confirmed,0,2015-06-19
13020,Sierra Leone,4624,cases,suspected,0,2015-09-01
8310,Italy,1,cases,confirmed,0,2015-06-01


# African countries

In [230]:
# Distinct country names present in the data, in order of first appearance.
countries = df['country'].unique().tolist()

In [231]:
# Country labels treated as African in this notebook, including the
# separately labelled 'Liberia 2' and 'Guinea 2' entries.
africa_ebola = [
    'Guinea', 'Liberia', 'Sierra Leone', 'Nigeria', 'Senegal', 'Mali',
    'Liberia 2', 'Guinea 2',
]

In [232]:
# [latitude, longitude] for each country label.
africa_lat_long = {
    'Guinea': [9.935430, -9.695052],
    'Liberia': [6.426983, -9.429671],
    'Sierra Leone': [8.460466, -11.779898],
    'Nigeria': [9.081746, 8.675196],
    'Senegal': [14.496320, -14.452312],
    'Mali': [17.570332, -3.996270],
}
# The '... 2' labels use the same coordinates as their base country;
# each gets its own list object.
africa_lat_long['Liberia 2'] = list(africa_lat_long['Liberia'])
africa_lat_long['Guinea 2'] = list(africa_lat_long['Guinea'])

In [233]:
# Keep only the rows whose country is one of the listed African labels.
in_africa = df.country.isin(africa_ebola)
africa_df = df.loc[in_africa]

In [234]:
# Spot-check three random rows of the Africa-only subset.
africa_df.sample(3)

Unnamed: 0,country,value,outcome,status,days,date
5614,Nigeria,20,cases,all,0,2015-03-20
14069,Nigeria,1,cases,probable,0,2015-09-21
16168,Nigeria,19,cases,confirmed,0,2015-11-11


## Handle "Liberia 2" and "Guinea 2": drop them

In [235]:
# Total of `value` over all 'Liberia 2' rows whose outcome is 'cases'.
liberia2_cases = (africa_df['country'] == "Liberia 2") & (africa_df['outcome'] == "cases")
africa_df.loc[liberia2_cases, 'value'].sum()

1470

In [236]:
# Exclude the 'Liberia 2' and 'Guinea 2' rows from the Africa subset.
africa_df = africa_df[~africa_df['country'].isin(["Liberia 2", "Guinea 2"])]

In [237]:
# Renumber rows 0..n-1 so the positional lookups below (africa_df.country[i]) work after filtering.
africa_df = africa_df.reset_index(drop=True)

## To Lat Long

In [238]:
# Look up the [lat, long] pair for every row's country, then split the
# pairs into two parallel lists.
coords = [africa_lat_long[africa_df.country[row]]
          for row in range(len(africa_df.value))]
lat = [pair[0] for pair in coords]
long = [pair[1] for pair in coords]

In [239]:
# Number of latitude entries built above (one per row of africa_df).
len(lat)

11368

In [240]:
# Attach the looked-up coordinates as new columns.
africa_df['lat'] = lat
africa_df['long'] = long

In [308]:
# Spot-check ten random rows now that lat/long are attached.
africa_df.sample(10)

Unnamed: 0,country,value,outcome,status,days,date,lat,long
384,Nigeria,7,death,confirmed,0,2014-09-26,9.081746,8.675196
4415,Nigeria,19,cases,confirmed,0,2015-09-09,9.081746,8.675196
3399,Nigeria,7,death,confirmed,0,2015-07-01,9.081746,8.675196
3651,Liberia,3151,cases,confirmed,0,2015-07-20,6.426983,-9.429671
2594,Senegal,0,cases,probable,0,2015-05-07,14.49632,-14.452312
4541,Nigeria,7,death,confirmed,0,2015-09-17,9.081746,8.675196
4445,Sierra Leone,8703,cases,confirmed,0,2015-09-11,8.460466,-11.779898
3822,Mali,1,cases,probable,0,2015-07-30,17.570332,-3.99627
363,Sierra Leone,34,death,probable,0,2014-09-24,8.460466,-11.779898
2454,Senegal,0,death,confirmed,0,2015-04-27,14.49632,-14.452312


# What are we counting?

We count confirmed and probable records, for both cases and deaths, i.e. two data points (confirmed and probable) per country and date for each outcome.

In [242]:
# List the distinct status labels present before filtering.
africa_df['status'].unique()

array(['confirmed', 'probable', 'suspected', 'all'], dtype=object)

In [243]:
# Keep only the 'confirmed' and 'probable' rows.
africa_df = africa_df[africa_df['status'].isin(["confirmed", "probable"])]

In [247]:
# Renumber rows 0..n-1 after the status filter.
africa_df = africa_df.reset_index(drop=True)

In [311]:
# Spot-check ten random rows after keeping only confirmed/probable records.
africa_df.sample(10)

Unnamed: 0,country,value,outcome,status,days,date,lat,long
3831,Sierra Leone,287,cases,probable,0,2015-07-31,8.460466,-11.779898
4108,Mali,1,cases,probable,0,2015-08-19,17.570332,-3.99627
4774,Guinea,2081,death,confirmed,0,2015-10-05,9.93543,-9.695052
2330,Nigeria,1,death,probable,0,2015-04-17,9.081746,8.675196
1928,Sierra Leone,287,cases,probable,0,2015-03-10,8.460466,-11.779898
3448,Sierra Leone,8673,cases,confirmed,0,2015-07-06,8.460466,-11.779898
4243,Guinea,2076,death,confirmed,0,2015-08-28,9.93543,-9.695052
2118,Mali,1,cases,probable,0,2015-03-24,17.570332,-3.99627
306,Guinea,435,death,confirmed,0,2014-09-18,9.93543,-9.695052
1145,Guinea,344,cases,confirmed,21,2015-01-07,9.93543,-9.695052


## Split up the days series

In [249]:
# List the distinct day-window values present.
africa_df['days'].unique()

array([ 0, 21,  7])

In [255]:
# Split africa_df into separate DataFrames, one per distinct value of `days`.

In [254]:
# Rows with days == 0.
africa_df_0 = africa_df.loc[africa_df['days'].eq(0)]
africa_df_0.shape

(5187, 8)

In [256]:
# Rows with days == 7.
africa_df_7 = africa_df.loc[africa_df['days'].eq(7)]
africa_df_7.shape

(24, 8)

In [257]:
# Rows with days == 21.
africa_df_21 = africa_df.loc[africa_df['days'].eq(21)]
africa_df_21.shape

(350, 8)

## Sum up confirmed and probable

In [279]:
# Leave out the CFR rows; only 'cases' and 'death' rows are summed below.
africa_df_0_sorting = africa_df_0.loc[africa_df_0['outcome'].ne('CFR')]

In [288]:
# `days` is constant (0) in this subset, so remove the column.
africa_df_0_sorting = africa_df_0_sorting.drop(['days'], axis='columns')

In [289]:
# Preview the first six rows that will be aggregated.
africa_df_0_sorting.head(6)

Unnamed: 0,country,value,outcome,status,date,lat,long
0,Guinea,482,cases,confirmed,2014-08-29,9.93543,-9.695052
1,Guinea,141,cases,probable,2014-08-29,9.93543,-9.695052
2,Liberia,322,cases,confirmed,2014-08-29,6.426983,-9.429671
3,Liberia,674,cases,probable,2014-08-29,6.426983,-9.429671
4,Sierra Leone,935,cases,confirmed,2014-08-29,8.460466,-11.779898
5,Sierra Leone,37,cases,probable,2014-08-29,8.460466,-11.779898


In [291]:
# Add the confirmed and probable values together for each
# (country, lat, long, outcome, date) group.
# Only `value` is selected for the sum: the remaining non-key column,
# `status`, is text, and summing it would concatenate strings on pandas
# versions where groupby.sum() no longer drops non-numeric columns.
# The double brackets keep the result a DataFrame with a single 'value' column.
africa_sorted = (
    africa_df_0_sorting
    .groupby(['country', 'lat', 'long', 'outcome', 'date'])[['value']]
    .sum()
)

In [295]:
# Turn the group keys (country, lat, long, outcome, date) back into ordinary columns.
africa_sorted = africa_sorted.reset_index()

In [318]:
# Save the aggregated table to disk (the row index is written too, since `index` is left at its default).
africa_sorted.to_csv("data/africa_sorted.csv")

# Make JSON with one record per country and outcome (one per bubble)

In [323]:
# Distinct countries left in the aggregated table.
countries = africa_sorted['country'].unique().tolist()

In [324]:
# Distinct report dates in the aggregated table, in order of first appearance.
dates = list(africa_sorted.date.unique())

In [434]:
# Distinct outcomes left in the aggregated table.
outcomes = africa_sorted['outcome'].unique().tolist()

In [458]:
# Accumulator for the JSON export: a list of dicts, filled by the cells below.
myData = []

In [454]:
# Build one dict per (outcome, country) for the JSON export.
# Each dict maps every date string ('YYYY-MM-DD') to that day's summed value
# (0 when there is no record), followed by 'country', 'lat', 'long' and
# 'outcome'.
#
# Changes from the earlier version of this cell:
#   * lat/long are read from the country's own rows and the outcome from the
#     loop variable, rather than from the last matching row seen. That row
#     could belong to a previous country, or not exist at all, whenever a
#     (country, outcome) pair had no data.
#   * `value` is read by column name instead of a positional index.
#   * the frame is narrowed to the country/outcome once per pair, not once per date.
#   * myData is re-initialised here so re-running the cell does not append
#     duplicate records.
myData = []
for outs in outcomes:
    for place in countries:
        place_rows = africa_sorted[africa_sorted['country'] == place]
        combo_rows = place_rows[place_rows['outcome'] == outs]

        newDict = {}
        for day in dates:
            hits = combo_rows[combo_rows['date'] == day]
            # The first 10 characters of the date's string form are 'YYYY-MM-DD'.
            newDict[str(day)[:10]] = int(hits['value'].iloc[0]) if len(hits) else 0

        newDict['country'] = place
        newDict['lat'] = float(place_rows['lat'].iloc[0])
        newDict['long'] = float(place_rows['long'].iloc[0])
        newDict['outcome'] = str(outs)
        myData.append(newDict)

In [457]:
# Extra record for patient zero, with a single value on 2013-12-06.
# Source: https://www.livescience.com/48527-ebola-toddler-patient-zero.html
newDict = {
    'country': 'Guinea',
    'lat': 8.615048,
    'long': -10.061007,
    'outcome': "patient zero",
    '2013-12-06': 1,
}
myData.append(newDict)

In [456]:
import json

# Serialise the list of records as JSON and write it to the output file.
serialised = json.dumps(myData)
with open('data/ebolaData.txt', 'w') as outfile:
    outfile.write(serialised)

# Make dataframe for data

In [474]:
# 'YYYY-MM-DD' text form of every date (first 10 characters of its string form).
strDates = [str(stamp)[:10] for stamp in dates]

In [476]:
# Column layout of the wide table: four identifying columns, then one column per date.
columns = ['country','lat','long','outcome'] + strDates

In [490]:
# Start from an empty frame that already has the wide-table columns.
df_new = pd.DataFrame(columns = columns)

In [491]:
# Build the wide table: one row per (outcome, country) holding
# country, lat, long, outcome and then one value per date (0 when there is
# no record for that date).
#
# Changes from the earlier version of this cell:
#   * rows are collected in a list and the DataFrame is built once;
#     DataFrame.append is no longer available in current pandas and copied
#     the whole frame on every call.
#   * lat/long are read from the country's own rows and the outcome from the
#     loop variable, rather than from the last matching row seen, which
#     could belong to a previous country when a (country, outcome) pair had
#     no data.
#   * `value` is read by column name instead of a positional index.
wide_rows = []
for outs in outcomes:
    for place in countries:
        place_rows = africa_sorted[africa_sorted['country'] == place]
        combo_rows = place_rows[place_rows['outcome'] == outs]

        newList = [
            place,
            float(place_rows['lat'].iloc[0]),
            float(place_rows['long'].iloc[0]),
            str(outs),
        ]
        for day in dates:
            hits = combo_rows[combo_rows['date'] == day]
            newList.append(int(hits['value'].iloc[0]) if len(hits) else 0)
        wide_rows.append(newList)

df_new = pd.DataFrame(wide_rows, columns=columns)
        

In [492]:
# Display the assembled wide table.
df_new

Unnamed: 0,country,lat,long,outcome,2014-08-29,2014-09-05,2014-09-08,2014-09-12,2014-09-16,2014-09-18,...,2015-12-09,2015-12-10,2015-12-11,2015-12-15,2015-12-16,2015-12-17,2015-12-22,2015-12-23,2015-12-29,2016-03-23
0,Guinea,9.93543,-9.695052,cases,623,756,815,829,905,912,...,3804,3804,3804,3804,3804,3804,3804,3804,3804,3804
1,Liberia,6.426983,-9.429671,cases,996,1502,1603,1628,1868,2045,...,5030,5030,5030,5030,5030,5030,5030,5030,5030,5030
2,Mali,17.570332,-3.99627,cases,0,0,0,0,0,0,...,8,8,8,8,8,8,8,8,8,8
3,Nigeria,9.081746,8.675196,cases,16,19,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
4,Senegal,14.49632,-14.452312,cases,0,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
5,Sierra Leone,8.460466,-11.779898,cases,972,1183,1271,1324,1501,1550,...,8991,8991,8991,8991,8991,8991,8991,8991,8991,8991
6,Guinea,9.93543,-9.695052,death,428,514,551,553,591,596,...,2536,2536,2536,2536,2536,2536,2536,2536,2536,2536
7,Liberia,6.426983,-9.429671,death,526,832,928,926,1035,1149,...,0,0,0,0,0,0,0,0,0,0
8,Mali,17.570332,-3.99627,death,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Nigeria,9.081746,8.675196,death,7,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8


In [493]:
# Save the wide table to disk (the row index is written too, since `index` is left at its default).
df_new.to_csv("data/africa_d3.csv")