# Setup

In [149]:
from __future__ import print_function, division
import pandas as pd
from datetime import datetime

# Data import

In [221]:
df = pd.read_csv("data/ebola_data.csv")

In [222]:
df.sample(3)

Unnamed: 0,Indicator,Country,Date,value
3345,Cumulative number of confirmed Ebola cases,United States of America,2015-01-22,4.0
7927,Cumulative number of suspected Ebola cases,Spain,2015-05-22,0.0
15569,Cumulative number of probable Ebola deaths,Sierra Leone,2015-10-26,208.0


# Cleaning

## Split up Indicator

In [223]:
outcome = []
for i in range(len(df.value)):
    if 'CFR' in df.Indicator[i]:
        outcome.append('CFR')
    elif 'deaths' in df.Indicator[i]:
        outcome.append('death')
    elif 'cases' in df.Indicator[i]:
        outcome.append('cases')
    else:
        print(check)

In [224]:
status = []
for i in range(len(df.value)):
    if 'of confirmed Ebola' in df.Indicator[i]:
        status.append('confirmed')
    elif 'of probable Ebola' in df.Indicator[i]:
        status.append('probable')
    elif 'of confirmed, probable and suspected Ebola' in df.Indicator[i]:
        status.append('all')
    else:
        status.append('suspected')

In [225]:
days = []
for i in range(len(df.value)):
    if '21 days' in df.Indicator[i]:
        days.append(21)
    elif '7 days' in df.Indicator[i]:
        days.append(7)
    else:
        days.append(0)

In [226]:
df['outcome'] = outcome
df['status'] = status
df['days'] = days

In [227]:
df = df.drop('Indicator',axis=1)

In [228]:
df.sample(3)

Unnamed: 0,Country,Date,value,outcome,status,days
3825,Sierra Leone,2015-02-03,3276.0,death,all,0
4233,Spain,2015-02-12,1.0,cases,all,0
8605,Senegal,2015-06-05,0.0,cases,suspected,0


## Date

In [229]:
date = []
for i in range(len(df.value)):
    datetime_object = datetime.strptime(df.Date[i], '%Y-%m-%d')
    date.append(datetime_object)

In [230]:
df['date'] = date

In [231]:
df = df.drop('Date',axis=1)

In [232]:
df.sample(3)

Unnamed: 0,Country,value,outcome,status,days,date
5113,Spain,0.0,cases,suspected,0,2015-03-06
514,Liberia,472.0,death,probable,0,2014-09-16
13016,Guinea,3792.0,cases,all,0,2015-09-01


## Value

In [233]:
df['value'] = df['value'].astype('int')

In [234]:
df.sample(3)

Unnamed: 0,Country,value,outcome,status,days,date
9161,United States of America,0,cases,suspected,0,2015-06-17
7241,United Kingdom,1,cases,all,0,2015-05-08
55,Liberia,694,death,all,0,2014-08-29


## Country

In [235]:
df.rename(columns={'Country': 'country'}, inplace=True)

In [236]:
df.sample(3)

Unnamed: 0,country,value,outcome,status,days,date
1680,Spain,0,death,probable,0,2014-11-19
11287,United Kingdom,1,cases,all,0,2015-07-29
5625,United States of America,0,cases,suspected,0,2015-03-20


# Africa countries

In [237]:
countries = list(df.country.unique())

In [238]:
africa_ebola = ['Guinea',
 'Liberia',
 'Sierra Leone',
 'Nigeria',
 'Senegal',
 'Mali',
 'Liberia 2',
 'Guinea 2']

In [239]:
africa_lat_long = {'Guinea':[9.935430, -9.695052],
 'Liberia':[6.426983, -9.429671],
 'Sierra Leone':[8.460466, -11.779898],
 'Nigeria':[9.081746, 8.675196],
 'Senegal':[14.496320, -14.452312],
 'Mali':[17.570332, -3.996270],
 'Liberia 2':[6.426983, -9.429671],
 'Guinea 2':[9.935430, -9.695052]}

In [240]:
africa_df = df[df['country'].isin(africa_ebola)]

In [241]:
africa_df.sample(3)

Unnamed: 0,country,value,outcome,status,days,date
15969,Senegal,0,death,confirmed,0,2015-11-05
6990,Mali,0,cases,suspected,0,2015-05-04
14509,Senegal,1,cases,confirmed,0,2015-09-29


## Deal with Liberia 2, Guinea 2 -- Drop them

In [242]:
#df["A"][(df["B"] > 50) & (df["C"] == 900)]
africa_df[(africa_df['country']=="Liberia 2") & (africa_df['outcome']== "cases")]['value'].sum()

1470

In [243]:
# drop Liberia 2 and Guinea 2
africa_df = africa_df[(africa_df['country']!="Liberia 2") & (africa_df['country']!="Guinea 2")]

In [244]:
africa_df = africa_df.reset_index(drop=True)

## To Lat Long

In [245]:
lat = []
long = []
for i in range(len(africa_df.value)):
    lat.append(africa_lat_long[africa_df.country[i]][0])
    long.append(africa_lat_long[africa_df.country[i]][1])

In [246]:
len(lat)

11368

In [247]:
africa_df['lat'] = lat
africa_df['long'] = long

In [248]:
africa_df.sample(10)

Unnamed: 0,country,value,outcome,status,days,date,lat,long
6379,Nigeria,19,cases,confirmed,0,2015-06-12,9.081746,8.675196
7160,Guinea,2055,death,confirmed,0,2015-07-09,9.93543,-9.695052
4835,Senegal,0,death,all,0,2015-04-21,14.49632,-14.452312
3527,Nigeria,7,death,confirmed,0,2015-02-19,9.081746,8.675196
4799,Guinea,2358,death,all,0,2015-04-21,9.93543,-9.695052
1935,Nigeria,7,death,confirmed,0,2014-12-23,9.081746,8.675196
5513,Guinea,11,cases,suspected,0,2015-05-15,9.93543,-9.695052
3786,Senegal,0,death,all,0,2015-03-02,14.49632,-14.452312
5418,Senegal,0,death,confirmed,0,2015-05-12,14.49632,-14.452312
1959,Nigeria,19,cases,confirmed,0,2014-12-24,9.081746,8.675196


# What are we counting?

Confirmed and probable for cases and death - 2 data points per country/date

In [249]:
africa_df['status'].unique()

array(['confirmed', 'probable', 'suspected', 'all'], dtype=object)

In [250]:
africa_df = africa_df[(africa_df['status']=="confirmed") | (africa_df['status']=="probable")]

In [251]:
africa_df = africa_df.reset_index(drop=True)

In [252]:
africa_df.sample(10)

Unnamed: 0,country,value,outcome,status,days,date,lat,long
4292,Nigeria,1,cases,probable,0,2015-09-01,9.081746,8.675196
4979,Guinea,453,cases,probable,0,2015-10-23,9.93543,-9.695052
1620,Sierra Leone,8138,cases,confirmed,0,2015-02-12,8.460466,-11.779898
2718,Nigeria,7,death,confirmed,0,2015-05-15,9.081746,8.675196
1531,Guinea,347,death,probable,0,2015-02-05,9.93543,-9.695052
4150,Nigeria,1,cases,probable,0,2015-08-21,9.081746,8.675196
12,Sierra Leone,308,cases,confirmed,21,2014-08-29,8.460466,-11.779898
2047,Guinea,1829,death,confirmed,0,2015-03-18,9.93543,-9.695052
5128,Nigeria,1,death,probable,0,2015-11-06,9.081746,8.675196
1405,Guinea,1578,death,confirmed,0,2015-01-28,9.93543,-9.695052


## split up days series

In [253]:
africa_df['days'].unique()

array([ 0, 21,  7])

In [254]:
#split up to different data frames

In [255]:
africa_df_0 = africa_df[(africa_df['days']==0)]
africa_df_0.shape

(5187, 8)

In [256]:
africa_df_7 = africa_df[(africa_df['days']==7)]
africa_df_7.shape

(24, 8)

In [257]:
africa_df_21 = africa_df[(africa_df['days']==21)]
africa_df_21.shape

(350, 8)

## sum up confirmed and probable

In [258]:
africa_df_0_sorting = africa_df_0[(africa_df_0['outcome']!='CFR')]

In [259]:
africa_df_0_sorting = africa_df_0_sorting.drop('days',axis=1)

In [260]:
africa_df_0_sorting.head(6)

Unnamed: 0,country,value,outcome,status,date,lat,long
0,Guinea,482,cases,confirmed,2014-08-29,9.93543,-9.695052
1,Guinea,141,cases,probable,2014-08-29,9.93543,-9.695052
2,Liberia,322,cases,confirmed,2014-08-29,6.426983,-9.429671
3,Liberia,674,cases,probable,2014-08-29,6.426983,-9.429671
4,Sierra Leone,935,cases,confirmed,2014-08-29,8.460466,-11.779898
5,Sierra Leone,37,cases,probable,2014-08-29,8.460466,-11.779898


In [261]:
africa_sorted = africa_df_0_sorting.groupby(['country','lat','long','outcome','date']).sum()

In [262]:
africa_sorted = africa_sorted.reset_index()

In [263]:
africa_sorted.to_csv("data/africa_sorted.csv")

# Add patient zero stuff

In [264]:
africa_sorted.sample(5)

Unnamed: 0,country,lat,long,outcome,date,value
394,Guinea,9.93543,-9.695052,death,2015-06-10,2437
516,Liberia,6.426983,-9.429671,cases,2014-08-29,996
2266,Sierra Leone,8.460466,-11.779898,cases,2015-09-10,8989
2310,Sierra Leone,8.460466,-11.779898,cases,2015-12-02,8991
59,Guinea,9.93543,-9.695052,cases,2015-02-02,2935


In [265]:
cols = list(africa_sorted.columns)

In [266]:
datetime_object = datetime.strptime("2013-12-06", '%Y-%m-%d')
pz = ["Meliandou", 8.616038, -10.061179, "death", datetime_object, 1]
patient_zero = pd.DataFrame([pz], columns=cols)
start = patient_zero

In [267]:
datetime_object = datetime.strptime("2013-12-13", '%Y-%m-%d')
pz = ["Meliandou", 8.616038, -10.061179, "death", datetime_object, 2]
patient_zero = pd.DataFrame([pz], columns=cols)
start = start.append(patient_zero)

In [268]:
datetime_object = datetime.strptime("2013-12-29", '%Y-%m-%d')
pz = ["Meliandou", 8.616038, -10.061179, "death", datetime_object, 3]
patient_zero = pd.DataFrame([pz], columns=cols)
start = start.append(patient_zero)

In [269]:
datetime_object = datetime.strptime("2014-01-01", '%Y-%m-%d')
pz = ["Meliandou", 8.616038, -10.061179, "death", datetime_object, 4]
patient_zero = pd.DataFrame([pz], columns=cols)
start = start.append(patient_zero)

In [270]:
datetime_object = datetime.strptime("2014-02-02", '%Y-%m-%d')
pz = ["Meliandou", 8.616038, -10.061179, "death", datetime_object, 6]
patient_zero = pd.DataFrame([pz], columns=cols)
start = start.append(patient_zero)

In [271]:
datetime_object = datetime.strptime("2014-03-12", '%Y-%m-%d')
pz = ["Gbandou", 8.526113, -10.288549, "death", datetime_object, 3]
patient_zero = pd.DataFrame([pz], columns=cols)
start = start.append(patient_zero)

In [272]:
datetime_object = datetime.strptime("2014-02-11", '%Y-%m-%d')
pz = ["Dandu Pombo", 9.032877, -9.953984, "death", datetime_object, 1]
patient_zero = pd.DataFrame([pz], columns=cols)
start = start.append(patient_zero)

In [273]:
datetime_object = datetime.strptime("2014-02-28", '%Y-%m-%d')
pz = ["Dandu Pombo", 9.032877, -9.953984, "death", datetime_object, 4]
patient_zero = pd.DataFrame([pz], columns=cols)
start = start.append(patient_zero)

In [274]:
datetime_object = datetime.strptime("2014-03-31", '%Y-%m-%d')
pz = ["Dandu Pombo", 9.032877, -9.953984, "death", datetime_object, 6]
patient_zero = pd.DataFrame([pz], columns=cols)
start = start.append(patient_zero)

In [275]:
datetime_object = datetime.strptime("2014-01-26", '%Y-%m-%d')
pz = ["Dawa", 9.032877, -9.953984, "death", datetime_object, 1]
patient_zero = pd.DataFrame([pz], columns=cols)
start = start.append(patient_zero)

In [276]:
datetime_object = datetime.strptime("2014-02-11", '%Y-%m-%d')
pz = ["Dawa", 9.032877, -9.953984, "death", datetime_object, 3]
patient_zero = pd.DataFrame([pz], columns=cols)
start = start.append(patient_zero)

In [277]:
datetime_object = datetime.strptime("2014-03-27", '%Y-%m-%d')
pz = ["Dawa", 9.032877, -9.953984, "death", datetime_object, 8]
patient_zero = pd.DataFrame([pz], columns=cols)
start = start.append(patient_zero)

In [278]:
start = start.append(africa_sorted)

In [279]:
start = start.reset_index(drop=True)

In [280]:
start.sample(5)

Unnamed: 0,country,lat,long,outcome,date,value
1377,Nigeria,9.081746,8.675196,death,2015-02-17,8
624,Liberia,6.426983,-9.429671,cases,2015-04-14,5030
338,Guinea,9.93543,-9.695052,death,2015-02-17,2057
896,Mali,17.570332,-3.99627,cases,2015-04-28,8
48,Guinea,9.93543,-9.695052,cases,2014-12-30,2672


In [292]:
africa_sorted = start

# Make JSON sorted by each country, each bubble

In [299]:
countries = list(africa_sorted.country.unique())

In [300]:
dates = list(africa_sorted.date.unique())

In [301]:
outcomes = list(africa_sorted.outcome.unique())

In [302]:
myData = []

In [285]:
for outs in outcomes:
    for place in countries:
        newDict = {}
        for i in range(len(dates)):
            newValue = africa_sorted[(africa_sorted['country']==place)&(africa_sorted['outcome']==outs)
                      &(africa_sorted['date']==dates[i])]
            if newValue.shape[0] == 0:
                newValue = 0
            else:
                store = newValue
                newValue = int(newValue.iloc[0,5])
            newDate = str(dates[i])[:10]
            newDict[newDate] = newValue
        newDict['country'] = place
        newDict['lat'] = float(store.iloc[0,1])
        newDict['long'] = float(store.iloc[0,2])
        newDict['outcome'] = str(store.iloc[0,3])
        myData.append(newDict)

In [457]:
# patient zero
# https://www.livescience.com/48527-ebola-toddler-patient-zero.html
newDict = {}
newDict['country'] = 'Guinea'
newDict['lat'] = 8.615048
newDict['long'] = -10.061007
newDict['outcome'] = "patient zero"
newDict['2013-12-06'] = 1
myData.append(newDict)

In [456]:
import json
with open('data/ebolaData.txt', 'w') as outfile:
    json.dump(myData, outfile)

# Make dataframe for data

In [303]:
strDates = []
for i in range(len(dates)):
    a = str(dates[i])[:10]
    strDates.append(a)

In [304]:
columns = ['country','lat','long','outcome'] + strDates

In [305]:
df_new = pd.DataFrame(columns = columns)

In [306]:
for outs in outcomes:
    for place in countries:
        newList = []
        newList1 = []
        for i in range(len(dates)):
            newValue = africa_sorted[(africa_sorted['country']==place)&(africa_sorted['outcome']==outs)
                      &(africa_sorted['date']==dates[i])]
            if newValue.shape[0] == 0:
                newValue = 0
            else:
                store = newValue
                newValue = int(newValue.iloc[0,5])
            newList.append(newValue)
        newList1.append(place)
        newList1.append(float(store.iloc[0,1]))
        newList1.append(float(store.iloc[0,2]))
        newList1.append(str(store.iloc[0,3]))
        newList = newList1 + newList
        
        df_new = df_new.append(pd.Series(newList, index=columns), ignore_index=True)
        

In [307]:
df_new.describe()

Unnamed: 0,lat,long
count,20.0,20.0
mean,10.049616,-8.436665
std,3.263929,6.408536
min,6.426983,-14.452312
25%,8.460466,-11.779898
50%,8.824457,-10.007582
75%,9.93543,-9.429671
max,17.570332,8.675196


In [308]:
df_new

Unnamed: 0,country,lat,long,outcome,2013-12-06,2013-12-13,2013-12-29,2014-01-01,2014-02-02,2014-03-12,...,2015-12-09,2015-12-10,2015-12-11,2015-12-15,2015-12-16,2015-12-17,2015-12-22,2015-12-23,2015-12-29,2016-03-23
0,Meliandou,8.616038,-10.061179,death,1,2,3,4,6,0,...,0,0,0,0,0,0,0,0,0,0
1,Gbandou,8.526113,-10.288549,death,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
2,Dandu Pombo,9.032877,-9.953984,death,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Dawa,9.032877,-9.953984,death,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Guinea,9.93543,-9.695052,death,0,0,0,0,0,0,...,2536,2536,2536,2536,2536,2536,2536,2536,2536,2536
5,Liberia,6.426983,-9.429671,death,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Mali,17.570332,-3.99627,death,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Nigeria,9.081746,8.675196,death,0,0,0,0,0,0,...,8,8,8,8,8,8,8,8,8,8
8,Senegal,14.49632,-14.452312,death,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Sierra Leone,8.460466,-11.779898,death,0,0,0,0,0,0,...,3797,3797,3797,3797,3797,3797,3797,3797,3797,3797


In [309]:
df_new.to_csv("data/patient_zero.csv")