# COVID-19 – Data Analysis


In [1]:
# import Pkg

# Pkg.add("CSV")
# Pkg.add("HTTP")
# Pkg.add("Plots")
# Pkg.add("Interact")
# Pkg.add("PlotlyJS")
# Pkg.add("DataFrames")
# Pkg.add("DataStructures")



## Part 1: Visualizing the data over time

------


In [2]:
using HTTP

case_csv = HTTP.get(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
).body;


### 1. Extract the data and country names from the CSV file as we did in class

Call the country names `all_countries`


In [3]:
using CSV
using DataFrames

case_data = CSV.read(case_csv);

rename!(
  case_data,
  "Province/State" => "province",
  "Country/Region" => "country"
)

all_countries = unique(case_data.country);


In [4]:
using Plots
using DataStructures
plotly()

@show cur_counter = counter(case_data.country)

@show cur_counter = filter(
  cur_tuple -> cur_tuple[2] > 1, cur_counter
)

@show cur_counter = OrderedDict(
  reverse(sort(collect(cur_counter), by=x->x[2]))
)

bar(cur_counter, label="", title="Province Counts", ylabel="Province Count", xlabel="Country")


┌ Info: Precompiling Plots [91a5bcdd-55d7-5caf-9e0b-520d859cae80]
└ @ Base loading.jl:1260


cur_counter = counter(case_data.country) = Accumulator(Peru => 1, Indonesia => 1, Gabon => 1, North Macedonia => 1, Bangladesh => 1, Kosovo => 1, Tajikistan => 1, Ethiopia => 1, Dominican Republic => 1, Vietnam => 1, South Sudan => 1, Morocco => 1, Libya => 1, US => 1, Sierra Leone => 1, Serbia => 1, Malaysia => 1, Mali => 1, West Bank and Gaza => 1, Western Sahara => 1, Russia => 1, Mongolia => 1, Tunisia => 1, Kuwait => 1, Eswatini => 1, Cuba => 1, Czechia => 1, Yemen => 1, Georgia => 1, Costa Rica => 1, Uganda => 1, United Arab Emirates => 1, Venezuela => 1, Luxembourg => 1, Eritrea => 1, Croatia => 1, Mozambique => 1, Liechtenstein => 1, Montenegro => 1, Saint Kitts and Nevis => 1, Angola => 1, Germany => 1, Andorra => 1, Honduras => 1, Holy See => 1, Bolivia => 1, Bahrain => 1, Austria => 1, Ukraine => 1, Mauritania => 1, Saudi Arabia => 1, Malta => 1, Switzerland => 1, Kazakhstan => 1, Monaco => 1, Mexico => 1, Laos => 1, Niger => 1, Argentina => 1, Benin => 1, Barbados => 1, Tri


------

### 2. Make a `Vector` called `countries` with a subset of countries that we wish to plot 

Say China, Japan, South Korea, US, United Kingdom, France, Germany. Be careful to check how they are written in the data set


In [5]:
# note that "South Korea" => "Korea, South"

countries = sort([
  "China", "India", "Canada", "US", "United Kingdom", "Oman", "Germany"
])

country_data = case_data[map(cur_country -> cur_country in countries, case_data.country), :]


Unnamed: 0_level_0,province,country,Lat,Long,1/22/20,1/23/20,1/24/20
Unnamed: 0_level_1,String⍰,String,Float64,Float64,Int64,Int64,Int64
1,Alberta,Canada,53.9333,-116.576,0,0,0
2,British Columbia,Canada,49.2827,-123.121,0,0,0
3,Grand Princess,Canada,37.6489,-122.665,0,0,0
4,Manitoba,Canada,53.7609,-98.8139,0,0,0
5,New Brunswick,Canada,46.5653,-66.4619,0,0,0
6,Newfoundland and Labrador,Canada,53.1355,-57.6604,0,0,0
7,Nova Scotia,Canada,44.682,-63.7443,0,0,0
8,Ontario,Canada,51.2538,-85.3232,0,0,0
9,Prince Edward Island,Canada,46.5107,-63.4168,0,0,0
10,Quebec,Canada,52.9399,-73.5491,0,0,0


In [6]:
# note that France and UK include their colonies

scatter(
  country_data.Long, country_data.Lat, group=country_data.country, 
  legend=:bottomleft, xlims=(-180,+180), ylim=(-60,+60),
  xlabel="Longitude", ylabel="Latitude", title="Province Locations"
)

-----

### 3. Define a variable `num_days` by extracting the number of days of data from the dataframe.


In [7]:
using Dates

In [8]:
@show column_names = names(country_data)
@show day_columns = column_names[count.("/", map(string, column_names)) .== 2]

@show cur_dates = Date.(map(string, day_columns), DateFormat("m/d/y")) + Dates.Year(2000);
@show num_days = length(cur_dates);

column_names = names(country_data) = [:province, :country, :Lat, :Long, Symbol("1/22/20"), Symbol("1/23/20"), Symbol("1/24/20"), Symbol("1/25/20"), Symbol("1/26/20"), Symbol("1/27/20"), Symbol("1/28/20"), Symbol("1/29/20"), Symbol("1/30/20"), Symbol("1/31/20"), Symbol("2/1/20"), Symbol("2/2/20"), Symbol("2/3/20"), Symbol("2/4/20"), Symbol("2/5/20"), Symbol("2/6/20"), Symbol("2/7/20"), Symbol("2/8/20"), Symbol("2/9/20"), Symbol("2/10/20"), Symbol("2/11/20"), Symbol("2/12/20"), Symbol("2/13/20"), Symbol("2/14/20"), Symbol("2/15/20"), Symbol("2/16/20"), Symbol("2/17/20"), Symbol("2/18/20"), Symbol("2/19/20"), Symbol("2/20/20"), Symbol("2/21/20"), Symbol("2/22/20"), Symbol("2/23/20"), Symbol("2/24/20"), Symbol("2/25/20"), Symbol("2/26/20"), Symbol("2/27/20"), Symbol("2/28/20"), Symbol("2/29/20"), Symbol("3/1/20"), Symbol("3/2/20"), Symbol("3/3/20"), Symbol("3/4/20"), Symbol("3/5/20"), Symbol("3/6/20"), Symbol("3/7/20"), Symbol("3/8/20"), Symbol("3/9/20"), Symbol("3/10/20"), Symbol("3/11/20

-----

### 4. We need to accumulate the data for those places that are split up into territories. 

+ Make a zero vector of the correct length for each country, e.g. using the function `zeros`.

+ Loop through all the countries and add the corresponding data to that country's data.

+ You may use a dictionary (`Dict`), or a matrix, or a `Vector` containing `Vector`s, or a new `DataFrame` to store the data.


In [9]:
# data frame approach

@time aggregate_df = aggregate(
  groupby(
    country_data[:, [:country, day_columns...]], 
    :country
  ), 
  sum
)

# dict of lists approach

cur_dict = OrderedDict()
@time for country in countries
  cur_dict[country] = sum.(
    eachcol(country_data[country_data.country .== country, day_columns])
  )
end
@show cur_dict

# matrix approach

cur_matrix = zeros(Int, length(countries), num_days)
@time for (i, country) in enumerate(countries)
  sub_data = country_data[country_data.country .== country, day_columns]
  for cur_row in eachrow(sub_data), j ∈ num_days
      cur_matrix[i,j] += cur_row[j]
  end
end
@show cur_matrix

 32.639094 seconds (9.57 M allocations: 502.764 MiB, 2.91% gc time)
  4.471376 seconds (1.19 M allocations: 59.453 MiB)
cur_dict = OrderedDict{Any,Any}("Canada" => [0, 0, 0, 0, 1, 1, 2, 2, 2, 4, 4, 4, 4, 4, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 10, 11, 11, 13, 14, 20, 24, 27, 30, 33, 37, 49, 54, 64, 77, 79, 108, 117, 193, 198, 252, 415, 478, 657, 800, 943, 1277, 1469, 2088, 2790, 3251, 4042, 4682, 5576, 6280, 7398, 8527, 9560, 11284, 12437, 12978, 15756, 16563, 17872, 19141, 20654, 22059, 23316, 24299, 25680, 27035, 28209, 30809, 32814, 34356, 35633, 37658, 39402, 41663, 43299, 44919, 46371, 48033, 49616, 51150, 52865, 54457],"China" => [548, 643, 920, 1406, 2075, 2877, 5509, 6087, 8141, 9802, 11891, 16630, 19716, 23707, 27440, 30587, 34110, 36814, 39829, 42354, 44386, 44759, 59895, 66358, 68413, 70513, 72434, 74211, 74619, 75077, 75550, 77001, 77022, 77241, 77754, 78166, 78600, 78928, 79356, 79932, 80136, 80261, 80386, 80537, 80690, 80770, 80823, 80860, 80887, 80921

7×100 Array{Int64,2}:
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0    54457
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0    83956
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0   163009
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0    34863
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0     2348
 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  1069424
 0  0  0  0  0  0  0  0  0  0  0  0  0     0  0  0  0  0  0  0  0  0   172481

-----

### 5. Plot all countries' data on the same graph by using a `for` loop.


Then run the loop, adding in the data using `plot!`

Finally, display the plot by evaluating the plot object `p`. (Just type its name and evaluate.)


In [None]:
p = plot(legend=:topleft, title="Cumulative Cases")

for (country, cases) in cur_dict
  plot!(cur_dates, replace(cases, 0 => NaN), label=country)
end

p

-----

### 6. Now use a `log` scale on the $y$ axis. 

In order to do so you will:

+ need to convert the vectors to contain `Float64` 
+ replace any 0 values by `NaN` ("not a number") so that `Plots.jl` ignores those values


In [None]:
p = plot(legend=:bottomright, title="Log Cumulative Cases")

for (country, cases) in cur_dict
  plot!(cur_dates, replace(cases, 0 => NaN), label=country, yscale=:log10)
end

p

*Q: Is there exponential growth?*

***A: Yes there is exponential growth. After reaching 10 cases, most curves are approximately linear (or sigmoidal for South Korea)***

<br>

-----

### 7. Turn this into an interactive visualization by adding a slider corresponding to the current day

Varying the day between 1 and the total number of days for which you have data.

+ You should draw only the data up to that particular day. 
+ As you move the slider the plot should update. 
+ Fix the horizontal axis across slider changes

In [None]:
gr()

In [None]:
function custom_plot(yscale, cur_day)
  p = plot(legend=:bottomright, title="Cumulative Cases")

  for (country, cases) in cur_dict
    plot!(cur_dates[1:cur_day], replace(cases, 0 => NaN)[1:cur_day], label=country, markershape=:circle)
  end
  
#   xlims!(map(Dates.value, (cur_dates[1], cur_dates[end])))
  ylims!(1, 2*maximum(Base.Iterators.flatten(values(cur_dict))))
  
  plot!(yscale=yscale)
end

In [None]:
using Interact

@manipulate for yscale=[:log10, :identity], cur_day in 1:num_days
  custom_plot(yscale, cur_day)
end


----

## Part 2: Visualizing changes

Now let's try to reproduce the essence of the nice visualization from https://aatishb.com/covidtrends, which is a less usual point of view. 

+ Again the slider will represent a day during the epidemic.
+ The horizontal axis will show the *total* confirmed cases until the given day
+ While the vertical axis will show the *change* in confirmed cases during the past week (7 days).


In [None]:
function make_widget(cur_day, widget_dict=cur_dict, add_annotation=false)
  p = plot(
    legend=:topleft, xlabel="Total Cases", ylabel="Recent Cases (in past week)", 
    title="COVID-19 Case Trajectory ($(day_columns[cur_day]))",
    xscale=:log10, yscale=:log10
  )
  
  lims_dict = Dict(
    "min_x" => +Inf, "max_x" => -Inf,
    "min_y" => +Inf, "max_y" => -Inf,
  )
  
  for (cur_index, (country, cases)) in enumerate(deepcopy(widget_dict))
    cases = Array{AbstractFloat}(cases)
    
    recent_cases = make_recent_cases(cases)
    update_limits!(lims_dict, cases, recent_cases)

    cases = cases[1:cur_day]
    recent_cases =  recent_cases[1:cur_day]
    
    remove_small_values!(cases, recent_cases)
    isempty(cases) && continue

    plot!(cases, recent_cases, label=country, color=cur_index)
    
    add_annotation && add_annotations!(
      country, cur_index, cases[end], recent_cases[end]
    )
  end
  
#   xlims!(lims_dict["min_x"]/2, lims_dict["max_x"]*2)
  ylims!(lims_dict["min_y"]/2, lims_dict["max_y"]*2)
end


In [None]:
function make_recent_cases(cases, delay=7) 
  return cases - [ zeros(Int, delay)..., cases[1:end-delay]... ]
end

function update_limits!(lims_dict, cases, recent_cases)
  for (tmp_key, tmp_cases) in Dict( :x => cases, :y => recent_cases )
    lims_dict["max_$(tmp_key)"] = max(lims_dict["max_$(tmp_key)"], maximum(tmp_cases))

    filter_cases = filter(!iszero, tmp_cases)
    if !isempty(filter_cases)
      lims_dict["min_$(tmp_key)"] = min(lims_dict["min_$(tmp_key)"], minimum(filter_cases)) 
    end
  end
end

function remove_small_values!(cases, recent_cases, min_cases=8, min_recent_cases=4)
  del_indices = []
  
  for (cur_cases, cur_threshold) in (cases => min_cases, recent_cases => min_recent_cases)
    append!(del_indices, findall(cur_case -> cur_case <= cur_threshold, cur_cases))
  end
  
  unique!(del_indices)
  sort!(del_indices)

  deleteat!(cases, del_indices)
  deleteat!(recent_cases, del_indices)
end


In [None]:
@manipulate for cur_day in 1:num_days
  make_widget(cur_day)
end


----

### 1. Make a data set `total_cases`

Representing for each day the *total* number of confirmed cases during the whole epidemic up until that point


In [None]:
total_cases = sum.(eachcol(case_data[:, day_columns]));
# total_cases = sum.(eachcol(country_data[:, day_columns]));


----

### 2. Make a set of data `new_cases` which is the total number of cases only during the past 7 days.


In [None]:
new_cases = total_cases - [ zeros(Int, 7)..., total_cases[1:end-7]... ];
@assert make_recent_cases(total_cases) == new_cases

@assert true
# @assert false    # raises AssertionError


----

### 3. Make the visualization using a slider representing days as in the previous exercise

Plotting the total number of cases on the $x$ axis and new cases on the $y$ axis.

You need to take care that each vector being plotted has the same length.


In [None]:
all_dict = OrderedDict([
  ("All" => total_cases), 
  cur_dict...
])

@manipulate for cur_day in 1:num_days
  make_widget(cur_day, all_dict)
end


----

### 4. Add a dot for each countries current position and annotate


In [None]:
# already used inside make_widget function

function add_annotations!(country, cur_index, cases_end, recent_cases_end)
  scatter!([cases_end], [recent_cases_end], label="", color=cur_index, markersize=2.5)
  annotate!(cases_end, recent_cases_end, text(country, 9, :grey, :left))
end


In [None]:
@manipulate for cur_day in 1:num_days
  add_annotations = true
  
  make_widget(cur_day, all_dict, add_annotations)
end


----

### 5. Make a GIF version of the plot from before



In [None]:
animation = @animate for cur_day in 1:num_days
  make_widget(cur_day, [("All" => total_cases), cur_dict...], true)
end

gif(animation, fps = 6)
