# COVID-19: Data Analysis and Visualization using the COVID-19 Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University

## Imports and Data

### Importing Packages

In [1]:
import os
import time
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

### Importing datasets as pandas dataframes

In [2]:
# Common prefix of the three global time-series CSV files; each file name is appended to it.
base_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
covid_19_confirmed_cases_data = f'{base_url}time_series_covid19_confirmed_global.csv'
covid_19_death_cases_data = f'{base_url}time_series_covid19_deaths_global.csv'
covid_19_recovered_cases_data = f'{base_url}time_series_covid19_recovered_global.csv'

In [3]:
# Download each CSV into its own dataframe, in the order confirmed, deaths, recovered.
global_covid_19_confirmed, global_covid_19_deaths, global_covid_19_recovered = (
    pd.read_csv(csv_url)
    for csv_url in (
        covid_19_confirmed_cases_data,
        covid_19_death_cases_data,
        covid_19_recovered_cases_data,
    )
)

Check to see if the data has been imported correctly by looking at the head of the dataframe

In [4]:
# Preview the first rows of the raw confirmed-cases frame as a sanity check on the import.
global_covid_19_confirmed.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,10/31/20,11/1/20,11/2/20,11/3/20,11/4/20,11/5/20,11/6/20,11/7/20,11/8/20,11/9/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,41425,41501,41633,41728,41814,41935,41975,42033,42092,42297
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,20875,21202,21523,21904,22300,22721,23210,23705,24206,24731
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,57942,58272,58574,58979,59527,60169,60800,61381,62051,62693
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,4756,4825,4888,4910,5045,5135,5135,5319,5383,5437
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,10805,11035,11228,11577,11813,12102,12223,12335,12433,12680


## Data Cleaning

When looking at the data, it is important to note that some rows in the .csv files are data for a particular province or state of a country or region as seen with Australia.

In [5]:
# Boolean mask selecting every row whose Country/Region is Australia.
is_australia = global_covid_19_confirmed['Country/Region'].eq('Australia')
# Keep only the masked rows; the last expression displays them.
australia = global_covid_19_confirmed.loc[is_australia]
australia

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,10/31/20,11/1/20,11/2/20,11/3/20,11/4/20,11/5/20,11/6/20,11/7/20,11/8/20,11/9/20
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,114,114,114,114,114,114,114,114,114,114
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,4425,4432,4435,4443,4445,4454,4459,4462,4469,4469
10,Northern Territory,Australia,-12.4634,130.8456,0,0,0,0,0,0,...,38,38,38,38,39,39,39,39,39,40
11,Queensland,Australia,-27.4698,153.0251,0,0,0,0,0,0,...,1172,1172,1175,1177,1177,1177,1177,1177,1177,1178
12,South Australia,Australia,-34.9285,138.6007,0,0,0,0,0,0,...,501,501,503,504,509,510,512,515,515,517
13,Tasmania,Australia,-42.8821,147.3272,0,0,0,0,0,0,...,230,230,230,230,230,230,230,230,230,230
14,Victoria,Australia,-37.8136,144.9631,0,0,0,0,1,1,...,20346,20345,20345,20345,20345,20345,20345,20345,20345,20345
15,Western Australia,Australia,-31.9505,115.8605,0,0,0,0,0,0,...,769,769,770,771,771,775,776,776,776,776


In addition, the column labels are strings, as shown by the dates being enclosed in quotes.

In [6]:
# List the column labels of the raw confirmed-cases frame.
global_covid_19_confirmed.columns

Index(['Province/State', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       ...
       '10/31/20', '11/1/20', '11/2/20', '11/3/20', '11/4/20', '11/5/20',
       '11/6/20', '11/7/20', '11/8/20', '11/9/20'],
      dtype='object', length=297)

Taking in the information, here is how I will clean the data:
* Since I want the numbers for each country, I want to add up the numbers of every province/state row that belongs to the same country. This can be done by using the `.groupby()` function followed by `.sum()`.
* Summing up the Lat and Long columns would not make sense, as latitude and longitude are coordinates rather than counts. I want to remove the two columns, which can be done by using the `.drop()` function.
* I want to turn the dates that are currently strings into datetime objects. This can be done by using `pd.to_datetime()`.

Below is the function that includes all the points listed above.

In [7]:
def clean_data(data):
    """Aggregate a raw time-series frame to one row per country/region.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Country/Region' column, optional 'Province/State',
        'Lat' and 'Long' columns, and one count column per date whose label
        is a month/day/two-digit-year string such as '1/22/20'.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by 'Country/Region', holding the per-date sums of
        all rows of that country/region, with the column labels converted to
        datetimes. The input frame is not modified.
    """
    # Drop the non-count columns *before* summing. Relying on .sum() to
    # discard the string 'Province/State' column only works on pandas
    # versions that silently drop non-numeric columns; otherwise the column
    # survives (or raises) and the date conversion below fails on its label.
    counts = data.drop(columns=['Province/State', 'Lat', 'Long'], errors='ignore')
    data = counts.groupby('Country/Region').sum()
    # Explicit format keeps the parse unambiguous (month first, 2-digit year).
    data.columns = pd.to_datetime(data.columns, format='%m/%d/%y')
    return data

In [8]:
# Replace each raw frame with its cleaned, country-level version.
# The raw frames are overwritten, so this cell is meant to run once per kernel session.
(
    global_covid_19_confirmed,
    global_covid_19_deaths,
    global_covid_19_recovered,
) = map(
    clean_data,
    (global_covid_19_confirmed, global_covid_19_deaths, global_covid_19_recovered),
)

After cleaning the data, this is how the three dataframes look.

In [9]:
# Preview the cleaned confirmed-cases frame.
global_covid_19_confirmed.head()

Unnamed: 0_level_0,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,2020-01-29,2020-01-30,2020-01-31,...,2020-10-31,2020-11-01,2020-11-02,2020-11-03,2020-11-04,2020-11-05,2020-11-06,2020-11-07,2020-11-08,2020-11-09
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,...,41425,41501,41633,41728,41814,41935,41975,42033,42092,42297
Albania,0,0,0,0,0,0,0,0,0,0,...,20875,21202,21523,21904,22300,22721,23210,23705,24206,24731
Algeria,0,0,0,0,0,0,0,0,0,0,...,57942,58272,58574,58979,59527,60169,60800,61381,62051,62693
Andorra,0,0,0,0,0,0,0,0,0,0,...,4756,4825,4888,4910,5045,5135,5135,5319,5383,5437
Angola,0,0,0,0,0,0,0,0,0,0,...,10805,11035,11228,11577,11813,12102,12223,12335,12433,12680


In [10]:
# Preview the cleaned deaths frame.
global_covid_19_deaths.head()

Unnamed: 0_level_0,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,2020-01-29,2020-01-30,2020-01-31,...,2020-10-31,2020-11-01,2020-11-02,2020-11-03,2020-11-04,2020-11-05,2020-11-06,2020-11-07,2020-11-08,2020-11-09
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,...,1536,1536,1541,1544,1548,1554,1554,1556,1558,1574
Albania,0,0,0,0,0,0,0,0,0,0,...,509,518,527,532,536,543,549,557,559,571
Algeria,0,0,0,0,0,0,0,0,0,0,...,1964,1973,1980,1980,1999,2011,2024,2036,2048,2062
Andorra,0,0,0,0,0,0,0,0,0,0,...,75,75,75,75,75,75,75,75,75,75
Angola,0,0,0,0,0,0,0,0,0,0,...,284,286,289,291,296,299,300,303,307,308


In [11]:
# Preview the cleaned recovered-cases frame.
global_covid_19_recovered.head()

Unnamed: 0_level_0,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,2020-01-29,2020-01-30,2020-01-31,...,2020-10-31,2020-11-01,2020-11-02,2020-11-03,2020-11-04,2020-11-05,2020-11-06,2020-11-07,2020-11-08,2020-11-09
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,...,34321,34326,34342,34355,34362,34440,34440,34446,34458,34721
Albania,0,0,0,0,0,0,0,0,0,0,...,11189,11246,11367,11473,11578,11696,11861,12002,12092,12203
Algeria,0,0,0,0,0,0,0,0,0,0,...,40201,40395,40577,40577,41001,41244,41510,41783,42037,42325
Andorra,0,0,0,0,0,0,0,0,0,0,...,3475,3475,3548,3627,3734,3858,3858,4043,4248,4332
Angola,0,0,0,0,0,0,0,0,0,0,...,4523,4920,5172,5230,5266,5350,5626,5647,5899,5927


By using the `.groupby()` function, the numbers of all the provinces/states in a particular country are added up. This can be shown with the data of Australia.

As seen in `2020-01-26`, the confirmed cases in `New South Wales` (3) and `Victoria` (1) are added together to get 4. 

In [12]:
# Select the Australia row of the cleaned confirmed-cases frame (one value per date).
global_covid_19_confirmed.loc['Australia']

2020-01-22        0
2020-01-23        0
2020-01-24        0
2020-01-25        0
2020-01-26        4
              ...  
2020-11-05    27644
2020-11-06    27652
2020-11-07    27658
2020-11-08    27665
2020-11-09    27669
Name: Australia, Length: 293, dtype: int64

## Data Visualization

### Preparation

Now it is time to create some visualizations of the data. First, I want to check what each dataset is indexed by, using `.index`.

In [13]:
# Show the row index of the confirmed-cases frame before transposing.
global_covid_19_confirmed.index

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Kingdom', 'Uruguay', 'Uzbekistan', 'Venezuela', 'Vietnam',
       'West Bank and Gaza', 'Western Sahara', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', name='Country/Region', length=190)

In [14]:
# Show the row index of the deaths frame before transposing.
global_covid_19_deaths.index

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Kingdom', 'Uruguay', 'Uzbekistan', 'Venezuela', 'Vietnam',
       'West Bank and Gaza', 'Western Sahara', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', name='Country/Region', length=190)

In [15]:
# Show the row index of the recovered-cases frame before transposing.
global_covid_19_recovered.index

Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',
       ...
       'United Kingdom', 'Uruguay', 'Uzbekistan', 'Venezuela', 'Vietnam',
       'West Bank and Gaza', 'Western Sahara', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', name='Country/Region', length=190)

Currently, all the data is indexed by `Country/Region`. To make producing visualizations easier, I would want to index by the date instead. I would do this by taking the transpose of the dataframes with `.transpose()`.

In [16]:
# Swap rows and columns so that dates become the index and countries the columns.
# Note: running this cell a second time would swap the frames back.
global_covid_19_confirmed = global_covid_19_confirmed.T
global_covid_19_deaths = global_covid_19_deaths.T
global_covid_19_recovered = global_covid_19_recovered.T

In [17]:
# Preview the transposed confirmed-cases frame.
global_covid_19_confirmed.head()

Country/Region,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,United Kingdom,Uruguay,Uzbekistan,Venezuela,Vietnam,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe
2020-01-22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
2020-01-24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
2020-01-25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
2020-01-26,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,2,0,0,0,0,0


In [18]:
# Preview the transposed deaths frame.
global_covid_19_deaths.head()

Country/Region,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,United Kingdom,Uruguay,Uzbekistan,Venezuela,Vietnam,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe
2020-01-22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Preview the transposed recovered-cases frame.
global_covid_19_recovered.head()

Country/Region,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,United Kingdom,Uruguay,Uzbekistan,Venezuela,Vietnam,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe
2020-01-22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


To confirm, I will list out the indexes again.

In [20]:
# Re-check the index of the confirmed-cases frame after the transpose.
global_covid_19_confirmed.index

DatetimeIndex(['2020-01-22', '2020-01-23', '2020-01-24', '2020-01-25',
               '2020-01-26', '2020-01-27', '2020-01-28', '2020-01-29',
               '2020-01-30', '2020-01-31',
               ...
               '2020-10-31', '2020-11-01', '2020-11-02', '2020-11-03',
               '2020-11-04', '2020-11-05', '2020-11-06', '2020-11-07',
               '2020-11-08', '2020-11-09'],
              dtype='datetime64[ns]', length=293, freq=None)

I will also look at the country/region names to see if they are spelt correctly.

In [21]:
# Display every country/region name as a plain Python list for manual review.
global_covid_19_confirmed.columns.tolist()

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Diamond Princess',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',

By reviewing the column names, I want to change `Korea, South` -> `South Korea`, `Taiwan*` -> `Taiwan`, and `US` -> `United States of America`. 

In [22]:
# Single mapping of dataset spellings to display names, shared by all three frames.
country_renames = {
    'Taiwan*': 'Taiwan',
    'Korea, South': 'South Korea',
    'US': 'United States of America',
}

# Rename the columns, then re-sort them alphabetically since the new names change the order.
global_covid_19_confirmed = global_covid_19_confirmed.rename(columns=country_renames).sort_index(axis=1)
global_covid_19_deaths = global_covid_19_deaths.rename(columns=country_renames).sort_index(axis=1)
global_covid_19_recovered = global_covid_19_recovered.rename(columns=country_renames).sort_index(axis=1)

In [23]:
# Country/region names (after renaming) that the plotting loop iterates over.
all_countries_regions = global_covid_19_confirmed.columns.tolist()

Now that I finished cleaning the data and manipulating the dataframes to make it easier for visualization, I will combine the dataframes into one variable.

In [24]:
# Each entry pairs a dataframe with the legend label used when plotting it.
global_covid_19_dataset = [
    (global_covid_19_confirmed, 'Confirmed Cases Reported'),
    (global_covid_19_deaths, 'Deaths Reported'),
    (global_covid_19_recovered, 'Recovered Cases Reported'),
]

### Plotting Data

#### By Country

First, I would like to prepare graphs for each country in the dataset. I will be using `matplotlib` to help. 

In [25]:
def by_country_region(country_region, dataframe, label=None):
    """Plot one country/region's time series from one of the global frames.

    Parameters
    ----------
    country_region : str
        Column of `dataframe` to plot.
    dataframe : pandas.DataFrame
        Must have the same contents as `global_covid_19_confirmed`,
        `global_covid_19_deaths` or `global_covid_19_recovered`
        (compared with `DataFrame.equals`).
    label : str, optional
        Legend label for the plotted line. When omitted, the name of the
        matched series ('Confirmed Cases', 'Deaths' or 'Recovered Cases')
        is used.

    Returns
    -------
    The object returned by `Series.plot` for the selected column.

    Raises
    ------
    AssertionError
        If `dataframe` matches none of the three global frames.
    """
    # Each known frame paired with its series name. The deaths and recovered
    # names were previously attached to the wrong frames.
    known_series = (
        (global_covid_19_confirmed, 'Confirmed Cases'),
        (global_covid_19_deaths, 'Deaths'),
        (global_covid_19_recovered, 'Recovered Cases'),
    )

    # Validate before drawing so an unknown frame does not leave a stray line
    # on the current figure.
    for known_dataframe, series in known_series:
        if dataframe.equals(known_dataframe):
            break
    else:
        raise AssertionError('No Dataframe is found')

    if label is None:
        label = series

    return dataframe[country_region].plot(label=label)

In [26]:
def save_by_country_region(output_dir="reported_cases_by_country"):
    """Save one PNG per country/region showing all series in the dataset.

    For every name in `all_countries_regions`, each (dataframe, label) pair
    in `global_covid_19_dataset` is drawn onto the same axes, the axes are
    labelled, and the figure is written to `<output_dir>/<name>.png`.

    Parameters
    ----------
    output_dir : str, optional
        Directory that receives the PNG files. It is created if it does not
        exist. Defaults to "reported_cases_by_country".
    """
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)

    for country_region in all_countries_regions:
        # Successive Series.plot calls draw on the current axes, so every
        # series of this country ends up on one chart.
        for dataframe, label in global_covid_19_dataset:
            graph = by_country_region(country_region, dataframe, label=label)
        graph.set_xlabel('Date')
        graph.set_ylabel('Cases Reported')
        graph.set_title('Trends in COVID-19 in ' + country_region)
        # The legend sits below the axes; pass it as an extra artist so the
        # tight bounding box does not crop it from the saved image.
        lgd = plt.legend(loc="lower center", bbox_to_anchor=(0.5, -0.5), fancybox=True, shadow=True)
        plt.savefig(os.path.join(output_dir, country_region + ".png"), bbox_extra_artists=(lgd,), bbox_inches='tight')
        # Close (rather than clear) the figure so no empty figure is left
        # open to be rendered after the loop; the next country starts fresh.
        plt.close()

In [27]:
# Generate and save one PNG chart per country/region (writes files to disk).
save_by_country_region()

<Figure size 432x288 with 0 Axes>