In [15]:
"""
This project explores Covid-19 data from ___ and visualizes it using Plotly Express in Python. It provides
more than a dozen bar charts, line graphs, bubble charts, and scatter plots. Exploring the data helps
people understand the complex scenarios that are likely unfolding and make predictions for the future. It examines
the impact of preventative measures and other hygienic practices. It provides some arguments on which
health resources will best manage the outbreak. In closing, this project summarizes the modeling, simulation, and
analysis.

Tools and technology used in the project: 
    - Google Colab (Runtime type - GPU)

Requirements to Build the Project: 
    - Basic knowledge of Python
    - Basic understanding of graphs and charts
    - Data visualization
    - Pandas
    - Numpy
    - Matplotlib
    - Plotly Express
    - Choropleth
    - Wordcloud
"""

# STEP 1: IMPORT DEPENDENCIES AND LIBRARIES
# Import libraries for data analysis and manipulation
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px
import pandas as pd

# Import libraries for data visualization
import matplotlib
import matplotlib.pyplot as plt

# Import Plotly's offline module and initialise it for notebook output.
# NOTE(review): per the Plotly docs, connected=True loads plotly.js from a CDN rather than
# embedding it in the notebook, so rendering needs an internet connection - confirm.
import plotly.offline as py
py.init_notebook_mode(connected=True)

In [16]:
# Set Plotly's default renderer to 'iframe'
# (presumably picked up by later figure .show() calls - confirm)
pio.renderers.default = 'iframe'

In [17]:
# STEP 2: IMPORT DATASETS FOR ANALYSIS

"""
The following data is contained in the files:
1. covid.csv - contains Country/Region, Continent, Population, TotalCases, NewCases, TotalDeaths, NewDeaths,
                TotalRecovered, NewRecovered, ActiveCases, 'Serious,Critical' (a single column), Tot Cases/1M pop,
                Deaths/1M pop, TotalTests, Tests/1M pop, WHO Region, iso_alpha.
2. covid_grouped.csv - contains Date (from 2020-01-22 to 2020-07-27), Country/Region, Confirmed, Deaths, Recovered,
                Active, New cases, New deaths, New recovered, WHO Region, iso_alpha.
3. coviddeath - contains real-world examples of Covid-19 deaths and the reasons behind them.
"""

# Load the per-country snapshot from covid.csv and preview its first five rows
dataset1 = pd.read_csv(filepath_or_buffer="./data/covid.csv")
dataset1.head(n=5)

Unnamed: 0,Country/Region,Continent,Population,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region,iso_alpha
0,USA,North America,331198100.0,5032179,,162804.0,,2576668.0,,2292707.0,18296.0,15194.0,492.0,63139605.0,190640.0,Americas,USA
1,Brazil,South America,212710700.0,2917562,,98644.0,,2047660.0,,771258.0,8318.0,13716.0,464.0,13206188.0,62085.0,Americas,BRA
2,India,Asia,1381345000.0,2025409,,41638.0,,1377384.0,,606387.0,8944.0,1466.0,30.0,22149351.0,16035.0,South-EastAsia,IND
3,Russia,Europe,145940900.0,871894,,14606.0,,676357.0,,180931.0,2300.0,5974.0,100.0,29716907.0,203623.0,Europe,RUS
4,South Africa,Africa,59381570.0,538184,,9604.0,,387316.0,,141264.0,539.0,9063.0,162.0,3149807.0,53044.0,Africa,ZAF


In [18]:
# Print the dataframe's shape as a (rows, columns) tuple
print(dataset1.shape)

(209, 17)


In [19]:
# Print the dataframe's size, i.e. its total number of cells (rows x columns)
print(dataset1.size)

3553


In [20]:
# Print a concise summary of the dataset: each column's non-null count and dtype
dataset1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country/Region    209 non-null    object 
 1   Continent         208 non-null    object 
 2   Population        208 non-null    float64
 3   TotalCases        209 non-null    int64  
 4   NewCases          4 non-null      float64
 5   TotalDeaths       188 non-null    float64
 6   NewDeaths         3 non-null      float64
 7   TotalRecovered    205 non-null    float64
 8   NewRecovered      3 non-null      float64
 9   ActiveCases       205 non-null    float64
 10  Serious,Critical  122 non-null    float64
 11  Tot Cases/1M pop  208 non-null    float64
 12  Deaths/1M pop     187 non-null    float64
 13  TotalTests        191 non-null    float64
 14  Tests/1M pop      191 non-null    float64
 15  WHO Region        184 non-null    object 
 16  iso_alpha         209 non-null    object 
dt

In [21]:
# Load the per-date, per-country records from covid_grouped.csv and preview the first five rows
dataset2 = pd.read_csv(filepath_or_buffer="./data/covid_grouped.csv")
dataset2.head(n=5)

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,WHO Region,iso_alpha
0,2020-01-22,Afghanistan,0,0,0,0,0,0,0,Eastern Mediterranean,AFG
1,2020-01-22,Albania,0,0,0,0,0,0,0,Europe,ALB
2,2020-01-22,Algeria,0,0,0,0,0,0,0,Africa,DZA
3,2020-01-22,Andorra,0,0,0,0,0,0,0,Europe,AND
4,2020-01-22,Angola,0,0,0,0,0,0,0,Africa,AGO


In [22]:
# Print the dataframe's shape as a (rows, columns) tuple
print(dataset2.shape)

(35156, 11)


In [23]:
# Print the dataframe's size, i.e. its total number of cells (rows x columns)
print(dataset2.size)

386716


In [24]:
# Print a concise summary of the dataset: each column's non-null count and dtype
dataset2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35156 entries, 0 to 35155
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date            35156 non-null  object
 1   Country/Region  35156 non-null  object
 2   Confirmed       35156 non-null  int64 
 3   Deaths          35156 non-null  int64 
 4   Recovered       35156 non-null  int64 
 5   Active          35156 non-null  int64 
 6   New cases       35156 non-null  int64 
 7   New deaths      35156 non-null  int64 
 8   New recovered   35156 non-null  int64 
 9   WHO Region      35156 non-null  object
 10  iso_alpha       35156 non-null  object
dtypes: int64(7), object(4)
memory usage: 3.0+ MB


In [25]:
# STEP 3: CLEAN DATASETS
# List the column labels of the per-country dataset before cleaning it
dataset1.columns

Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases',
       'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered',
       'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
       'TotalTests', 'Tests/1M pop', 'WHO Region', 'iso_alpha'],
      dtype='object')

In [26]:
# Drop the NewCases, NewDeaths and NewRecovered columns, which are almost entirely
# empty (4, 3 and 3 non-null values out of 209 rows in the dataset1.info() output).
# Reassigning the result instead of using inplace=True, combined with
# errors="ignore", keeps this cell idempotent: re-running it no longer raises
# KeyError once the columns have already been removed.
dataset1 = dataset1.drop(columns=["NewCases", "NewDeaths", "NewRecovered"], errors="ignore")

In [27]:
# Preview five randomly chosen rows of the cleaned dataset.
# A fixed random_state keeps the preview reproducible across Restart & Run All.
dataset1.sample(5, random_state=42)

Unnamed: 0,Country/Region,Continent,Population,TotalCases,TotalDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,WHO Region,iso_alpha
180,Bermuda,North America,62254.0,157,9.0,144.0,4.0,,2522.0,145.0,26352.0,423298.0,Americas,BMU
40,Netherlands,Europe,17138756.0,56982,6153.0,,,37.0,3325.0,359.0,1079860.0,63007.0,Europe,NLD
105,Zimbabwe,Africa,14883803.0,4339,84.0,1264.0,2991.0,,292.0,6.0,140421.0,9434.0,Africa,ZWE
55,Azerbaijan,Asia,10148243.0,33247,479.0,29275.0,3493.0,66.0,3276.0,47.0,766179.0,75499.0,Europe,AZE
149,Chad,Africa,16467965.0,942,76.0,838.0,28.0,,57.0,5.0,,,Africa,TCD


In [28]:
# Render the first 15 rows of the dataset as a Plotly figure-factory table
from plotly.figure_factory import create_table

# Three-stop colour scale handed to create_table: [position, hex colour]
colorscale = [
    [0, '#4d004c'],
    [0.5, '#f2e5ff'],
    [1, '#ffffff'],
]
table = create_table(dataset1.head(n=15), colorscale=colorscale)
table.show()

In [None]:
# STEP 4: USE A BAR GRAPH TO COMPARE DATA BY ANALYZING TOTAL CASES, TOTAL DEATHS, TOTAL RECOVERED, AND TOTAL TESTS OF COUNTRIES

"""
We will visualize the bar graphs with Plotly Express. The charts are interactive: hovering the cursor over any bar
displays its label and values, so we can analyze the data through the relationships between the columns. We will limit
the view to the top 15 countries, color-code them, and make the data viewable when the cursor hovers.
"""