<a href="https://colab.research.google.com/github/bubblebolt/dads/blob/main/DADS5001/ASM3-Plotly/6610412002_Plotly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# COVID-19 Coronavirus Pandemic



*   [Total cases of COVID 19 around the world](https://www.kaggle.com/datasets/rinichristy/covid19-coronavirus-pandemic/data)
*   By Chalita Iamleelaporn ID : 6610412002




In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/bubblebolt/dads/main/DADS5001/ASM3-Plotly/COVID-19%20Coronavirus.csv')
data.head()

Unnamed: 0,Country,Other names,ISO 3166-1 alpha-3 CODE,Population,Continent,Total Cases,Total Deaths,Tot Cases//1M pop,Tot Deaths/1M pop,Death percentage
0,Afghanistan,Afghanistan,AFG,40462186,Asia,177827,7671,4395,190,4.313743
1,Albania,Albania,ALB,2872296,Europe,273870,3492,95349,1216,1.275058
2,Algeria,Algeria,DZA,45236699,Africa,265691,6874,5873,152,2.587216
3,Andorra,Andorra,AND,77481,Europe,40024,153,516565,1975,0.382271
4,Angola,Angola,AGO,34654212,Africa,99194,1900,2862,55,1.915438


## Data Explorer

In [3]:
# Data dimensions
data.shape

(225, 10)

In [4]:
# Information about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Country                  225 non-null    object 
 1   Other names              224 non-null    object 
 2   ISO 3166-1 alpha-3 CODE  225 non-null    object 
 3   Population               225 non-null    int64  
 4   Continent                225 non-null    object 
 5   Total Cases              225 non-null    int64  
 6   Total Deaths             225 non-null    int64  
 7   Tot Cases//1M pop        225 non-null    int64  
 8   Tot Deaths/1M pop        225 non-null    int64  
 9   Death percentage         225 non-null    float64
dtypes: float64(1), int64(5), object(4)
memory usage: 17.7+ KB


In [5]:
# Get a statistical summary of the numerical variables in the data
data.describe()

Unnamed: 0,Population,Total Cases,Total Deaths,Tot Cases//1M pop,Tot Deaths/1M pop,Death percentage
count,225.0,225.0,225.0,225.0,225.0,225.0
mean,35073210.0,2184781.0,27448.13,136900.373333,1096.715556,1.444125
std,139241800.0,7275938.0,96891.77,145060.340289,1195.715543,1.741728
min,805.0,1.0,0.0,9.0,0.0,0.0
25%,566557.0,24071.0,189.0,11384.0,123.0,0.511291
50%,5827911.0,163936.0,1965.0,88987.0,708.0,1.036905
75%,21905850.0,1092547.0,13660.0,223335.0,1795.0,1.977017
max,1439324000.0,81839050.0,1008222.0,696044.0,6286.0,18.151787


In [6]:
# Check if there are missing values
data.isna().sum()

Country                    0
Other names                1
ISO 3166-1 alpha-3 CODE    0
Population                 0
Continent                  0
Total Cases                0
Total Deaths               0
Tot Cases//1M pop          0
Tot Deaths/1M pop          0
Death percentage           0
dtype: int64

In [7]:
data[data['Other names'].isnull()]

Unnamed: 0,Country,Other names,ISO 3166-1 alpha-3 CODE,Population,Continent,Total Cases,Total Deaths,Tot Cases//1M pop,Tot Deaths/1M pop,Death percentage
135,Montenegro,,MNE,628205,Europe,233326,2705,371417,4306,1.159322


## Data Cleaning¶


In [8]:
df = data.copy()
df.columns

Index(['Country', 'Other names', 'ISO 3166-1 alpha-3 CODE', 'Population',
       'Continent', 'Total Cases', 'Total Deaths', 'Tot Cases//1M pop',
       'Tot Deaths/1M pop', 'Death percentage'],
      dtype='object')

In [9]:
# Convert columns names to lowercase and replace spaces with underscores
df.rename(columns = {'Tot\xa0Cases//1M pop' : 'Total Cases per 1M pop',
                    'Tot\xa0Deaths/1M pop': 'Total Deaths per 1M pop'}, inplace=True)
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns = df.columns.str.replace('/', '_')
df.columns

Index(['country', 'other_names', 'iso_3166-1_alpha-3_code', 'population',
       'continent', 'total_cases', 'total_deaths', 'total_cases_per_1m_pop',
       'total_deaths_per_1m_pop', 'death_percentage'],
      dtype='object')

## Data Analysis

In [10]:
# Total cases and deaths per continent
total_continent = df.groupby(['continent'])[['total_cases', 'total_deaths']].sum().reset_index()
total_continent.sort_values(by='total_cases', ascending=False)

Unnamed: 0,continent,total_cases,total_deaths
2,Europe,180332483,1775727
1,Asia,140957179,1405003
4,Northern America,85364770,1046062
3,Latin America and the Caribbean,67509231,1686828
0,Africa,11764207,252873
5,Oceania,5647957,9336


In [11]:
# Total cases and deaths per million population per continent
per1m_continent = df.groupby(['continent'])[['total_cases_per_1m_pop', 'total_deaths_per_1m_pop']].sum().reset_index()
per1m_continent.sort_values(by='total_cases_per_1m_pop', ascending=False,inplace=True)
per1m_continent

Unnamed: 0,continent,total_cases_per_1m_pop,total_deaths_per_1m_pop
2,Europe,14596188,110071
3,Latin America and the Caribbean,6943019,75380
1,Asia,5015344,30847
0,Africa,1782060,17843
5,Oceania,1375938,6009
4,Northern America,1090035,6611


In [12]:
# Filter for all records from Asia
Asia = df[df['continent'] == 'Asia']
Asia_per_country = Asia.groupby(['country'])[['total_cases_per_1m_pop','total_deaths_per_1m_pop']].sum()
Asia_per_country.sort_values(by = 'total_cases_per_1m_pop', ascending=False, inplace=True)
Asia_per_country.head()

Unnamed: 0_level_0,total_cases_per_1m_pop,total_deaths_per_1m_pop
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Israel,422813,1129
Georgia,414819,4215
Cyprus,359817,774
Maldives,317645,535
Bahrain,308168,815


In [13]:
#Find Top 10 Countries With Most deaths per 1 millons population
df.sort_values(by='total_deaths_per_1m_pop',ascending=False ).head(10)

Unnamed: 0,country,other_names,iso_3166-1_alpha-3_code,population,continent,total_cases,total_deaths,total_cases_per_1m_pop,total_deaths_per_1m_pop,death_percentage
158,Peru,Peru,PER,33775745,Latin America and the Caribbean,3548559,212328,105062,6286,5.983499
29,Bulgaria,Bulgaria,BGR,6856886,Europe,1140679,36568,166355,5333,3.20581
24,Bosnia and Herzegovina,Bosnia and Herzegovina,BIH,3245097,Europe,375693,15719,115773,4844,4.184001
90,Hungary,Hungary,HUN,9617409,Europe,1854198,45510,192796,4732,2.45443
149,North Macedonia,The former Yugoslav Republic of Macedonia,MKD,2083224,Europe,306670,9228,147209,4430,3.009098
135,Montenegro,,MNE,628205,Europe,233326,2705,371417,4306,1.159322
75,Georgia,Georgia,GEO,3975762,Asia,1649222,16756,414819,4215,1.015994
48,Croatia,Croatia,HRV,4060951,Europe,1102730,15601,271545,3842,1.414762
52,Czechia,Czech Republic,CZE,10743762,Europe,3830631,39720,356545,3697,1.036905
183,Slovakia,Slovakia,SVK,5464272,Europe,1725487,19417,315776,3553,1.125305


## Data Visualization¶

In [14]:
import plotly.express as px
import pandas as pd

fig = px.treemap(df,
                 path=[px.Constant("world"), 'continent', 'country'],
                 values='total_cases_per_1m_pop',
                 color='total_cases_per_1m_pop',
                 hover_data=['total_deaths_per_1m_pop'],
                 color_continuous_scale='RdBu_r',
                 title='Treemap of Total Covid 19 cases per million population',
                 color_continuous_midpoint=300000
                 )


fig.update_layout(margin=dict(t=50, l=0, r=0, b=0))

fig.show()

## Observation


*   Europe has the highest and Northern America	 has of the lowest of total Covid 19 cases per million population.
*   Peru has a significantly higher death toll compared to the other countries.
*   In Asia, Israel has the highest of total Covid 19 cases per million population
*   <p>Using "per million population" data is valuable in comparative analysis as it removes bias </p> from population size differences, offering a more accurate picture of relative conditions across countries.


