In [35]:
# Import libraries

import pandas as pd
import numpy as np
from tabulate import tabulate

In [36]:
# Load the scraped data from CovidOutput.csv (path relative to the working directory) into a DataFrame.
# No dtypes are passed, so pandas infers each column's type from the file contents.
df = pd.read_csv('CovidOutput.csv')

In [37]:
# Preview the first five rows to check that the file loaded as expected
df.head(n=5)

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/ 1M pop,Continent
0,North America,689454,810,30455,78.0,60735,598264,14599,,,,,North America
1,Europe,982552,11795,89531,1212.0,268100,624921,28629,,,,,Europe
2,Asia,338412,5215,12257,189.0,163620,162535,6407,,,,,Asia
3,South America,63764,462,2804,7.0,22427,38533,1218,,,,,South America
4,Oceania,7961,37,72,,4518,3371,71,,,,,Australia/Oceania


In [38]:
# Is there at least one missing value anywhere in the frame?
# (column-wise any(), then any() across the per-column results)
df.isna().any().any()

True

In [39]:
# Summarise the column dtypes and the non-null count of every column.
# The `null_counts` keyword was deprecated in pandas 1.2 and removed in 2.0, so
# passing it raises TypeError on current kernels. For a frame this small,
# info() reports the non-null counts by default, so a plain call behaves the
# same on old and new pandas versions.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228 entries, 0 to 227
Data columns (total 13 columns):
Country,Other       227 non-null object
TotalCases          228 non-null object
NewCases            94 non-null object
TotalDeaths         180 non-null object
NewDeaths           55 non-null object
TotalRecovered      205 non-null object
ActiveCases         228 non-null object
Serious,Critical    148 non-null object
Tot Cases/1M pop    212 non-null object
Deaths/1M pop       164 non-null object
TotalTests          162 non-null object
Tests/
1M pop       162 non-null object
Continent           224 non-null object
dtypes: object(13)
memory usage: 23.2+ KB


In [40]:
# Confirm the kind of object that read_csv returned
type(df)

pandas.core.frame.DataFrame

In [41]:
# (number of rows, number of columns) of the raw frame
df.shape

(228, 13)

In [42]:
# Keep only the columns needed for the analysis and give them readable names.
#
# .copy() makes new_df an independent frame rather than a slice of df, so the
# column assignments made on new_df in later cells cannot trigger
# SettingWithCopyWarning. rename() is chained (returning a new frame) instead
# of mutating with inplace=True, which keeps this cell safe to re-run.
#
# NOTE(review): the row slice 1:212 is label-based and inclusive. It skips
# row 0 ("North America") but keeps the other aggregate rows such as "Europe"
# and "Asia" -- confirm that this is the intended selection.

new_df = (
    df.loc[1:212, ['Country,Other', 'TotalCases', 'TotalRecovered', 'TotalDeaths', 'Continent']]
    .copy()
    .rename(columns={
        'Country,Other': 'Country',
        'TotalCases': 'Total Cases',
        'TotalRecovered': 'Total Recovered',
        'TotalDeaths': 'Total Deaths',
    })
)
new_df.head()

Unnamed: 0,Country,Total Cases,Total Recovered,Total Deaths,Continent
1,Europe,982552,268100,89531,Europe
2,Asia,338412,163620,12257,Asia
3,South America,63764,22427,2804,South America
4,Oceania,7961,4518,72,Australia/Oceania
5,Africa,18106,3889,915,Africa


In [43]:
# Print summary statistics for the selected columns as plain text
print(new_df.describe())

       Country Total Cases Total Recovered Total Deaths Continent
count      211         212             193          170       209
unique     211         180             142           90         7
top      Macao          16               2            1    Africa
freq         1           5               8           17        56


In [44]:
# Count the missing values in each column of the new dataframe.
# Only the last expression of a cell is displayed, so a single call is enough.

new_df.isna().sum()

Country             1
Total Cases         0
Total Recovered    19
Total Deaths       42
Continent           3
dtype: int64

In [45]:
# Treat a missing recovered / death figure as zero

count_cols = ['Total Recovered', 'Total Deaths']
new_df[count_cols] = new_df[count_cols].fillna(0)

In [46]:
# Look at the last five rows after the fill
new_df.tail(n=5)

Unnamed: 0,Country,Total Cases,Total Recovered,Total Deaths,Continent
208,Western Sahara,6,0,0,Africa
209,Burundi,5,0,1,Africa
210,Bhutan,5,2,0,Asia
211,Sao Tome and Principe,4,0,0,Africa
212,South Sudan,4,0,0,Africa


In [47]:
# Discard every row that lacks a country name or a continent
# (how='any' is dropna's default, so naming the subset is sufficient)

new_df = new_df.dropna(subset=['Country', 'Continent'])

In [49]:
# Last five rows after the drop; the index labels have not been renumbered yet
new_df.tail(n=5)

Unnamed: 0,Country,Total Cases,Total Recovered,Total Deaths,Continent
208,Western Sahara,6,0,0,Africa
209,Burundi,5,0,1,Africa
210,Bhutan,5,2,0,Asia
211,Sao Tome and Principe,4,0,0,Africa
212,South Sudan,4,0,0,Africa


In [53]:
# Renumber the rows from 0 now that some have been removed;
# drop=True discards the old labels instead of keeping them as a column

new_df = new_df.reset_index(drop=True)
new_df.tail(n=5)

Unnamed: 0,Country,Total Cases,Total Recovered,Total Deaths,Continent
204,Western Sahara,6,0,0,Africa
205,Burundi,5,0,1,Africa
206,Bhutan,5,2,0,Asia
207,Sao Tome and Principe,4,0,0,Africa
208,South Sudan,4,0,0,Africa


In [54]:
# Re-check for null values: every column's non-null count should now equal
# the number of rows.
# The `null_counts` keyword was deprecated in pandas 1.2 and removed in 2.0, so
# passing it raises TypeError on current kernels. For a frame this small,
# info() reports the non-null counts by default on every pandas version.

new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 5 columns):
Country            209 non-null object
Total Cases        209 non-null object
Total Recovered    209 non-null object
Total Deaths       209 non-null object
Continent          209 non-null object
dtypes: object(5)
memory usage: 8.2+ KB


In [55]:
# Tabulate the data
# Prints the whole of new_df as a text table in tabulate's 'psql' format;
# headers='keys' takes the header row from the frame's column names.

print(tabulate(new_df, headers='keys', tablefmt='psql'))

+-----+------------------------+---------------+-------------------+----------------+-------------------+
|     | Country                | Total Cases   | Total Recovered   | Total Deaths   | Continent         |
|-----+------------------------+---------------+-------------------+----------------+-------------------|
|   0 | Europe                 | 982,552       | 268,100           | 89,531         | Europe            |
|   1 | Asia                   | 338,412       | 163,620           | 12,257         | Asia              |
|   2 | South America          | 63,764        | 22,427            | 2,804          | South America     |
|   3 | Oceania                | 7,961         | 4,518             | 72             | Australia/Oceania |
|   4 | Africa                 | 18,106        | 3,889             | 915            | Africa            |
|   5 | World                  | 2,100,970     | 523,933           | 136,048        | All               |
|   6 | USA                    | 644,417      