In [14]:
import pandas as pd
import numpy as np

In [15]:
### Load the OWID COVID-19 dataset from the CSV file in the working directory.
df = pd.read_csv("owid-covid-data.csv")

### The `date` column is read as text; convert it to real datetimes.
### pd.to_datetime raises on values it cannot parse, so bad dates fail loudly here.
df["date"] = pd.to_datetime(df["date"])

### Preview the result. A cell only renders its last expression, so a single
### preview at the end is enough.
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [16]:
### Build the working frame: one row filter plus the columns used in the analysis.
###
### The row filter compares `continent` with `location`. The OWID aggregate rows
### (e.g. iso_code OWID_AFR, location "Africa") do not have continent == location:
### their continent is *missing*, and NaN != "Africa" evaluates to True. Those
### rows are therefore still part of `countries_df` after this cell; a later cell
### picks them out again through `continent == 0` once NaN has been replaced by 0.
### NOTE(review): to keep real countries only, the filter would have to be
### df["continent"].notna(). It is left unchanged here because the continent-level
### cells further down read their rows from `countries_df`.
###
### Rows and columns are selected in a single .loc call and copied, so that
### `countries_df` is an independent frame that later cells can modify without
### pandas treating it as a slice of `df`.
countries_df = df.loc[
    df["continent"] != df["location"],
    [
        "iso_code",
        "continent",
        "location",
        "date",
        "new_cases",
        "new_deaths",
        "total_cases",
        "total_deaths",
        "reproduction_rate",
        "hosp_patients",
        "new_tests",
        "positive_rate",
        "new_vaccinations",
        "population",
        "median_age",
        "hospital_beds_per_thousand",
    ],
].copy()
countries_df.head()

Unnamed: 0,iso_code,continent,location,date,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,hosp_patients,new_tests,positive_rate,new_vaccinations,population,median_age,hospital_beds_per_thousand
0,AFG,Asia,Afghanistan,2020-02-24,5.0,,5.0,,,,,,,39835428.0,18.6,0.5
1,AFG,Asia,Afghanistan,2020-02-25,0.0,,5.0,,,,,,,39835428.0,18.6,0.5
2,AFG,Asia,Afghanistan,2020-02-26,0.0,,5.0,,,,,,,39835428.0,18.6,0.5
3,AFG,Asia,Afghanistan,2020-02-27,0.0,,5.0,,,,,,,39835428.0,18.6,0.5
4,AFG,Asia,Afghanistan,2020-02-28,0.0,,5.0,,,,,,,39835428.0,18.6,0.5


In [17]:
### Rows with missing values are kept rather than dropped: every NaN becomes 0.
### This applies to all columns, including the text column `continent`, which is
### why rows without a continent show up with continent == 0 from here on.
### Caveat: in the numeric columns a filled-in 0 can no longer be told apart from
### a genuine 0 (e.g. median_age or hospital_beds_per_thousand of 0.0).
###
### The result is re-assigned instead of using inplace=True: `countries_df` was
### derived from `df`, and mutating such a frame in place can trigger pandas'
### SettingWithCopyWarning. The assignment form gives the same values.
countries_df = countries_df.replace(np.nan, 0)
countries_df.head()



Unnamed: 0,iso_code,continent,location,date,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,hosp_patients,new_tests,positive_rate,new_vaccinations,population,median_age,hospital_beds_per_thousand
0,AFG,Asia,Afghanistan,2020-02-24,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
1,AFG,Asia,Afghanistan,2020-02-25,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
2,AFG,Asia,Afghanistan,2020-02-26,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
3,AFG,Asia,Afghanistan,2020-02-27,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
4,AFG,Asia,Afghanistan,2020-02-28,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5


In [20]:
# Export the clean country-level data.
# `countries_df` still contains the aggregate rows that have no continent of
# their own (continent is NaN, or 0 once NaN has been replaced), for example
# OWID_AFR / "Africa". Leave those rows out of this file so that it really holds
# per-country data only. The frame itself is not modified, because the
# continent-level cells below still need those rows.
has_continent = countries_df["continent"].notna() & countries_df["continent"].ne(0)
countries_df.loc[has_continent].to_csv("cleaned_data.csv", index=False)

In [21]:
# The Tableau visualisation is built per continent rather than per country.
# The dataset already provides continent-level rows: for these, `location` holds
# the continent name and `continent` itself is empty, which reads as 0 after the
# NaN -> 0 replacement done earlier.
# Keep only the rows whose `continent` is 0.
# (The variable keeps the name `countries_df`, although from this point on it no
# longer holds per-country rows.)

countries_df = countries_df.loc[countries_df["continent"].eq(0)]
countries_df.head()

Unnamed: 0,iso_code,continent,location,date,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,hosp_patients,new_tests,positive_rate,new_vaccinations,population,median_age,hospital_beds_per_thousand
686,OWID_AFR,0,Africa,2020-02-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
687,OWID_AFR,0,Africa,2020-02-14,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
688,OWID_AFR,0,Africa,2020-02-15,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
689,OWID_AFR,0,Africa,2020-02-16,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
690,OWID_AFR,0,Africa,2020-02-17,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0


In [34]:
### Continents to analyse. The Americas appear as two separate entries,
### "North America" and "South America".
continents = (
    "Africa",
    "Asia",
    "Europe",
    "North America",
    "Oceania",
    "South America",
)
### Keep only the rows whose location is one of these six names; any other row
### that passed the previous filter is dropped.
continents_df = countries_df.loc[countries_df["location"].isin(continents)]
continents_df.head()

Unnamed: 0,iso_code,continent,location,date,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,hosp_patients,new_tests,positive_rate,new_vaccinations,population,median_age,hospital_beds_per_thousand
686,OWID_AFR,0,Africa,2020-02-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
687,OWID_AFR,0,Africa,2020-02-14,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
688,OWID_AFR,0,Africa,2020-02-15,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
689,OWID_AFR,0,Africa,2020-02-16,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
690,OWID_AFR,0,Africa,2020-02-17,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0


In [36]:
# Write the continent-level frame to a CSV file, without the index column.
continents_df.to_csv(path_or_buf="continent_cleaned_data.csv", index=False)