In [1]:
import pandas as pd
import numpy as np

In [2]:
### Loading the data
df = pd.read_csv("owid-covid-data.csv")
df.head()

### We also want to make sure we fix the date column.
df["date"] = pd.to_datetime(df["date"])

df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [3]:
### We noticed that there are some rows where the continent and the location are the same which deters us from
### analyzing the correct data. ISO_codes with OWID abbreviations have the data of the entire continent so we cannot use
### that

### So what we do is clean the data. 
countries_df = df.loc[df["continent"] != df["location"]]
countries_df.head()

countries_df = countries_df[["iso_code", "continent", "location", "date", "new_cases", "new_deaths", "total_cases", "total_deaths", "reproduction_rate", "hosp_patients", "new_tests", "positive_rate", "new_vaccinations", "population", "median_age", "hospital_beds_per_thousand" ]]
countries_df.head()

Unnamed: 0,iso_code,continent,location,date,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,hosp_patients,new_tests,positive_rate,new_vaccinations,population,median_age,hospital_beds_per_thousand
0,AFG,Asia,Afghanistan,2020-02-24,5.0,,5.0,,,,,,,39835428.0,18.6,0.5
1,AFG,Asia,Afghanistan,2020-02-25,0.0,,5.0,,,,,,,39835428.0,18.6,0.5
2,AFG,Asia,Afghanistan,2020-02-26,0.0,,5.0,,,,,,,39835428.0,18.6,0.5
3,AFG,Asia,Afghanistan,2020-02-27,0.0,,5.0,,,,,,,39835428.0,18.6,0.5
4,AFG,Asia,Afghanistan,2020-02-28,0.0,,5.0,,,,,,,39835428.0,18.6,0.5


In [4]:
### Selection the continent we want to Analyse (America is divided in North America and South America)
continents = "Africa", "Asia", "Europe","North America", "Oceania", "South America"
continents_df = countries_df[countries_df['location'].isin(continents)]
continents_df.head()

Unnamed: 0,iso_code,continent,location,date,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,hosp_patients,new_tests,positive_rate,new_vaccinations,population,median_age,hospital_beds_per_thousand
695,OWID_AFR,,Africa,2020-02-13,0.0,0.0,,,,,,,,1373486000.0,,
696,OWID_AFR,,Africa,2020-02-14,1.0,0.0,1.0,,,,,,,1373486000.0,,
697,OWID_AFR,,Africa,2020-02-15,0.0,0.0,1.0,,,,,,,1373486000.0,,
698,OWID_AFR,,Africa,2020-02-16,0.0,0.0,1.0,,,,,,,1373486000.0,,
699,OWID_AFR,,Africa,2020-02-17,0.0,0.0,1.0,,,,,,,1373486000.0,,


In [6]:
### In order to clean this data, we didn't want to drop the null so we decided to replace it with zero instead.
continents_df.replace(np.nan, 0, inplace = True)
continents_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


Unnamed: 0,iso_code,continent,location,date,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,hosp_patients,new_tests,positive_rate,new_vaccinations,population,median_age,hospital_beds_per_thousand
695,OWID_AFR,0,Africa,2020-02-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
696,OWID_AFR,0,Africa,2020-02-14,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
697,OWID_AFR,0,Africa,2020-02-15,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
698,OWID_AFR,0,Africa,2020-02-16,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0
699,OWID_AFR,0,Africa,2020-02-17,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1373486000.0,0.0,0.0


In [7]:
# When the 'location' is the continent then the continent column is empty so we can drop that column
continents_df = continents_df.drop(columns="continent")
continents_df

Unnamed: 0,iso_code,location,date,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,hosp_patients,new_tests,positive_rate,new_vaccinations,population,median_age,hospital_beds_per_thousand
695,OWID_AFR,Africa,2020-02-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.373486e+09,0.0,0.0
696,OWID_AFR,Africa,2020-02-14,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.373486e+09,0.0,0.0
697,OWID_AFR,Africa,2020-02-15,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.373486e+09,0.0,0.0
698,OWID_AFR,Africa,2020-02-16,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.373486e+09,0.0,0.0
699,OWID_AFR,Africa,2020-02-17,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.373486e+09,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130477,OWID_SAM,South America,2022-01-14,401816.0,1076.0,42571736.0,1197256.0,0.0,0.0,0.0,0.0,2214096.0,4.342601e+08,0.0,0.0
130478,OWID_SAM,South America,2022-01-15,251286.0,502.0,42823022.0,1197758.0,0.0,0.0,0.0,0.0,1016949.0,4.342601e+08,0.0,0.0
130479,OWID_SAM,South America,2022-01-16,224231.0,508.0,43047253.0,1198266.0,0.0,0.0,0.0,0.0,3359932.0,4.342601e+08,0.0,0.0
130480,OWID_SAM,South America,2022-01-17,252704.0,687.0,43299957.0,1198953.0,0.0,0.0,0.0,0.0,4207789.0,4.342601e+08,0.0,0.0


In [8]:
# Export the data
continents_df.to_csv("continent_cleaned_data_Mouhamadou.csv", index = False)