In [11]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression 
from matplotlib import pyplot as plt


from pathlib import Path

In [12]:
# Load the raw dataset from CSV
file_path = Path("owid-covid-data.csv")  # relative path: resolved against the kernel's working directory
df = pd.read_csv(filepath_or_buffer=file_path)
df.head()  # default n=5: preview the first five rows

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [13]:
# Keep the date/location/continent identifiers plus the case, ICU, death and
# vaccination columns listed below. `.loc` slices rows by label, so the end
# label is inclusive.
columns_of_interest = [
    'date',
    'location',
    'continent',
    'new_cases_smoothed',
    'icu_patients',
    'icu_patients_per_million',
    'new_deaths_smoothed',
    'new_vaccinations_smoothed',
    'people_fully_vaccinated_per_hundred',
    'total_boosters_per_hundred',
    'new_people_vaccinated_smoothed_per_hundred',
]
# NOTE(review): hard-coded row cutoff; presumably the last row label of the CSV
# snapshot this notebook was written against -- confirm before swapping the file.
last_row_label = 164946
selection = df.loc[:last_row_label, columns_of_interest]
selection.head()

Unnamed: 0,date,location,continent,new_cases_smoothed,icu_patients,icu_patients_per_million,new_deaths_smoothed,new_vaccinations_smoothed,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_people_vaccinated_smoothed_per_hundred
0,2020-02-24,Afghanistan,Asia,,,,,,,,
1,2020-02-25,Afghanistan,Asia,,,,,,,,
2,2020-02-26,Afghanistan,Asia,,,,,,,,
3,2020-02-27,Afghanistan,Asia,,,,,,,,
4,2020-02-28,Afghanistan,Asia,,,,,,,,


In [14]:
# Count the missing values in each selected column
selection.isna().sum()

date                                               0
location                                           0
continent                                       9878
new_cases_smoothed                              4248
icu_patients                                  141710
icu_patients_per_million                      141710
new_deaths_smoothed                            20803
new_vaccinations_smoothed                      81539
people_fully_vaccinated_per_hundred           125166
total_boosters_per_hundred                    147909
new_people_vaccinated_smoothed_per_hundred     82784
dtype: int64

In [15]:
# Drop every row that has a missing value in any of the selected columns
# (axis=0 / how='any' are the pandas defaults, spelled out for the reader)
clean_selection = selection.dropna(axis=0, how='any')

In [16]:
# Count rows that are exact duplicates of an earlier row in the cleaned frame
n_duplicates = clean_selection.duplicated().sum()
print(f'Duplicate entries: {n_duplicates}')

Duplicate entries: 0


In [17]:
# Preview the first five rows left after dropping nulls
clean_selection.head(n=5)

Unnamed: 0,date,location,continent,new_cases_smoothed,icu_patients,icu_patients_per_million,new_deaths_smoothed,new_vaccinations_smoothed,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_people_vaccinated_smoothed_per_hundred
2850,2021-11-21,Algeria,Africa,140.286,23.0,0.516,5.286,79020.0,11.88,0.03,0.07
2854,2021-11-25,Algeria,Africa,156.0,22.0,0.493,4.571,43565.0,11.97,0.05,0.036
2858,2021-11-29,Algeria,Africa,176.143,17.0,0.381,5.429,16190.0,12.06,0.06,0.012
6490,2021-11-15,Argentina,South America,1298.571,565.0,12.389,15.286,356203.0,60.26,1.96,0.187
6492,2021-11-17,Argentina,South America,1335.571,572.0,12.542,18.429,400490.0,61.08,2.29,0.167


In [18]:
# Print a summary of the cleaned dataframe: index range, column names,
# non-null counts and dtypes
clean_selection.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5863 entries, 2850 to 156684
Data columns (total 11 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   date                                        5863 non-null   object 
 1   location                                    5863 non-null   object 
 2   continent                                   5863 non-null   object 
 3   new_cases_smoothed                          5863 non-null   float64
 4   icu_patients                                5863 non-null   float64
 5   icu_patients_per_million                    5863 non-null   float64
 6   new_deaths_smoothed                         5863 non-null   float64
 7   new_vaccinations_smoothed                   5863 non-null   float64
 8   people_fully_vaccinated_per_hundred         5863 non-null   float64
 9   total_boosters_per_hundred                  5863 non-null   float64
 10  new_peo

In [19]:
# Write the cleaned selection to a CSV file; index=False leaves the row
# labels out of the file
output_file_path = "clean_selected_covid_data2.csv"
clean_selection.to_csv(path_or_buf=output_file_path, index=False)