In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression 
from matplotlib import pyplot as plt


from pathlib import Path

In [3]:
  # Loading data
# Relative path: the CSV is looked up in the notebook's current working directory.
file_path = Path("owid-covid-data.csv")
df = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first rows (head() defaults to five) to sanity-check the load.
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [6]:
# Take a subset of columns with .loc, for rows up to and including label 164946.
# NOTE(review): .loc slices by label and includes the end label, so 164946 looks
# like the last row of the CSV snapshot this was written against -- confirm before
# re-running on a newer download, which would be silently truncated here.
LAST_ROW_LABEL = 164946
SELECTED_COLUMNS = [
    'date',
    'location',
    'continent',
    'new_cases_smoothed',
    'icu_patients',
    'icu_patients_per_million',
    'new_deaths_smoothed',
    'new_vaccinations_smoothed',
]
selection = df.loc[:LAST_ROW_LABEL, SELECTED_COLUMNS]
selection.head()

Unnamed: 0,date,location,continent,new_cases_smoothed,icu_patients,icu_patients_per_million,new_deaths_smoothed,new_vaccinations_smoothed
0,2020-02-24,Afghanistan,Asia,,,,,
1,2020-02-25,Afghanistan,Asia,,,,,
2,2020-02-26,Afghanistan,Asia,,,,,
3,2020-02-27,Afghanistan,Asia,,,,,
4,2020-02-28,Afghanistan,Asia,,,,,


In [7]:
# Count the missing values in each selected column.
selection.isna().sum()

date                              0
location                          0
continent                      9735
new_cases_smoothed             4145
icu_patients                 139543
icu_patients_per_million     139543
new_deaths_smoothed           20594
new_vaccinations_smoothed     81532
dtype: int64

In [8]:
# Drop every row that is missing a value in any of the selected columns.
# The result is a new frame; `selection` itself is left unchanged.
clean_selection = selection.dropna(axis=0, how='any')

In [9]:
# Count rows that are exact repeats of an earlier row across all columns.
duplicate_count = clean_selection.duplicated().sum()
print(f'Duplicate entries: {duplicate_count}')

Duplicate entries: 0


In [10]:
# Preview the first five rows of the cleaned frame (not the whole frame).
# The original row labels are kept, so the index no longer starts at 0.
clean_selection.head(n=5)

Unnamed: 0,date,location,continent,new_cases_smoothed,icu_patients,icu_patients_per_million,new_deaths_smoothed,new_vaccinations_smoothed
2522,2021-01-30,Algeria,Africa,250.429,33.0,0.74,3.857,30.0
2524,2021-02-01,Algeria,Africa,246.286,29.0,0.65,4.0,2509.0
2525,2021-02-02,Algeria,Africa,249.143,23.0,0.516,3.857,2819.0
2528,2021-02-05,Algeria,Africa,248.857,21.0,0.471,3.571,3217.0
2531,2021-02-08,Algeria,Africa,247.857,25.0,0.56,3.429,3748.0


In [11]:
# Summarise the cleaned frame: index range, per-column non-null counts, dtypes
# and memory usage. info() prints its report and returns None, so the text
# below is printed output rather than a cell result.
clean_selection.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13856 entries, 2522 to 154244
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   date                       13856 non-null  object 
 1   location                   13856 non-null  object 
 2   continent                  13856 non-null  object 
 3   new_cases_smoothed         13856 non-null  float64
 4   icu_patients               13856 non-null  float64
 5   icu_patients_per_million   13856 non-null  float64
 6   new_deaths_smoothed        13856 non-null  float64
 7   new_vaccinations_smoothed  13856 non-null  float64
dtypes: float64(5), object(3)
memory usage: 974.2+ KB


In [12]:
# Write the cleaned selection to a CSV in the working directory.
# index=False leaves the row labels out of the file.
output_file_path = "clean_selected_covid_data2.csv"
clean_selection.to_csv(path_or_buf=output_file_path, index=False)