In [2]:
import pandas as pd
import numpy as np

In [3]:
### Loading the data
### NOTE(review): the path below is absolute and specific to one machine, so this cell only runs where
### that exact file exists. Consider pointing it at a location relative to the notebook.
###
### The load is written as one chain so that re-running the cell always rebuilds `df` from the file:
###   1. read the raw CSV,
###   2. fix the date column by parsing it into datetimes,
###   3. replace every missing value (NaN) with 0 instead of dropping the row.
### Step 3 applies to ALL columns, so a missing text value such as `continent` becomes the integer 0.
df = (
    pd.read_csv("C:/Users/andre/Documents/Analysis_Projects/FinalProject/owid-covid-data.csv")
    .assign(date=lambda raw: pd.to_datetime(raw["date"]))
    .replace(np.nan, 0)
)
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,37.746,0.5,64.83,0.511,0.0,0.0,0.0,0.0
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,37.746,0.5,64.83,0.511,0.0,0.0,0.0,0.0
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,37.746,0.5,64.83,0.511,0.0,0.0,0.0,0.0
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,37.746,0.5,64.83,0.511,0.0,0.0,0.0,0.0
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,37.746,0.5,64.83,0.511,0.0,0.0,0.0,0.0


In [4]:
### Not every row in the raw data is a country: ISO codes with OWID abbreviations carry the data of an
### entire continent, so keeping them would skew a per-country analysis.
###
### Missing values were replaced with 0 when the data was loaded, so the rows dropped here are exactly
### the ones whose `continent` is 0 (presumably the aggregate rows described above -- verify against
### the raw file).
###
### Only the columns needed for the analysis are kept. `.copy()` makes `countries_df` an independent
### frame, so adding columns to it later does not raise SettingWithCopyWarning or touch `df`.
countries_df = df.loc[
    df["continent"] != 0,
    [
        "iso_code", "continent", "location", "date",
        "new_cases", "new_deaths", "total_cases", "total_deaths",
        "reproduction_rate", "hosp_patients", "new_tests", "positive_rate",
        "new_vaccinations", "population", "median_age", "hospital_beds_per_thousand",
    ],
].copy()

countries_df.head()

Unnamed: 0,iso_code,continent,location,date,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,hosp_patients,new_tests,positive_rate,new_vaccinations,population,median_age,hospital_beds_per_thousand
0,AFG,Asia,Afghanistan,2020-02-24,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
1,AFG,Asia,Afghanistan,2020-02-25,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
2,AFG,Asia,Afghanistan,2020-02-26,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
3,AFG,Asia,Afghanistan,2020-02-27,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
4,AFG,Asia,Afghanistan,2020-02-28,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5


In [None]:
### Save the cleaned per-country data to disk (row index not written).
### Reminder on the cleaning choice: nulls were not dropped; they were replaced with zero instead.
countries_df.to_csv(path_or_buf="cleaned_data_andrea.csv", index=False)

In [64]:
# Average of total_cases per location, sorted from the lowest to the highest average.
grouped_df = countries_df.groupby("location")
grouped_lists = (
    grouped_df["total_cases"]
    .mean()
    .reset_index()
    .sort_values(by=["total_cases"])
)

# Split the sorted locations into three contiguous, (near-)equal groups.
# The cut points are derived from the number of locations instead of being hard-coded, and each
# group starts exactly where the previous one ends. Python slices exclude their end position, so
# the previous bounds 0:77 / 78:155 / 156:232 silently left sorted rows 77 and 155 in no group.
n_locations = len(grouped_lists)
low_end = n_locations // 3
med_end = 2 * n_locations // 3

low = grouped_lists.iloc[:low_end]
med = grouped_lists.iloc[low_end:med_end]
high = grouped_lists.iloc[med_end:]

In [65]:
# Creating a new column, risk_level, that labels every row as 'low', 'med' or 'high' by comparing
# the row's total_cases with the upper bounds of the `low` and `med` groups.
# NOTE(review): the bounds come from per-location averages but are applied to each daily row, so a
# single location can carry different labels on different dates -- confirm this is intended.
low_max = low["total_cases"].max()
med_max = med["total_cases"].max()

# Conditions are checked in order, so the second one only applies to rows above low_max.
# Testing just `<= med_max` (rather than `min(med) <= x <= max(med)`) means a value that falls
# between the top of `low` and the bottom of `med` is labelled 'med' instead of dropping to 'high'.
row_total_cases = countries_df["total_cases"]
risk_labels = np.select(
    [row_total_cases <= low_max, row_total_cases <= med_max],
    ["low", "med"],
    default="high",
)

# assign() returns a new frame, so re-running this cell is safe and no slice is modified in place.
countries_df = countries_df.assign(risk_level=risk_labels)
countries_df

Unnamed: 0,iso_code,continent,location,date,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,hosp_patients,new_tests,positive_rate,new_vaccinations,population,median_age,hospital_beds_per_thousand,risk_level
0,AFG,Asia,Afghanistan,2020-02-24,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0000,0.0,39835428.0,18.6,0.5,low
1,AFG,Asia,Afghanistan,2020-02-25,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0000,0.0,39835428.0,18.6,0.5,low
2,AFG,Asia,Afghanistan,2020-02-26,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0000,0.0,39835428.0,18.6,0.5,low
3,AFG,Asia,Afghanistan,2020-02-27,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0000,0.0,39835428.0,18.6,0.5,low
4,AFG,Asia,Afghanistan,2020-02-28,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0000,0.0,39835428.0,18.6,0.5,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153388,ZWE,Africa,Zimbabwe,2022-01-05,1379.0,14.0,219057.0,5092.0,0.0,0.0,6885.0,0.2811,15571.0,15092171.0,19.6,1.7,high
153389,ZWE,Africa,Zimbabwe,2022-01-06,1121.0,16.0,220178.0,5108.0,0.0,0.0,6048.0,0.2119,13877.0,15092171.0,19.6,1.7,high
153390,ZWE,Africa,Zimbabwe,2022-01-07,1104.0,28.0,221282.0,5136.0,0.0,0.0,6723.0,0.1983,14736.0,15092171.0,19.6,1.7,high
153391,ZWE,Africa,Zimbabwe,2022-01-08,636.0,12.0,221918.0,5148.0,0.0,0.0,0.0,0.0000,8238.0,15092171.0,19.6,1.7,high


In [None]:
### Grouping countries into levels


In [None]:
# Continent-level rows (rows whose `location` is itself a continent name).
# These are looked up in `df`, not in `countries_df`: `countries_df` was built by discarding every
# row whose continent is 0, so if the continent-level rows are the ones with a missing continent
# (as the cleaning step suggests -- TODO confirm against the raw file) a `continent == location`
# test inside `countries_df` can never match and the result is always empty.
# Matching on the location name also still picks up any row where continent and location are equal.
continent_names = countries_df["continent"].unique()

# Keep the same columns as `countries_df` where `df` has them (columns added to `countries_df`
# after cleaning do not exist in `df` and are therefore left out).
shared_columns = [column for column in countries_df.columns if column in df.columns]

continents_df = df.loc[df["location"].isin(continent_names), shared_columns].copy()

In [None]:
### One subset of countries_df per continent.
### (Still undecided whether these dataframes will be used.)

def _rows_for_continent(continent_name):
    """Return the rows of countries_df whose `continent` equals continent_name."""
    is_in_continent = countries_df["continent"] == continent_name
    return countries_df[is_in_continent]


asia_df = _rows_for_continent("Asia")
europe_df = _rows_for_continent("Europe")
na_df = _rows_for_continent("North America")
sa_df = _rows_for_continent("South America")
oceania_df = _rows_for_continent("Oceania")
africa_df = _rows_for_continent("Africa")