In [1]:
import pandas as pd
import numpy as np
import random as rd
import scipy.stats
import statsmodels.api as sm

In [2]:
### Loading the data
df = pd.read_csv("C:/Users/andre/Documents/Analysis_Projects/FinalProject/owid-covid-data.csv")
df.head()

### We also want to make sure we fix the date column.
df["date"] = pd.to_datetime(df["date"])
df.replace(np.nan, 0, inplace = True)
countries_df = df[df.continent != 0]
countries_df = countries_df.loc[countries_df["location"] != countries_df["continent"]]

In [3]:
### Take only the columns we want

countries_df = countries_df[["iso_code", "continent", "location", "date", "new_cases", "new_deaths", "total_cases", "total_deaths", "reproduction_rate", "hosp_patients", "new_tests", "total_tests", "positive_rate", "new_vaccinations", "population", "median_age", "hospital_beds_per_thousand" ]]

countries_df.head()

Unnamed: 0,iso_code,continent,location,date,new_cases,new_deaths,total_cases,total_deaths,reproduction_rate,hosp_patients,new_tests,total_tests,positive_rate,new_vaccinations,population,median_age,hospital_beds_per_thousand
0,AFG,Asia,Afghanistan,2020-02-24,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
1,AFG,Asia,Afghanistan,2020-02-25,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
2,AFG,Asia,Afghanistan,2020-02-26,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
3,AFG,Asia,Afghanistan,2020-02-27,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5
4,AFG,Asia,Afghanistan,2020-02-28,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39835428.0,18.6,0.5


In [4]:
### In order to clean this data, we didn't want to drop the null so we decided to replace it with zero instead.
countries_df.to_csv("cleaned_data_andrea.csv", index = False)

In [5]:
### Grouping the countries in levels of risk of COVID by looking at the total cases.

grouped_df = countries_df.groupby("location")
grouped_lists = grouped_df["total_cases"].mean()
grouped_lists = grouped_lists.reset_index()

# grouped_lists.describe()

grouped_lists = grouped_lists.sort_values(by=["total_cases"])

# Grouping the list by aver  age of total_cases
low = grouped_lists[0:75]
med = grouped_lists[75:150]
high = grouped_lists[150:]

In [6]:
# Creating a new column to determine
countries_df['risk_level'] = ['low' if x<=max(low["total_cases"]) else 'med' if min(med["total_cases"])<=x<=max(med["total_cases"]) else 'high' for x in countries_df['total_cases']]

In [7]:
### Get 3 countries out of each
def pick_three():
    a = []
    
    low_cn = low.sample()
    low_cn = low_cn["location"].to_string()
    
    med_cn = med.sample()
    med_cn = med_cn["location"].to_string()
    
    high_cn = high.sample()
    high_cn = high_cn["location"].to_string()
    
    a.append(low_cn)
    a.append(med_cn)
    a.append(high_cn)
    
    new = []
    for index in a:
        for i,x in enumerate(index):
            if x.isalpha():      #True if its a letter
                pos = i 
                break
        b = index[pos:]
        new.append(b)
    
    return new

In [8]:
### Testing the function pick_three()
a = pick_three()
a

['Bermuda', 'Madagascar', 'Morocco']

In [12]:
### Now we can get to the ML Model

In [9]:
def make_df():
    countries = pick_three()
    new_df = pd.DataFrame(columns = countries_df.columns)
    entries = []

    for i in countries_df["location"]:
        if i in countries:
            entry = countries_df.loc[countries_df['location'] == i]
            entries.append(entry)

    found_df = pd.concat(entries)
    result_df = pd.concat([new_df, found_df])
    
    return result_df

In [10]:
### Now, we obviously want to see the correlation
def tests_vs_cases():
    a = make_df()
    data1 = a.loc[a["location"] == a["location"].unique()[0]]
    data2 = a.loc[a["location"] == a["location"].unique()[1]]
    data3 = a.loc[a["location"] == a["location"].unique()[2]]
    
    data1_col1 = data1["total_tests"]
    data1_col2 = data1["total_cases"]

    data2_col1 = data2["total_tests"]
    data2_col2 = data2["total_cases"]

    data3_col1 = data3["total_tests"]
    data3_col2 = data3["total_cases"]

    data1_corr = data1_col1.corr(data1_col2)
    data2_corr = data2_col1.corr(data2_col2)
    data3_corr = data3_col1.corr(data3_col2)
        
    
    ### Now let's see if we can obtain the entire summary statistics for the linear regression model.
    total_tests_1 = data1_col1.to_list()
    total_cases_1 = data1_col2.to_list()
    total_tests_1, total_cases_1 = np.array(total_tests_1), np.array(total_cases_1)
    total_tests_1 = sm.add_constant(total_tests_1)
    model1 = sm.OLS(total_cases_1, total_tests_1)
    results1 = model1.fit()
    
    print(f"Low Risk: The correlation coefficient between the total tests administered and the total cases in " + data1["location"].unique()[0] + " is " + str(data1_corr))
    print("------------------------------------------------------------------------------------------------")
    print(f"Model summary for: " + data1["location"].unique()[0] + ".")
    print(results1.summary())
    print("------------------------------------------------------------------------------------------------")

    
    total_tests_2 = data2_col1.to_list()
    total_cases_2 = data2_col2.to_list()
    total_tests_2, total_cases_2 = np.array(total_tests_2), np.array(total_cases_2)
    total_tests_2 = sm.add_constant(total_tests_2)
    model2 = sm.OLS(total_cases_2, total_tests_2)
    results2 = model2.fit()
    
    print(f"Med Risk: The correlation coefficient between the total tests administered and the total cases in " + data2["location"].unique()[0] + " is " + str(data2_corr))
    print("------------------------------------------------------------------------------------------------")
    print(f"Model summary for: " + data2["location"].unique()[0] + ".")
    print(results2.summary())
    print("------------------------------------------------------------------------------------------------")

    total_tests_3 = data3_col1.to_list()
    total_cases_3 = data3_col2.to_list()
    total_tests_3, total_cases_3 = np.array(total_tests_3), np.array(total_cases_3)
    total_tests_3 = sm.add_constant(total_tests_3)
    model3 = sm.OLS(total_cases_3, total_tests_3)
    results3 = model3.fit()
    
    print(f"High Risk: The correlation coefficient between the total tests administered and the total cases in " + data3["location"].unique()[0] + " is " + str(data3_corr))
    print("------------------------------------------------------------------------------------------------")    
    print(f"Model summary for: " + data3["location"].unique()[0] + ".")
    print(results3.summary())
    print("------------------------------------------------------------------------------------------------")

    return a

In [12]:
test = tests_vs_cases()

Low Risk: The correlation coefficient between the total tests administered and the total cases in Brazil is 0.3128274230953361
------------------------------------------------------------------------------------------------
Model summary for: Brazil.
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.098
Model:                            OLS   Adj. R-squared:                  0.098
Method:                 Least Squares   F-statistic:                 5.210e+04
Date:                Mon, 31 Jan 2022   Prob (F-statistic):               0.00
Time:                        19:36:33   Log-Likelihood:            -8.3006e+06
No. Observations:              480249   AIC:                         1.660e+07
Df Residuals:                  480247   BIC:                         1.660e+07
Df Model:                           1                                         
Covariance Type:            nonrobust 

# Now we will move on to seeing whether the tests and deaths

In [20]:
def tests_vs_deaths():
    a = make_df()
    data1 = a.loc[a["location"] == a["location"].unique()[0]]
    data2 = a.loc[a["location"] == a["location"].unique()[1]]
    data3 = a.loc[a["location"] == a["location"].unique()[2]]
    
    data1_col1 = data1["total_tests"]
    data1_col2 = data1["total_deaths"]

    data2_col1 = data2["total_tests"]
    data2_col2 = data2["total_deaths"]

    data3_col1 = data3["total_tests"]
    data3_col2 = data3["total_deaths"]

    data1_corr = data1_col1.corr(data1_col2)
    data2_corr = data2_col1.corr(data2_col2)
    data3_corr = data3_col1.corr(data3_col2)
        
    
    ### Now let's see if we can obtain the entire summary statistics for the linear regression model.
    total_tests_1 = data1_col1.to_list()
    total_deaths_1 = data1_col2.to_list()
    total_tests_1, total_deaths_1 = np.array(total_tests_1), np.array(total_deaths_1)
    total_tests_1 = sm.add_constant(total_tests_1)
    model1 = sm.OLS(total_deaths_1, total_tests_1)
    results1 = model1.fit()
    
    print(f"Low Risk: The correlation coefficient between the total tests administered and the total deaths in " + data1["location"].unique()[0] + " is " + str(data1_corr))
    print("------------------------------------------------------------------------------------------------")
    print(f"Model summary for: " + data1["location"].unique()[0] + ".")
    print(results1.summary())
    print("------------------------------------------------------------------------------------------------")

    
    total_tests_2 = data2_col1.to_list()
    total_deaths_2 = data2_col2.to_list()
    total_tests_2, total_deaths_2 = np.array(total_tests_2), np.array(total_deaths_2)
    total_tests_2 = sm.add_constant(total_tests_2)
    model2 = sm.OLS(total_deaths_2, total_tests_2)
    results2 = model2.fit()
    
    print(f"Med Risk: The correlation coefficient between the total tests administered and the total deaths in " + data2["location"].unique()[0] + " is " + str(data2_corr))
    print("------------------------------------------------------------------------------------------------")
    print(f"Model summary for: " + data2["location"].unique()[0] + ".")
    print(results2.summary())
    print("------------------------------------------------------------------------------------------------")

    total_tests_3 = data3_col1.to_list()
    total_deaths_3 = data3_col2.to_list()
    total_tests_3, total_deaths_3 = np.array(total_tests_3), np.array(total_deaths_3)
    total_tests_3 = sm.add_constant(total_tests_3)
    model3 = sm.OLS(total_deaths_3, total_tests_3)
    results3 = model3.fit()
    
    print(f"High Risk: The correlation coefficient between the total tests administered and the total deaths in " + data3["location"].unique()[0] + " is " + str(data3_corr))
    print("------------------------------------------------------------------------------------------------")    
    print(f"Model summary for: " + data3["location"].unique()[0] + ".")
    print(results3.summary())
    print("------------------------------------------------------------------------------------------------")

    return a

In [21]:
deaths = tests_vs_deaths()

Low Risk: The correlation coefficient between the total tests administered and the total deaths in Laos is 0.13087276256137512
------------------------------------------------------------------------------------------------
Model summary for: Laos.
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     7729.
Date:                Sun, 30 Jan 2022   Prob (F-statistic):               0.00
Time:                        13:28:15   Log-Likelihood:            -2.6280e+06
No. Observations:              443556   AIC:                         5.256e+06
Df Residuals:                  443554   BIC:                         5.256e+06
Df Model:                           1                                         
Covariance Type:            nonrobust   

  return np.sqrt(eigvals[0]/eigvals[-1])
  return 1 - self.ssr/self.centered_tss
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)


High Risk: The correlation coefficient between the total tests administered and the total deaths in Vietnam is 0.6409363600651233
------------------------------------------------------------------------------------------------
Model summary for: Vietnam.
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.411
Model:                            OLS   Adj. R-squared:                  0.411
Method:                 Least Squares   F-statistic:                 3.685e+05
Date:                Sun, 30 Jan 2022   Prob (F-statistic):               0.00
Time:                        13:28:15   Log-Likelihood:            -5.4651e+06
No. Observations:              528529   AIC:                         1.093e+07
Df Residuals:                  528527   BIC:                         1.093e+07
Df Model:                           1                                         
Covariance Type:            nonrob

In [23]:
def vacs_vs_cases():
    a = make_df()
    data1 = a.loc[a["location"] == a["location"].unique()[0]]
    data2 = a.loc[a["location"] == a["location"].unique()[1]]
    data3 = a.loc[a["location"] == a["location"].unique()[2]]
    
    data1_col1 = data1["new_vaccinations"]
    data1_col2 = data1["total_cases"]

    data2_col1 = data2["new_vaccinations"]
    data2_col2 = data2["total_cases"]

    data3_col1 = data3["new_vaccinations"]
    data3_col2 = data3["total_cases"]

    data1_corr = data1_col1.corr(data1_col2)
    data2_corr = data2_col1.corr(data2_col2)
    data3_corr = data3_col1.corr(data3_col2)
        
    
    ### Now let's see if we can obtain the entire summary statistics for the linear regression model.
    new_vacs_1 = data1_col1.to_list()
    total_cases_1 = data1_col2.to_list()
    new_vacs_1, total_cases_1 = np.array(new_vacs_1), np.array(total_cases_1)
    new_vacs_1 = sm.add_constant(new_vacs_1)
    model1 = sm.OLS(total_cases_1, new_vacs_1)
    results1 = model1.fit()
    
    print(f"Low Risk: The correlation coefficient between the new vaccines administered and the total cases in " + data1["location"].unique()[0] + " is " + str(data1_corr))
    print("------------------------------------------------------------------------------------------------")
    print(f"Model summary for: " + data1["location"].unique()[0] + ".")
    print(results1.summary())
    print("------------------------------------------------------------------------------------------------")

    
    total_tests_2 = data2_col1.to_list()
    total_deaths_2 = data2_col2.to_list()
    total_tests_2, total_deaths_2 = np.array(total_tests_2), np.array(total_deaths_2)
    total_tests_2 = sm.add_constant(total_tests_2)
    model2 = sm.OLS(total_deaths_2, total_tests_2)
    results2 = model2.fit()
    
    print(f"Med Risk: The correlation coefficient between the new vaccines administered and the total cases in " + data2["location"].unique()[0] + " is " + str(data2_corr))
    print("------------------------------------------------------------------------------------------------")
    print(f"Model summary for: " + data2["location"].unique()[0] + ".")
    print(results2.summary())
    print("------------------------------------------------------------------------------------------------")

    total_tests_3 = data3_col1.to_list()
    total_deaths_3 = data3_col2.to_list()
    total_tests_3, total_deaths_3 = np.array(total_tests_3), np.array(total_deaths_3)
    total_tests_3 = sm.add_constant(total_tests_3)
    model3 = sm.OLS(total_deaths_3, total_tests_3)
    results3 = model3.fit()
    
    print(f"High Risk: The correlation coefficient between the new vaccines administered and the total cases in " + data3["location"].unique()[0] + " is " + str(data3_corr))
    print("------------------------------------------------------------------------------------------------")    
    print(f"Model summary for: " + data3["location"].unique()[0] + ".")
    print(results3.summary())
    print("------------------------------------------------------------------------------------------------")

    return a

In [24]:
vacscases = vacs_vs_cases()

Low Risk: The correlation coefficient between the new vaccines administered and the total cases in Gambia is 0.05125489634418376
------------------------------------------------------------------------------------------------
Model summary for: Gambia.
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     1193.
Date:                Sun, 30 Jan 2022   Prob (F-statistic):          4.44e-261
Time:                        13:34:50   Log-Likelihood:            -4.3392e+06
No. Observations:              452929   AIC:                         8.678e+06
Df Residuals:                  452927   BIC:                         8.678e+06
Df Model:                           1                                         
Covariance Type:            nonrobus

In [25]:
def vacs_vs_deaths():
    a = make_df()
    data1 = a.loc[a["location"] == a["location"].unique()[0]]
    data2 = a.loc[a["location"] == a["location"].unique()[1]]
    data3 = a.loc[a["location"] == a["location"].unique()[2]]
    
    data1_col1 = data1["new_vaccinations"]
    data1_col2 = data1["total_deaths"]

    data2_col1 = data2["new_vaccinations"]
    data2_col2 = data2["total_deaths"]

    data3_col1 = data3["new_vaccinations"]
    data3_col2 = data3["total_deaths"]

    data1_corr = data1_col1.corr(data1_col2)
    data2_corr = data2_col1.corr(data2_col2)
    data3_corr = data3_col1.corr(data3_col2)
        
    
    ### Now let's see if we can obtain the entire summary statistics for the linear regression model.
    new_vacs_1 = data1_col1.to_list()
    total_deaths_1 = data1_col2.to_list()
    new_vacs_1, total_deaths_1 = np.array(new_vacs_1), np.array(total_deaths_1)
    new_vacs_1 = sm.add_constant(new_vacs_1)
    model1 = sm.OLS(total_deaths_1, new_vacs_1)
    results1 = model1.fit()
    
    print(f"Low Risk: The correlation coefficient between the new vaccinations administered and the total deaths in " + data1["location"].unique()[0] + " is " + str(data1_corr))
    print("------------------------------------------------------------------------------------------------")
    print(f"Model summary for: " + data1["location"].unique()[0] + ".")
    print(results1.summary())
    print("------------------------------------------------------------------------------------------------")

    
    new_vacs_2 = data2_col1.to_list()
    total_deaths_2 = data2_col2.to_list()
    new_vacs_2, total_deaths_2 = np.array(new_vacs_2), np.array(total_deaths_2)
    new_vacs_2 = sm.add_constant(new_vacs_2)
    model2 = sm.OLS(total_deaths_2, new_vacs_2)
    results2 = model2.fit()
    
    print(f"Med Risk: The correlation coefficient between the new vaccinations administered and the total deaths in " + data2["location"].unique()[0] + " is " + str(data2_corr))
    print("------------------------------------------------------------------------------------------------")
    print(f"Model summary for: " + data2["location"].unique()[0] + ".")
    print(results2.summary())
    print("------------------------------------------------------------------------------------------------")

    total_tests_3 = data3_col1.to_list()
    total_deaths_3 = data3_col2.to_list()
    total_tests_3, total_deaths_3 = np.array(total_tests_3), np.array(total_deaths_3)
    total_tests_3 = sm.add_constant(total_tests_3)
    model3 = sm.OLS(total_deaths_3, total_tests_3)
    results3 = model3.fit()
    
    print(f"High Risk: The correlation coefficient between the new vaccinations administered and the total deaths in " + data3["location"].unique()[0] + " is " + str(data3_corr))
    print("------------------------------------------------------------------------------------------------")    
    print(f"Model summary for: " + data3["location"].unique()[0] + ".")
    print(results3.summary())
    print("------------------------------------------------------------------------------------------------")

    return a

In [34]:
vacsdeaths = vacs_vs_deaths()

Low Risk: The correlation coefficient between the new vaccinations administered and the total deaths in Denmark is 0.5974160293132358
------------------------------------------------------------------------------------------------
Model summary for: Denmark.
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.357
Model:                            OLS   Adj. R-squared:                  0.357
Method:                 Least Squares   F-statistic:                 2.853e+05
Date:                Sun, 30 Jan 2022   Prob (F-statistic):               0.00
Time:                        13:49:22   Log-Likelihood:            -4.1988e+06
No. Observations:              514089   AIC:                         8.398e+06
Df Residuals:                  514087   BIC:                         8.398e+06
Df Model:                           1                                         
Covariance Type:            no

In [None]:
### These would be used for Tableau

asia_df = countries_df[countries_df["continent"] == "Asia"]
europe_df = countries_df[countries_df["continent"] == "Europe"]
na_df = countries_df[countries_df["continent"] == "North America"]
sa_df = countries_df[countries_df["continent"] == "South America"]
oceania_df = countries_df[countries_df["continent"] == "Oceania"]
africa_df = countries_df[countries_df["continent"] == "Africa"]

asia_df.to_csv("asia_df.csv", index = False)
europe_df.to_csv("europe_df.csv", index = False)
na_df.to_csv("northamerica_df.csv", index = False)
sa_df.to_csv("southamerica_df.csv", index = False)
oceania_df.to_csv("oceania_df.csv", index = False)
africa_df.to_csv("africa_df.csv", index = False)