In [1]:
import csv
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

import sup_func as sf

# Download the population-by-country and COVID-19 tables from worldometers.info.

url_popu = 'https://www.worldometers.info/world-population/population-by-country/'
url_covid = 'https://www.worldometers.info/coronavirus/#countries'

# A timeout keeps the notebook from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
r_popu = requests.get(url_popu, timeout=30)
r_popu.raise_for_status()
r_covid = requests.get(url_covid, timeout=30)
r_covid.raise_for_status()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's guessed-parser warning).
soup_popu = BeautifulSoup(r_popu.content, 'html.parser')
soup_covid = BeautifulSoup(r_covid.content, 'html.parser')

# Only the first <table> element of each page is used.
countries_popu = soup_popu.find_all('table')[0]
countries_covid = soup_covid.find_all('table')[0]

# pandas deprecated passing literal HTML strings to read_html (2.1+); a
# file-like StringIO wrapper works on old and new versions alike.
df_popu = pd.read_html(StringIO(str(countries_popu)))[0]
df_covid = pd.read_html(StringIO(str(countries_covid)))[0]

# Keep only the population-table columns used in the analysis.

popu_columns = [
    'Country (or dependency)',
    'Population (2020)',
    'Density (P/Km²)',
    'Land Area (Km²)',
    'Migrants (net)',
    'Med. Age',
    'Urban Pop %',
]
df_raw = df_popu[popu_columns].copy()
# Columns that contain nothing but NaN are discarded.
df_raw = df_raw.dropna(how='all', axis=1)

# The two tables spell some country names differently. This maps the spelling
# used in the COVID table (key) to the one used in the population table (value).
country_dic = {
    'USA': 'United States',
    'UK': 'United Kingdom',
    'S. Korea': 'South Korea',
    'UAE': 'United Arab Emirates',
    'Czechia': 'Czech Republic (Czechia)',
    'Ivory Coast': "Côte d'Ivoire",
    'DRC': 'DR Congo',
    'Palestine': 'State of Palestine',
    'CAR': 'Central African Republic',
    'Saint Kitts and Nevis': 'Saint Kitts & Nevis',
    'St. Vincent Grenadines': 'St. Vincent & Grenadines',
    'Vatican City': 'Holy See',
    'St. Barth': 'Saint Barthelemy',
    'Sao Tome and Principe': 'Sao Tome & Principe',
    'Saint Pierre Miquelon': 'Saint Pierre & Miquelon',
}

# Add the join key to the COVID table, with the names harmonised.
df_covid['Country (or dependency)'] = df_covid['Country,Other'].replace(country_dic)

# COVID columns that are not used further (one is addressed by position 8).
unused_covid_columns = [
    df_covid.columns[8],
    'Country,Other',
    'NewCases',
    'NewDeaths',
    'ActiveCases',
    'Serious,Critical',
    'Tests/ 1M pop',
    'Deaths/1M pop',
]
df_subs = df_covid.drop(columns=unused_covid_columns)

# Attach the COVID figures to the population rows, matching on country name.
covid_by_country = df_subs.set_index('Country (or dependency)')
df_raw = df_raw.join(covid_by_country, on='Country (or dependency)')

# 'Urban Pop %' arrives as text (e.g. '57 %' or 'N.A.'); convert it to a float
# fraction where 100 % is 1.0 and 0 % is 0.0.

def _urban_share(value):
    """Convert one 'Urban Pop %' cell to a fraction between 0 and 1.

    Strings such as '57 %' become 0.57 and 'N.A.' becomes NaN. Values that
    are not strings (for example NaN) are returned unchanged.
    Raises ValueError for any other text that is not a number.
    """
    if not isinstance(value, str):
        return value
    text = value.strip()
    if text == 'N.A.':
        return np.nan
    # Parse every digit before the percent sign. Slicing the first two
    # characters, as was done before, turned '100 %' into 0.10.
    return float(text.rstrip('%').strip()) / 100


# An element-wise map does not rely on unique index labels, unlike a
# row-by-row .loc lookup.
df_raw['Urban Pop %'] = df_raw['Urban Pop %'].map(_urban_share).astype('float64')
# Encode every country name as an integer: its position in the sorted array of
# distinct names. dic_reverse maps each code back to its name.
countries_list = np.sort(df_raw['Country (or dependency)'].unique())
countries_dic = {name: code for code, name in enumerate(countries_list)}
dic_reverse = {code: name for name, code in countries_dic.items()}

df_raw = df_raw.replace({'Country (or dependency)': countries_dic})

# 'Med. Age' is the last text column; values that cannot be parsed become NaN.
df_raw['Med. Age'] = pd.to_numeric(df_raw['Med. Age'], errors='coerce')
# Add a 'free healthcare' flag: 1 for countries listed in the CSV, 0 otherwise.

# Spellings in the CSV that differ from the population table.
hc_edit = {'Czech Republic':'Czech Republic (Czechia)',
           'Macau':'Macao',
           'Saint Vincent and the Grenadines':'St. Vincent & Grenadines'}

df_healthcare = (
    pd.read_csv('countries with free healthcare.csv')
    .replace({'name': hc_edit})
    .assign(**{'free healthcare': 1})
    .drop(columns=['pop2020'])
    # Use the same integer country codes as df_raw so the join keys match.
    .replace({'name': countries_dic})
)

df_raw = df_raw.join(df_healthcare.set_index('name'), on='Country (or dependency)')
# Assign the filled column back. Calling fillna(inplace=True) on df_raw[col]
# works on a temporary object and does not update df_raw under pandas
# Copy-on-Write.
df_raw['free healthcare'] = df_raw['free healthcare'].fillna(0)

# Every column should now be numeric; display the dtypes to confirm.
df_raw.dtypes

Country (or dependency)      int64
Population (2020)            int64
Density (P/Km²)              int64
Land Area (Km²)              int64
Migrants (net)             float64
Med. Age                   float64
Urban Pop %                float64
TotalCases                 float64
TotalDeaths                float64
TotalRecovered             float64
TotalTests                 float64
free healthcare            float64
dtype: object

In [2]:
# Label for each population category code.
dict_popu = {
    1: 'higher 1B',
    2: 'between 100M and 1B',
    3: 'between 30M and 100M',
    4: 'between 10M and 30M',
    5: 'between 1M and 10M',
    6: 'less than 1M',
}

# Distinct population values, passed to the sup_func helper; the printed
# result maps each population value to a category code.
popu = df_raw['Population (2020)'].unique()
dic_replace_popu = sf.get_cat_popu(popu)
print(dic_replace_popu)


{1439323776: 1, 1380004385: 1, 331002651: 2, 273523615: 2, 220892340: 2, 212559417: 2, 206139589: 2, 164689383: 2, 145934462: 2, 128932753: 2, 126476461: 2, 114963588: 2, 109581078: 2, 102334404: 2, 97338579: 3, 89561403: 3, 84339067: 3, 83992949: 3, 83783942: 3, 69799978: 3, 67886011: 3, 65273511: 3, 60461826: 3, 59734218: 3, 59308690: 3, 54409800: 3, 53771296: 3, 51269185: 3, 50882891: 3, 46754778: 3, 45741007: 3, 45195774: 3, 43851044: 3, 43849260: 3, 43733762: 3, 40222493: 3, 38928346: 3, 37846611: 3, 37742154: 3, 36910560: 3, 34813871: 3, 33469203: 3, 32971854: 3, 32866272: 3, 32365999: 3, 31255435: 3, 31072940: 3, 29825964: 4, 29136808: 4, 28435940: 4, 27691018: 4, 26545863: 4, 26378274: 4, 25778816: 4, 25499884: 4, 24206644: 4, 23816775: 4, 21413249: 4, 20903273: 4, 20250833: 4, 19237691: 4, 19129952: 4, 19116201: 4, 18776707: 4, 18383955: 4, 17915568: 4, 17643054: 4, 17500658: 4, 17134872: 4, 16743927: 4, 16718965: 4, 16425864: 4, 15893222: 4, 14862924: 4, 13132795: 4, 12952218

In [None]:
# One frame per outcome (cases, deaths, recovered): drop the other two outcome
# columns, then keep only the rows where the remaining outcome is present.

df_cases = (
    df_raw
    .drop(columns=['TotalDeaths', 'TotalRecovered'])
    .dropna(subset=['TotalCases'])
)
df_deaths = (
    df_raw
    .drop(columns=['TotalCases', 'TotalRecovered'])
    .dropna(subset=['TotalDeaths'])
)
df_recovered = (
    df_raw
    .drop(columns=['TotalCases', 'TotalDeaths'])
    .dropna(subset=['TotalRecovered'])
)

In [None]:


# Missing values in the columns below are treated as 0.
# NOTE(review): `df` is not assigned in the cells shown here; confirm it is
# created before this cell runs on a fresh kernel.
zero_fill_columns = [
    'Migrants (net)',
    'TotalDeaths',
    'TotalRecovered',
    'Deaths/1M pop',
    'TotalTests',
    'Tests/ 1M pop',
]
# Assign the filled columns back. Calling fillna(inplace=True) on df[col]
# works on a temporary object and does not update df under pandas
# Copy-on-Write.
df[zero_fill_columns] = df[zero_fill_columns].fillna(0)


df.head()

In [None]:
# Baseline linear regression: predict TotalCases from every other column of df.

y = df['TotalCases']

# Replace missing median age / urban share with the column mean. The result is
# assigned back because fillna(inplace=True) on df[col] works on a temporary
# object and does not update df under pandas Copy-on-Write.
# NOTE(review): the means are computed on all rows before the train/test
# split, so test rows influence the imputed values.
df['Med. Age'] = df['Med. Age'].fillna(df['Med. Age'].mean())
df['Urban Pop %'] = df['Urban Pop %'].fillna(df['Urban Pop %'].mean())
df1 = df.drop(['TotalCases'], axis=1)
X = df1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42)

# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2; on newer versions this line raises TypeError.
lm_model = LinearRegression(normalize=True) # Instantiate
lm_model.fit(X_train, y_train) #Fit

y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)

# R-squared on the held-out rows and on the training rows.
rsquared_score = r2_score(y_test, y_test_preds)
length_y_test = len(y_test)
test_score = rsquared_score  # same quantity, kept under both names for later cells
train_score = r2_score(y_train, y_train_preds)


In [None]:
# Display the first rows of df as the cell output.
df.head()

In [None]:

# Share of the total population in df_raw that lives in countries whose
# TotalCases value is missing.
cases_missing = df_raw['TotalCases'].isnull()
df_temp = df_raw[cases_missing]
num_pop_nan = df_temp['Population (2020)'].sum()
total_popu = df_raw['Population (2020)'].sum()
'NAN values for this column represents the {} % of the total population of the world'.format(num_pop_nan*100/total_popu)



In [None]:
# Show the test-set R-squared together with the number of test rows.
"The r-squared score for your model was {} on {} values.".format(rsquared_score, length_y_test)


In [None]:
# Compare R-squared on the training rows with R-squared on the test rows.
print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_score, test_score))

In [None]:
# Histogram of each numeric column of df; the trailing ';' hides the text repr.
df.hist();

In [None]:
# Heatmap of the pairwise correlations between df's columns, with each cell
# annotated to two decimals.
sns.heatmap(df.corr(), annot= True, fmt='.2f');

In [None]:
# Display the dtype of every column of df.
df.dtypes


In [None]:
# Thresholds on the column sums of X: for each cutoff, only the columns whose
# sum exceeds it are kept when fitting.
cutoffs = [300,200,100, 50, 30, 25,10,5]

# NOTE(review): find_optimal_lm_mod is defined in a cell further down, so on a
# fresh kernel (Restart & Run All) this call raises NameError unless that cell
# has been run first. Consider moving the definition above this cell.
r2_scores_test, r2_scores_train, lm_model, X_train, X_test, y_train, y_test = find_optimal_lm_mod(X, y, cutoffs)

In [None]:

def _fit_lm_on_cutoff(X, y, column_totals, cutoff, test_size, random_state):
    '''
    Fit a linear model on the columns of X whose sum exceeds cutoff.

    INPUT
    X - pandas dataframe, X matrix
    y - response variable
    column_totals - pandas series, X.sum() (computed once by the caller)
    cutoff - number, a column is kept only when its sum is greater than this
    test_size, random_state - passed through to train_test_split

    OUTPUT
    lm_model, X_train, X_test, y_train, y_test
    '''
    kept_positions = np.flatnonzero((column_totals > cutoff).values)
    reduce_X = X.iloc[:, kept_positions]

    X_train, X_test, y_train, y_test = train_test_split(
        reduce_X, y, test_size=test_size, random_state=random_state)

    # NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
    # and removed in 1.2; on newer versions this line raises TypeError.
    lm_model = LinearRegression(normalize=True)
    lm_model.fit(X_train, y_train)
    return lm_model, X_train, X_test, y_train, y_test


def find_optimal_lm_mod(X, y, cutoffs, test_size = .30, random_state=42, plot=True):
    '''
    Fit one linear model per cutoff and return the model for the cutoff with
    the highest r2 score on the test data (the first such cutoff on ties).

    INPUT
    X - pandas dataframe, X matrix
    y - pandas dataframe, response variable
    cutoffs - iterable of numbers, cutoff for number of non-zero values in dummy categorical vars
              (a column of X is kept when its sum is greater than the cutoff)
    test_size - float between 0 and 1, default 0.3, determines the proportion of data as test data
    random_state - int, default 42, controls random state for train_test_split
    plot - boolean, default True, True to plot result

    OUTPUT
    r2_scores_test - list of floats of r2 scores on the test data
    r2_scores_train - list of floats of r2 scores on the train data
    lm_model - model object from sklearn
    X_train, X_test, y_train, y_test - output from sklearn train test split used for optimal model

    RAISES
    ValueError - if cutoffs is empty
    '''
    cutoffs = list(cutoffs)
    if not cutoffs:
        # Fail early with a clear message; otherwise max() over the empty
        # results would raise a less helpful ValueError after plotting.
        raise ValueError('cutoffs must contain at least one value')

    # The column sums do not depend on the cutoff, so compute them once.
    column_totals = X.sum()

    r2_scores_test, r2_scores_train, num_feats, results = [], [], [], dict()
    for cutoff in cutoffs:
        lm_model, X_train, X_test, y_train, y_test = _fit_lm_on_cutoff(
            X, y, column_totals, cutoff, test_size, random_state)
        num_feats.append(X_train.shape[1])

        test_r2 = r2_score(y_test, lm_model.predict(X_test))
        train_r2 = r2_score(y_train, lm_model.predict(X_train))

        r2_scores_test.append(test_r2)
        r2_scores_train.append(train_r2)
        # Key by the cutoff itself, not str(cutoff), so that non-integer
        # cutoffs do not break on an int() conversion afterwards.
        results[cutoff] = test_r2

    if plot:
        plt.plot(num_feats, r2_scores_test, label="Test", alpha=.5)
        plt.plot(num_feats, r2_scores_train, label="Train", alpha=.5)
        plt.xlabel('Number of Features')
        plt.ylabel('Rsquared')
        plt.title('Rsquared by Number of Features')
        plt.legend(loc=1)
        plt.show()

    best_cutoff = max(results, key=results.get)

    # Refit on the best cutoff so the returned model and splits match it.
    lm_model, X_train, X_test, y_train, y_test = _fit_lm_on_cutoff(
        X, y, column_totals, best_cutoff, test_size, random_state)

    return r2_scores_test, r2_scores_train, lm_model, X_train, X_test, y_train, y_test