In [76]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [77]:
# Loading data
file_path = Path("owid-covid-data.csv")
df = pd.read_csv(file_path)
df.head(5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [78]:
# Selecting specific columns using loc() method
selection = df.loc[:164946,['date','icu_patients_per_million','new_deaths_smoothed','new_vaccinations_smoothed_per_million', 'gdp_per_capita']]
selection.head()

Unnamed: 0,date,icu_patients_per_million,new_deaths_smoothed,new_vaccinations_smoothed_per_million,gdp_per_capita
0,2020-02-24,,,,1803.987
1,2020-02-25,,,,1803.987
2,2020-02-26,,,,1803.987
3,2020-02-27,,,,1803.987
4,2020-02-28,,,,1803.987


In [79]:
# looking at info on our selected columns
selection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164945 entries, 0 to 164944
Data columns (total 5 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   date                                   164945 non-null  object 
 1   icu_patients_per_million               23235 non-null   float64
 2   new_deaths_smoothed                    144142 non-null  float64
 3   new_vaccinations_smoothed_per_million  83406 non-null   float64
 4   gdp_per_capita                         137355 non-null  float64
dtypes: float64(4), object(1)
memory usage: 6.3+ MB


In [80]:
# checking for nulls
selection.isnull().sum()

date                                          0
icu_patients_per_million                 141710
new_deaths_smoothed                       20803
new_vaccinations_smoothed_per_million     81539
gdp_per_capita                            27590
dtype: int64

In [81]:
# dropping nulls
clean_selection = selection.dropna()

In [82]:
# double checking there are no nulls after drop
clean_selection.isnull().sum()

date                                     0
icu_patients_per_million                 0
new_deaths_smoothed                      0
new_vaccinations_smoothed_per_million    0
gdp_per_capita                           0
dtype: int64

In [83]:
# looking for duplicates
print(f'Duplicate entries: {clean_selection.duplicated().sum()}')

Duplicate entries: 0


In [84]:
# checking the dataframe as a whole
clean_selection.head()

Unnamed: 0,date,icu_patients_per_million,new_deaths_smoothed,new_vaccinations_smoothed_per_million,gdp_per_capita
2555,2021-01-30,0.74,3.857,1.0,13913.839
2557,2021-02-01,0.65,4.0,56.0,13913.839
2558,2021-02-02,0.516,3.857,63.0,13913.839
2561,2021-02-05,0.471,3.571,72.0,13913.839
2564,2021-02-08,0.56,3.429,84.0,13913.839


In [85]:
# checking info on rows and columns of new dataframe
clean_selection.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14230 entries, 2555 to 156684
Data columns (total 5 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   date                                   14230 non-null  object 
 1   icu_patients_per_million               14230 non-null  float64
 2   new_deaths_smoothed                    14230 non-null  float64
 3   new_vaccinations_smoothed_per_million  14230 non-null  float64
 4   gdp_per_capita                         14230 non-null  float64
dtypes: float64(4), object(1)
memory usage: 667.0+ KB


In [86]:
# saving the dataframe as a csv
output_file_path= "clean_selected_covid_data.csv"
clean_selection.to_csv(output_file_path, index=False)

In [87]:
# preparing the data to meet scikit-learn library requirements
X = clean_selection.new_vaccinations_smoothed_per_million.values.reshape(-1, 1)

In [88]:
clean_selection.drop("date", axis=1, inplace=True)
clean_selection.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,icu_patients_per_million,new_deaths_smoothed,new_vaccinations_smoothed_per_million,gdp_per_capita
2555,0.74,3.857,1.0,13913.839
2557,0.65,4.0,56.0,13913.839
2558,0.516,3.857,63.0,13913.839
2561,0.471,3.571,72.0,13913.839
2564,0.56,3.429,84.0,13913.839


In [89]:
#clean_selection.drop("location", axis=1, inplace=True)
#clean_selection.head()

In [90]:
#X.head(10)X = pd.get_dummies(clean_selection, columns=["new_vaccinations_smoothed_per_million", "icu_patients_per_million"])
#X.head(10)

In [91]:
#clean_scaled = StandardScaler().fit_transform(X)X = pd.get_dummies(clean_selection, columns=["new_vaccinations_smoothed_per_million", "icu_patients_per_million"])


In [92]:
#pca = PCA(n_components=3)
#clean_pca = pca.fit_transform(clean_scaled)

In [93]:
# assign target variable to y
#y = clean_selection.icu_patients_per_million

In [94]:
# Create an elbow curve to find the best value for K.
# YOUR CODE HERE
# Find the best value for K
#inertia = []
#k = list(range(1, 11))

# Calculate the inertia for the range of K values
#for i in k:
 #   km = KMeans(n_clusters=i, random_state=0)
  #  km.fit(clean_pca)
   # inertia.append(km.inertia_)

# Create the elbow curve
#elbow_data = {"k": k, "inertia": inertia}
#df_elbow = pd.DataFrame(elbow_data)
#df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [95]:
# Initialize the K-Means model.
# YOUR CODE HERE
#model = KMeans(n_clusters=4, random_state=0)
# Fit the model
# YOUR CODE HERE
#model.fit(clean_pca)
# Predict clusters
# YOUR CODE HERE
#predictions = model.predict(clean_pca)

# 3d plot
fig = px.scatter_3d(
    clean_selection,
    x="new_vaccinations_smoothed_per_million",
    y="icu_patients_per_million",
    z="new_deaths",
    color="new_deaths",
    symbol="new_deaths",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [96]:
X = pd.get_dummies(clean_selection, columns=["new_vaccinations_smoothed_per_million", "gdp_per_capita"])
X.head(10)

Unnamed: 0,icu_patients_per_million,new_deaths_smoothed,new_vaccinations_smoothed_per_million_0.0,new_vaccinations_smoothed_per_million_1.0,new_vaccinations_smoothed_per_million_2.0,new_vaccinations_smoothed_per_million_3.0,new_vaccinations_smoothed_per_million_5.0,new_vaccinations_smoothed_per_million_6.0,new_vaccinations_smoothed_per_million_7.0,new_vaccinations_smoothed_per_million_8.0,...,gdp_per_capita_45229.245,gdp_per_capita_45436.686,gdp_per_capita_46682.515,gdp_per_capita_46949.283,gdp_per_capita_48472.545,gdp_per_capita_54225.446,gdp_per_capita_57410.166,gdp_per_capita_67335.293,gdp_per_capita_85535.383,gdp_per_capita_94277.965
2555,0.74,3.857,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2557,0.65,4.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2558,0.516,3.857,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2561,0.471,3.571,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2564,0.56,3.429,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2566,0.56,3.714,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2568,0.493,3.286,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2569,0.448,3.714,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2570,0.426,3.571,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2575,0.403,3.143,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
X.shape

(14230, 7756)

In [104]:
# assign target variable to y
y = clean_selection.icu_patients_per_million

In [105]:
y.shape

(14230,)

In [107]:
y

2555       0.740
2557       0.650
2558       0.516
2561       0.471
2564       0.560
           ...  
156680    31.413
156681    30.584
156682    28.917
156683    28.265
156684    26.797
Name: icu_patients_per_million, Length: 14230, dtype: float64

In [108]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1
                                                )
X_train.shape

(10672, 7756)