In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
sb.set()


In [None]:
# Load the feature table and the target table; in both files the first CSV
# column is used as the row index.
cleaned_data, cases = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ("cleaned_data.csv", "cases.csv")
)

### K-Nearest Neighbours
The k-nearest neighbors (KNN) algorithm is a simple, easy-to-implement supervised machine learning algorithm that can be used to solve both classification and regression problems.

In [None]:
# Split the data first, then scale it.
# Fitting the scalers on the full dataset (as was done before) lets the mean and
# standard deviation of the test rows leak into the training data.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    cleaned_data, cases, test_size=0.2, random_state=101
)

# Fit the scalers on the training rows only, then reuse them on the test rows.
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train = sc_X.fit_transform(X_train_raw)
X_test = sc_X.transform(X_test_raw)
y_train = sc_y.fit_transform(y_train_raw)
y_test = sc_y.transform(y_test_raw)

# Scaled versions of the complete data, kept under their original names for
# any code that still refers to X / y.
X = sc_X.transform(cleaned_data)
y = sc_y.transform(cases)

# check the sample sizes
print("Train Set :", X_train.shape, y_train.shape)
print("Test Set  :", X_test.shape, y_test.shape)

We first evaluate a baseline KNN model (k = 3) using 10-fold cross-validation (`KFold`).

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline model with an arbitrarily chosen number of neighbours.
baseline_k = 3
knn_model = KNeighborsRegressor(n_neighbors=baseline_k)

In [None]:
from sklearn.model_selection import KFold, cross_val_score
import sklearn.metrics as metrics

# Shuffled 10-fold splitter with a fixed seed; it is reused by the later
# cross-validation cells.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# cross_val_score reports *negative* MSE, so the sign is flipped when printing.
scores = cross_val_score(
    knn_model,
    X_train,
    np.ravel(y_train),
    scoring="neg_mean_squared_error",
    cv=cv,
    n_jobs=1,
)

print("Mean of MSE = ", -scores.mean())

We then tune KNN by searching for the best-performing value of K with `GridSearchCV`.

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes: k = 1 .. 49.
parameters = {"n_neighbors": range(1, 50)}

# Select k by the same metric the notebook reports (negative MSE) instead of
# the estimator's default score, and pass a 1-D target like every other fit
# in this notebook.
gridsearch = GridSearchCV(
    KNeighborsRegressor(),
    parameters,
    scoring="neg_mean_squared_error",
)
gridsearch.fit(X_train, y_train.ravel())

The best-performing value of K found by the grid search:

In [None]:
# Show the parameter setting selected by the grid search; only n_neighbors
# was varied in this search.
gridsearch.best_params_

In [None]:
# Cross-validate the grid search itself: the whole search is re-fitted on the
# training part of every outer fold.
scores1 = cross_val_score(
    gridsearch,
    X_train,
    np.ravel(y_train),
    scoring="neg_mean_squared_error",
    cv=cv,
    n_jobs=1,
)

# Scores are negative MSE values; negate the mean to report a plain MSE.
print("Mean of MSE = ", -np.mean(scores1))

In [None]:
# Search jointly over the neighbourhood size (k = 1 .. 49) and the weighting
# scheme used to combine neighbours.
parameters = {
    "n_neighbors": range(1, 50),
    "weights": ["uniform", "distance"],
}

# Select by the same metric the notebook reports (negative MSE) instead of the
# estimator's default score, and pass a 1-D target like every other fit in
# this notebook.
w_gridsearch = GridSearchCV(
    KNeighborsRegressor(),
    parameters,
    scoring="neg_mean_squared_error",
)
w_gridsearch.fit(X_train, y_train.ravel())

In [None]:
# Show the selected combination of n_neighbors and weights from the second
# grid search.
w_gridsearch.best_params_

In [None]:
# Cross-validate the (k, weights) grid search with the shared 10-fold splitter.
scores2 = cross_val_score(
    w_gridsearch,
    X_train,
    np.ravel(y_train),
    scoring="neg_mean_squared_error",
    cv=cv,
    n_jobs=1,
)

# The scorer yields negative MSE; flip the sign for reporting.
print("Mean of MSE = ", scores2.mean() * -1)

Further Improving on kNN in scikit-learn With Bagging

In [None]:
# Reuse the hyperparameters chosen by the (k, weights) grid search as the
# base learner for bagging.
tuned_params = w_gridsearch.best_params_
best_k, best_weights = tuned_params["n_neighbors"], tuned_params["weights"]
bagged_knn = KNeighborsRegressor(n_neighbors=best_k, weights=best_weights)

In [None]:
from sklearn.ensemble import BaggingRegressor

# Ensemble of 100 KNN regressors, each trained on a bootstrap resample.
# random_state is fixed so the resampling (and therefore the reported scores
# and predictions) is reproducible across runs, like the seeded split and KFold.
bagging_model = BaggingRegressor(bagged_knn, n_estimators=100, random_state=1)

In [None]:
# Cross-validate the bagged KNN ensemble with the shared 10-fold splitter.
scores3 = cross_val_score(
    bagging_model,
    X_train,
    np.ravel(y_train),
    scoring="neg_mean_squared_error",
    cv=cv,
    n_jobs=1,
)

# Negate the mean of the negative-MSE scores to report a plain MSE.
print("Mean of MSE = ", -1 * np.mean(scores3))

In [None]:
print("Comparison of MSE of the Four Models:")

# One line per model: label followed by the mean cross-validated MSE
# (the stored scores are negative MSE, hence the sign flip).
for model_label, fold_scores in (
    ("Arbitrary k: ", scores),
    ("GridSearchCV for k: ", scores1),
    ("GridSearchCV for k and weights: ", scores2),
    ("Bagging and GridSearchCV: ", scores3),
):
    print(model_label, np.mean(fold_scores) * -1)

We use kNN with bagging to predict the total cases.

In [None]:
# Train the bagged ensemble on the full training split and predict the
# (scaled) target for the held-out rows.
test_preds_grid = bagging_model.fit(X_train, y_train.ravel()).predict(X_test)

# Map the predictions back to the original target scale; the scaler expects a
# column vector.
cases_pred = sc_y.inverse_transform(np.reshape(test_preds_grid, (-1, 1)))

In [None]:
# NOTE: this cell overwrites y_test and cases_pred in place. Running it a
# second time would inverse-transform y_test twice, so re-run from the split
# cell if it needs to be executed again.

# Undo the target scaling so the actual values are back on the original scale.
y_test = sc_y.inverse_transform(y_test)

# Round each prediction to the nearest whole number.
cases_pred = np.rint(cases_pred)

df1 = pd.DataFrame(y_test, columns=["total_cases"])
df = pd.DataFrame(cases_pred, columns=["pred_total_cases"])

# Side-by-side table of the actual and the predicted total cases.
all_data = pd.concat((df1, df), axis="columns")
all_data.head()

In [None]:
# Mean absolute error between the actual and the predicted total cases on the
# held-out test rows.
test_mae = metrics.mean_absolute_error(y_test, cases_pred)
print("MAE of model on test data: ", test_mae)