In [None]:

#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## DAY3 MACHINE LEARNING IN PYTHON ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs



In [None]:
#=================================================-
#### Slide 5: Directory settings  ####

from pathlib import Path

# Name of the course folder, expected to sit on the user's Desktop.
course_folder = "skillsoft-intro-to-machine-learning-in-python"

# `home_dir` is the current user's home directory.
home_dir = Path.home()
# `main_dir` is the course folder inside the Desktop folder of `home_dir`.
main_dir = home_dir.joinpath("Desktop", course_folder)
# `data_dir` and `plot_dir` are the data and plots sub-folders of `main_dir`.
# Only the path objects are built here; nothing is created on disk.
data_dir = main_dir.joinpath("data")
plot_dir = main_dir.joinpath("plots")



In [None]:
#=================================================-
#### Slide 6: Working directory  ####

# `os` is imported in this cell because the package-loading cell comes later
# in the notebook; on a fresh kernel `os.chdir` would otherwise raise NameError.
import os

# Set working directory to `data_dir`.
os.chdir(data_dir)
# Check working directory.
print(os.getcwd())



In [None]:
#=================================================-
#### Slide 7: Loading packages  ####

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# New today - we will introduce it when we use it
import pickle
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import metrics

# Scikit-learn package for logistic regression.
from sklearn import linear_model



In [None]:
#=================================================-
#### Slide 8: Loading data into Python  ####

# Read the cleaned dataset; the file name is relative to the current
# working directory. Then preview the first five rows.
costa_knn_cleaned = pd.read_csv(filepath_or_buffer = "costa_knn_cleaned.csv")
print(costa_knn_cleaned.head(n = 5))



In [None]:
#=================================================-
#### Slide 9: Data prep: scaling variables  ####

# Predictors are the `rooms` and `num_adults` columns.
predictor_columns = ['rooms', 'num_adults']
X = costa_knn_cleaned.loc[:, predictor_columns]
# The `Target` column is the label; it is kept as-is (not scaled).
y = np.array(costa_knn_cleaned['Target'])

# Standardize the predictors and preview the first five scaled rows.
X_scaled = scale(X)
print(X_scaled[:5])



In [None]:
#=================================================-
#### Slide 10: Train & test: small scale before n-fold  ####

# Set the seed.
np.random.seed(1)

# Split into train (70%) and test (30%).
# `random_state` is passed explicitly so the split stays reproducible even if
# other code draws from the global NumPy generator before this call.
X_train, X_test, y_train, y_test = train_test_split(X_scaled,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 1)



In [None]:
#=================================================-
#### Slide 15: Cross-validation for optimal accuracy  ####

# Build a k-nearest-neighbors classifier with k = 5.
knn = KNeighborsClassifier(n_neighbors = 5)
# Score it with 5-fold cross-validation on the full scaled dataset.
cv_scores = cross_val_score(estimator = knn,
                            X = X_scaled,
                            y = y,
                            cv = 5)



In [None]:
#=================================================-
#### Slide 16: Cross-validation for optimal accuracy  ####

# Average the per-fold scores once and reuse the value below.
mean = np.mean(cv_scores)

# Show each fold's score, then the unrounded and rounded average.
print(cv_scores)
print("cv_scores mean:{}".format(mean))
print("Optimal cv score is:", round(mean, 4))



In [None]:
#=================================================-
#### Slide 17: Exercise 1  ####





In [None]:
#=================================================-
#### Slide 28: Finding optimal k - GridSearchCV  ####

# Candidate values of k to search: 1 through 30.
k_range = [*range(1, 31)]

# Parameter grid: a dictionary mapping each parameter name to the list of
# values to try. Here there is a single entry, for `n_neighbors`.
param_grid = {'n_neighbors': k_range}
print(param_grid)

# Instantiate the grid search around the existing `knn` estimator
# (created with n_neighbors = 5), using 10-fold CV scored by accuracy.
grid = GridSearchCV(estimator = knn,
                    param_grid = param_grid,
                    cv = 10,
                    scoring = 'accuracy')




In [None]:
#=================================================-
#### Slide 29: Finding optimal k - GridSearchCV  ####

# Run the search. It is fit on the full scaled dataset (`X_scaled`, `y`),
# not only on the training split.
grid.fit(X = X_scaled, y = y)



In [None]:
#=================================================-
#### Slide 30: Finding optimal k - view results  ####

# `cv_results_` is keyed by result name; pull out the mean test score
# recorded for each candidate k and print it.
mean_test_scores = grid.cv_results_['mean_test_score']
print(mean_test_scores)



In [None]:
#=================================================-
#### Slide 31: Finding optimal k  ####

# Copy the mean test scores out of grid.cv_results_ into a plain Python list.
grid_mean_scores = list(grid.cv_results_['mean_test_score'])
print(grid_mean_scores)



In [None]:
#=================================================-
#### Slide 32: Finding optimal k - plot  ####

# Draw mean cross-validated accuracy against k on the current axes.
axes = plt.gca()
axes.plot(k_range, grid_mean_scores)
axes.set_xlabel('Value of K for KNN')
axes.set_ylabel('Cross-Validated Accuracy')



In [None]:
#=================================================-
#### Slide 33: Define and examine the optimized model   ####

# Keep the best mean cross-validated score found by the search, then show it.
grid_score = grid.best_score_
print(grid_score)

# Parameter setting (k) that produced that score.
print(grid.best_params_)

# Model object built with those best parameters.
print(grid.best_estimator_)



In [None]:
#=================================================-
#### Slide 34: Add GridSearchCV score to the final scores  ####

# Load the table of model scores from `model_final.sav`.
# The `with` block closes the file handle as soon as the object is read.
# NOTE: only unpickle files you trust - pickle can execute code on load.
with open("model_final.sav", "rb") as model_file:
    model_final = pickle.load(model_file)

# Add the GridSearchCV accuracy as a new row.
# `DataFrame.append` was removed in pandas 2.0, so the row is built as a
# one-row DataFrame and attached with `pd.concat`.
grid_row = pd.DataFrame([{'metrics' : "accuracy",
                          'values' : round(grid_score, 4),
                          'model' : 'knn_GridSearchCV'}])
model_final = pd.concat([model_final, grid_row], ignore_index = True)
print(model_final)



In [None]:
#=================================================-
#### Slide 35: Optimal model and final thoughts  ####

knn_best = grid.best_estimator_

# Check accuracy of our model on the test data.
# The score is computed once and reused instead of scoring the test set twice.
# NOTE(review): `grid` was fit on the full `X_scaled`, which `X_test` was split
# from, so this is not a clean hold-out estimate - treat the number as optimistic.
knn_champ = knn_best.score(X_test, y_test)
print(knn_champ)



In [None]:
#=================================================-
#### Slide 36: Model champion dataframe  ####

# Add this final model champion to our dataframe.
# `DataFrame.append` was removed in pandas 2.0, so the row is built as a
# one-row DataFrame and attached with `pd.concat`.
champ_row = pd.DataFrame([{'metrics' : "accuracy",
                           'values' : round(knn_champ, 4),
                           'model' : 'knn_29'}])
model_final = pd.concat([model_final, champ_row], ignore_index = True)
print(model_final)

# Save the updated table back to `model_final.sav`.
# The `with` block guarantees the file is flushed and closed after writing.
with open("model_final.sav", "wb") as model_file:
    pickle.dump(model_final, model_file)

