In [None]:

#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## DAY2 MACHINE LEARNING IN PYTHON ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs



In [None]:
#=================================================-
#### Slide 4: Directory settings  ####

from pathlib import Path

# Resolve the current user's home directory; every other path hangs off it.
home_dir = Path.home()
# Course folder, expected at <home>/Desktop/skillsoft-intro-to-machine-learning-in-python.
main_dir = home_dir.joinpath("Desktop", "skillsoft-intro-to-machine-learning-in-python")
# Sub-folders of the course folder: the input data and a location for saved plots.
# Only the path objects are built here; nothing is created on disk.
data_dir, plot_dir = (main_dir.joinpath(sub) for sub in ("data", "plots"))



In [None]:
#=================================================-
#### Slide 5: Working directory  ####

# `os` is imported here because the notebook's main import cell comes after
# this one; without it a fresh-kernel "Run All" stops here with a NameError.
import os

# Set working directory.
os.chdir(data_dir)
# Check working directory.
print(os.getcwd())



In [None]:
#=================================================-
#### Slide 6: Loading packages  ####

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# New today - we will introduce it when we use it
import pickle
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import metrics



In [None]:
#=================================================-
#### Slide 23: Loading data into Python  ####

# Read the Costa Rica poverty CSV (resolved relative to the current working
# directory) and preview its first five rows.
household_poverty = pd.read_csv(filepath_or_buffer="costa_rica_poverty.csv")
print(household_poverty.head(n=5))



In [None]:
#=================================================-
#### Slide 25: Subsetting data  ####

# Keep only the household id, the two predictors and the label.
# The explicit `.copy()` makes `costa_knn` an independent frame, so the later
# assignments to its `Target` column do not raise pandas'
# SettingWithCopyWarning.
knn_columns = ["household_id", "rooms", "num_adults", "Target"]
costa_knn = household_poverty[knn_columns].copy()
print(costa_knn.head())



In [None]:
#=================================================-
#### Slide 27: The data at first glance  ####

# Print, in order: the first 5 rows, each column's dtype, and the number of
# rows per `Target` value.
for overview in (costa_knn.head(),
                 costa_knn.dtypes,
                 costa_knn['Target'].value_counts()):
    print(overview)



In [None]:
#=================================================-
#### Slide 28: Converting the target variable  ####

# Recode the label: codes <= 3 become 'vulnerable', everything else
# 'non_vulnerable'.
# The codes are read from the untouched `household_poverty` frame (aligned on
# `costa_knn`'s index) instead of from `costa_knn` itself, so re-running this
# cell reproduces the same labels rather than comparing already-recoded values
# against 3.
target_codes = household_poverty.loc[costa_knn.index, 'Target']
costa_knn['Target'] = np.where(target_codes <= 3, 'vulnerable', 'non_vulnerable')
print(costa_knn['Target'].head())



In [None]:
#=================================================-
#### Slide 29: Data prep: check for NAs  ####

# Count the missing values in every column of `costa_knn`.
na_per_column = costa_knn.isna().sum()
print(na_per_column)



In [None]:
#=================================================-
#### Slide 30: Data prep: numeric variables  ####

# List each column's dtype to see which variables are numeric.
column_dtypes = costa_knn.dtypes
print(column_dtypes)



In [None]:
#=================================================-
#### Slide 31: Data prep: ready for kNN  ####

# Encode the label as a boolean: True for "non_vulnerable", False otherwise.
print(costa_knn.Target.dtypes)
# Guard on the dtype: once the column is boolean, comparing it with the string
# "non_vulnerable" is False for every row, so running this cell a second time
# would silently relabel the whole column as False.
if not pd.api.types.is_bool_dtype(costa_knn["Target"]):
    costa_knn["Target"] = np.where(costa_knn["Target"] == "non_vulnerable", True, False)
# Check class again.
print(costa_knn.Target.dtypes)



In [None]:
#=================================================-
#### Slide 33: Data prep: scaling variables  ####

# Separate the predictors (X) from the label (y); only the numeric predictors
# are scaled, the label is left as is.
X = costa_knn.loc[:, ['rooms', 'num_adults']]
y = np.array(costa_knn.Target)

# Scale X with sklearn's `scale` and preview the first five scaled rows.
# NOTE(review): scaling runs before the train/test split further down, so the
# scaling statistics include rows that later land in the test set - confirm
# this is acceptable for the lesson.
X_scaled = scale(X)
print(X_scaled[:5])



In [None]:
#=================================================-
#### Slide 35: Exercise 1  ####





In [None]:
#=================================================-
#### Slide 41: Train & test: small scale before n-fold  ####

# Seed NumPy's global random generator before splitting.
np.random.seed(1)

# Hold out 30% of the rows as the test set; the remainder is the training set.
split_parts = train_test_split(X_scaled, y, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts




In [None]:
#=================================================-
#### Slide 44: kNN: build model  ####

# Number of neighbours that vote on each prediction.
k_neighbors = 5
# Instantiate the kNN classifier with that k.
knn = KNeighborsClassifier(n_neighbors=k_neighbors)
# Train it on the training split.
knn.fit(X=X_train, y=y_train)



In [None]:
#=================================================-
#### Slide 45: kNN: predict on test  ####

# Classify the held-out test rows and preview the first five predictions.
predictions = knn.predict(X=X_test)
print(predictions[:5])



In [None]:
#=================================================-
#### Slide 46: kNN: predict on test  ####

# Pair each true test label (first column) with its prediction (second column)
# and preview the first five pairs.
actual_v_predicted = np.column_stack([y_test, predictions])
print(actual_v_predicted[:5])



In [None]:
#=================================================-
#### Slide 58: Confusion matrix in Python  ####

# Confusion matrix of true vs. predicted test labels for the k = 5 model.
cm_knn5 = confusion_matrix(y_true=y_test, y_pred=predictions)
print(cm_knn5)
# Overall test accuracy, rounded to 4 decimal places.
knn5_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(round(knn5_accuracy, 4))



In [None]:
#=================================================-
#### Slide 59: Confusion matrix: visualize  ####

# Draw the confusion matrix as a heat map: rows are true labels, columns are
# predicted labels.
plt.imshow(cm_knn5, interpolation = 'nearest', cmap = plt.cm.Wistia)
classNames = ['Negative', 'Positive']
plt.title('Confusion Matrix - Test Data')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation = 45)
plt.yticks(tick_marks, classNames)
# Caption for each cell, laid out like the matrix itself.
s = [['TN', 'FP'], ['FN', 'TP']]
# The loops are nested and the text call sits inside both; written flat, the
# cell does not parse. `plt.text` takes (x, y), hence (j, i) for row i, col j.
for i in range(2):
    for j in range(2):
        # Centre each caption on its cell so it does not spill into the
        # neighbouring one.
        plt.text(j, i, str(s[i][j]) + " = " + str(cm_knn5[i][j]),
                 ha = 'center', va = 'center')
plt.show()



In [None]:
#=================================================-
#### Slide 60: Evaluation of kNN with 5 neighbors  ####

# One-row summary of the k = 5 kNN model: metric name, its value on the test
# split (accuracy rounded to 4 decimals) and a model tag.
model_final_dict = dict(
    metrics=["accuracy"],
    values=[round(accuracy_score(y_test, predictions), 4)],
    model=['knn_5'],
)
# Turn the dictionary into a DataFrame (one column per key) and show it.
model_final = pd.DataFrame(model_final_dict)
print(model_final)



In [None]:
#=================================================-
#### Slide 62: Saving the accuracy into a pickle file  ####

# Persist the accuracy table to "model_final.sav" (relative to the current
# working directory). The `with` block closes the file as soon as the dump
# finishes, so the bytes are flushed to disk and no handle is left open.
with open("model_final.sav", "wb") as model_file:
    pickle.dump(model_final, model_file)



In [None]:
#=================================================-
#### Slide 66: Plot ROC and calculate AUC  ####

# Score every test row with the predicted probability of the positive class
# (second column of `predict_proba`, i.e. `knn.classes_[1]`). Feeding the hard
# class predictions to `roc_curve` instead leaves only one usable threshold,
# so the "curve" is a single point joined to the corners and the AUC is
# misleading.
positive_scores = knn.predict_proba(X_test)[:, 1]
# Store FPR, TPR, and threshold as variables.
fpr, tpr, threshold = metrics.roc_curve(y_test, positive_scores)
# Store the AUC.
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line where TPR equals FPR.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

