# HOMEWORK: k-Nearest Neighbors

In [433]:
import os

import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 100)

from sklearn import preprocessing, neighbors, grid_search, cross_validation
from sklearn import metrics, model_selection

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [434]:
# Load the Boston housing table from the CSV stored next to this notebook.
df = pd.read_csv(filepath_or_buffer='dataset-boston.csv')

In [435]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BLACK,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [436]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BLACK,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [437]:
# Column dtypes and non-null counts: a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null int64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null int64
PTRATIO    506 non-null float64
BLACK      506 non-null float64
LSTAT      506 non-null float64
MEDV       506 non-null float64
dtypes: float64(11), int64(3)
memory usage: 55.4 KB


In [438]:
# Distribution of the target variable on its own.
df.MEDV.describe()

count    506.000000
mean      22.532806
std        9.197104
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
Name: MEDV, dtype: float64

The Boston dataset concerns itself with housing values in suburbs of Boston.  A description of the dataset is as follows:

- CRIM: per capita crime rate by town
- ZN: proportion of residential land zoned for lots over 25,000 sqft
- INDUS: proportion of non-retail business acres per town
- CHAS: Charles River binary/dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX: nitric oxides concentration (parts per 10 million)
- RM: average number of rooms per dwelling
- AGE: proportion of owner-occupied units built prior to 1940
- DIS: weighted distances to five Boston employment centers
- RAD: index of accessibility to radial highways
- TAX: full-value property-tax rate (per ten thousands of dollars)
- PTRATIO: pupil-teacher ratio by town
- BLACK (often labelled B): 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT: % lower status of the population
- MEDV: Median value of owner-occupied homes (in thousands of dollars)

## Question 1.  
+ Let's first categorize `MEDV` to 4 groups: Bottom 20% as Level 1, next 30% as Level 2, next 30% categorized as Level 3, and the top 20% as Level 4.  
+ Please create a new variable `MEDV_Category` that stores the level number
+ Remember the quantile function
+ Remember how to segment your pandas data frame

In [439]:
# Bin the continuous MEDV into four ordered levels, cut at its 20th, 50th and
# 80th percentiles: bottom 20% -> L1, next 30% -> L2, next 30% -> L3, top 20% -> L4.
medv = df['MEDV']
L1, L2, L3 = medv.quantile([0.20, 0.50, 0.80])

# Each interval is open on the left and closed on the right, so a value that
# sits exactly on a cut point falls into the lower level.
level_masks = [
    ("L4", medv > L3),
    ("L3", (medv > L2) & (medv <= L3)),
    ("L2", (medv > L1) & (medv <= L2)),
    ("L1", medv <= L1),
]
for level_label, level_mask in level_masks:
    df.loc[level_mask, 'MEDV_Category'] = level_label

### Our goal is to predict `MEDV_Category` based on `RM`, `PTRATIO`, and `LSTAT`

## Question 2.  

+ First normalize `RM`, `PTRATIO`, and `LSTAT`.  
+ By normalizing, we mean to scale each variable between 0 and 1 with the lowest value as 0 and the highest value as 1

+ Check out the documentation for MinMaxScaler()

In [441]:
# Question 2: rescale the three predictors to [0, 1] (column minimum -> 0, maximum -> 1).
#
# The scaler is fitted ONCE on the three-column frame so that it keeps a separate
# min/max for each of RM, PTRATIO and LSTAT. Fitting it column by column (e.g. via
# DataFrame.apply) refits the same object each time, leaving it with the statistics
# of the last column only, which makes any later min_max_scaler.transform() on new
# three-feature rows wrong.
#
# NOTE: the scaled values overwrite the raw columns in df. Re-running this cell
# without reloading df refits the scaler on already-scaled data (min 0, max 1),
# so reload the data first if this cell has to be executed again.
feature_columns = ['RM', 'PTRATIO', 'LSTAT']
min_max_scaler = preprocessing.MinMaxScaler()
df[feature_columns] = min_max_scaler.fit_transform(df[feature_columns])



In [442]:
# Confirm that RM, PTRATIO and LSTAT now hold rescaled values and that
# MEDV_Category is present.
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BLACK,LSTAT,MEDV,MEDV_Category
0,0.00632,18.0,2.31,0,0.538,0.577505,65.2,4.09,1,296,0.287234,396.9,0.08968,24.0,L3
1,0.02731,0.0,7.07,0,0.469,0.547998,78.9,4.9671,2,242,0.553191,396.9,0.20447,21.6,L3
2,0.02729,0.0,7.07,0,0.469,0.694386,61.1,4.9671,2,242,0.553191,392.83,0.063466,34.7,L4
3,0.03237,0.0,2.18,0,0.458,0.658555,45.8,6.0622,3,222,0.648936,394.63,0.033389,33.4,L4
4,0.06905,0.0,2.18,0,0.458,0.687105,54.2,6.0622,3,222,0.648936,396.9,0.099338,36.2,L4


In [443]:
# Class balance of the four MEDV levels.
df.MEDV_Category.value_counts()

L2    154
L3    149
L1    102
L4    101
Name: MEDV_Category, dtype: int64

In [444]:
# (rows, columns) of the frame after MEDV_Category has been added.
df.shape

(506, 15)

## Question 3.  

+ Run a k-NN classifier with 5 nearest neighbors and report your misclassification error; set weights to uniform
+ Calculate your misclassification error on the training set

In [383]:
# Feature matrix (the three predictor columns) and target (the MEDV level label).
X = df.loc[:, ['RM', 'PTRATIO', 'LSTAT']]
y = df['MEDV_Category']

In [447]:
# Question 3: k-NN classifier with k = 5 and uniform weights.
#
# knn.score() returns the mean accuracy, so the misclassification error asked
# for in the question is 1 - score. Both are printed for every split.
def report_scores(label, accuracy):
    """Print the accuracy and the matching misclassification error for one split.

    label    -- text identifying the split being reported
    accuracy -- mean accuracy as returned by knn.score()
    """
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print('%s accuracy = %s, misclassification error = %s'
          % (label, accuracy, 1 - accuracy))


knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')

# I. Fit and evaluate on the same (full) data set. This is an optimistic
#    estimate because every row was seen during fitting.
knn.fit(X, y)
report_scores('I. Training Set (100%):', knn.score(X, y))

# II. Hold out 40% of the rows: fit on the remaining 60%, then score both parts.
#     random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.4, random_state=0)
knn.fit(X_train, y_train)
report_scores('II. Training Set (60%):', knn.score(X_train, y_train))
report_scores('II. Test Set (40%):', knn.score(X_test, y_test))


I. Training Set (100%):  0.786561264822
II. Training Set (70%):  0.759075907591
II. Test Set (40%):  0.724137931034


## Question 4. 
+ Is this error reliable? 
+ What could we do to make it better?

The error is not very reliable because:
+ The misclassification rate changes with k (the number of nearest neighbors).
+ The parameters may not be optimized, e.g. weights (we can try 'distance') and metric (we can try different distance metrics).
+ The KNN model may perform differently on a test dataset: an error measured on the data used for fitting is optimistic, and a single train/test split depends on which rows end up in each part.

We can make it better:
+ Finding an optimal value of k
+ Tune the parameters
+ K fold cross validation

## Question 5.  
+ Now use 10-fold cross-validation to choose the most efficient `k`

In [448]:
# Question 5: choose k by 10-fold cross-validated accuracy.
#
# Uses sklearn.model_selection (already imported at the top of the notebook);
# the older sklearn.cross_validation / sklearn.grid_search modules are
# deprecated and were removed in scikit-learn 0.20.
k = list(range(2, 100))  # candidate neighbour counts: 2, 3, ..., 99
params = {'n_neighbors': k}
# Wider grids that were also tried (swap in for `params` above to reproduce):
#params = {'n_neighbors' : k ,'weights' : ['uniform', 'distance']}
#params = {'n_neighbors' : k , 'metric' : ['minkowski', 'manhattan', 'euclidean', 'chebyshev']}

# Ten consecutive, unshuffled folds over the rows of X.
# NOTE(review): if the CSV rows are ordered in a meaningful way, consider
# KFold(n_splits=10, shuffle=True, random_state=<seed>) - this changes the scores.
kf = model_selection.KFold(n_splits=10)
gs = model_selection.GridSearchCV(
    estimator=neighbors.KNeighborsClassifier(),
    param_grid=params,
    cv=kf,
)
gs.fit(X, y)

# Per-candidate results are available in gs.cv_results_ (e.g. 'mean_test_score').
print(gs.best_score_)
print(gs.best_params_)
print(gs.best_estimator_)

0.701581027668
{'n_neighbors': 23}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=23, p=2,
           weights='uniform')


In [417]:
# Disabled plot: cross-validation score versus k, one line per `weights` option.
# The [::2] / [1::2] slicing only makes sense when the grid search was run with the
# two-valued grid {'n_neighbors': k, 'weights': ['uniform', 'distance']}.
# NOTE(review): presumably s[1] is the mean validation score of each gs.grid_scores_
# entry and the entries alternate uniform/distance - confirm before re-enabling.
# gs.grid_scores_ exists only on the legacy sklearn.grid_search.GridSearchCV.
#line_up, = plt.plot(k,[s[1] for s in  gs.grid_scores_[1::2]], color='red', label='distance')
#line_down, = plt.plot(k,[s[1] for s in  gs.grid_scores_[::2]], color='blue', label = 'uniform')
#plt.legend(handles=[line_up, line_down])
#plt.show()

## Question 6.  

+ Explain your findings
+ What were your best parameters?
+ What was the best k?
+ What was the best model?

Answer: 
+ K-fold cross validation was run to choose the optimal value of k. I varied the number of neighbors from 2 to 99 and the optimal number of nearest neighbors came out to be ~23. Then, I also varied the metric parameter and the weights parameter using grid search. With weights = 'distance' I get a 100% accuracy score on the training data (i.e. all values on the diagonal of the confusion matrix); this happens because each training point is its own nearest neighbor at distance zero, so it indicates overfitting rather than a better model. Also, with metric = 'manhattan' the accuracy is slightly lower. So, I ended up choosing the metric as 'minkowski' and weights as 'uniform'. 
+ Best parameters: {'n_neighbors': 23, 'metric': 'minkowski', 'weights': 'uniform'}
+ Best k = 23
+ Best model: KNeighborsClassifier(n_neighbors=23, weights='uniform', metric = 'minkowski')


## Question 7.  

+ Train your model with the optimal `k` you found above 
+ (don't worry if it changes from time to time - if that is the case use the one that is usually the best)

In [449]:
# Question 7: refit on all rows with the k selected by cross-validation.
knn = neighbors.KNeighborsClassifier(n_neighbors=23, weights='uniform')
knn.fit(X, y)

# Accuracy on the same rows the model was fitted on, so this is an optimistic
# figure; the cross-validated score from the grid search is the fairer estimate.
# Single-argument print() behaves the same on Python 2 and Python 3.
print('Score %s' % (knn.score(X, y),))

# Rows are the true level, columns the predicted level, both in level_order.
# Passing labels explicitly pins the row/column order instead of relying on
# the default sorted order of the labels found in the data.
level_order = ['L1', 'L2', 'L3', 'L4']
print('Confusion Matrix :')
print(metrics.confusion_matrix(y, knn.predict(X), labels=level_order))

Score 0.743083003953
Confusion Matrix : [[ 84  18   0   0]
 [ 21 114  19   0]
 [  3  39  99   8]
 [  0   5  17  79]]


## Question 8.  

+ After training your model with that `k`, 
+ use it to *predict* the class of a neighborhood with `RM = 2`, `PTRATIO = 19`, and `LSTAT = 3.5`
+ If you are confused, check out the sklearn documentation for KNN

In [450]:
# Question 8: classify one new neighbourhood given in ORIGINAL (unscaled) units.
#
# The query has to be rescaled with the same per-feature min/max that was used
# for the training columns. Those statistics are recomputed here from the raw
# CSV: the feature columns in df have been overwritten with scaled values, and
# the fitted state of the notebook-level min_max_scaler depends on how (and how
# many times) the scaling cell was run, so it is not a reliable source for them.
feature_columns = ['RM', 'PTRATIO', 'LSTAT']
raw_features = pd.read_csv('dataset-boston.csv')[feature_columns]
query_scaler = preprocessing.MinMaxScaler().fit(raw_features)

X_newdata = [[2, 19, 3.5]]  # RM, PTRATIO, LSTAT - same order as feature_columns
# A value outside the range seen in the data maps outside [0, 1]; that is expected.
X_newdata_scaler = query_scaler.transform(X_newdata)
knn.predict(X_newdata_scaler)

array(['L2'], dtype=object)