# HOMEWORK: k-Nearest Neighbors

In [1]:
import os

import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 100)

from sklearn import preprocessing, neighbors, grid_search, cross_validation
from sklearn import model_selection

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')



In [2]:
df = pd.read_csv('../../DS-SF-32/lessons/lesson-8/dataset-boston.csv')

In [3]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BLACK,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


The Boston dataset concerns itself with housing values in suburbs of Boston.  A description of the dataset is as follows:

- CRIM: per capita crime rate by town
- ZN: proportion of residential land zoned for lots over 25,000 sqft
- INDUS: proportion of non-retail business acres per town
- CHAS: Charles River binary/dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX: nitric oxides concentration (parts per 10 million)
- RM: average number of rooms per dwelling
- AGE: proportion of owner-occupied units built prior to 1940
- DIS: weighted distances to five Boston employment centers
- RAD: index of accessibility to radial highways
- TAX: full-value property-tax rate (per ten thousands of dollars)
- PTRATIO: pupil-teacher ratio by town
- B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT: % lower status of the population
- MEDV: Median value of owner-occupied homes (in thousands of dollars)

## Question 1.  
+ Let's first categorize `MEDV` to 4 groups: Bottom 20% as Level 1, next 30% as Level 2, next 30% categorized as Level 3, and the top 20% as Level 4.  
+ Please create a new variable `MEDV_Category` that stores the level number
+ Remember the quantile function
+ Remember how to segment your pandas data frame

In [4]:
# TODO
col = 'MEDV'
lvl4_lowbound = df[col].quantile(0.8)  
lvl3_lowbound = df[col].quantile(0.5)  
lvl2_lowbound = df[col].quantile(0.2)

print lvl4_lowbound, lvl3_lowbound, lvl2_lowbound

28.2 21.2 15.3


In [5]:
medv_cat = 'MEDV_Category' 
df.loc[medv_cat] = 0

df.loc[df[col] > lvl4_lowbound, medv_cat] = 4
df.loc[df[col] <= lvl4_lowbound, medv_cat] = 3
df.loc[df[col] <= lvl3_lowbound, medv_cat] = 2
df.loc[df[col] <= lvl2_lowbound, medv_cat] = 1

df[[col, medv_cat]].head()


Unnamed: 0,MEDV,MEDV_Category
0,24.0,3.0
1,21.6,3.0
2,34.7,4.0
3,33.4,4.0
4,36.2,4.0


### Our goal is to predict `MEDV_Category` based on `RM`, `PTRATIO`, and `LSTAT`

## Question 2.  

+ First normalize `RM`, `PTRATIO`, and `LSTAT`.  
+ By normalizing, we mean to scale each variable between 0 and 1 with the lowest value as 0 and the highest value as 1

+ Check out the documentation for MinMaxScaler()

In [33]:
# TODO
from sklearn.preprocessing import MinMaxScaler

for col in ['RM', 'PTRATIO', 'LSTAT']:
    print col, df[col].min(), df[col].max()


RM 0.0 8.78
PTRATIO 0.0 22.0
LSTAT 0.0 37.97


In [35]:

scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
rm_minmax = scaler.fit_transform(df['RM'][:])    
# ptratio_minmax = scaler.fit_transform(df['PTRATIO'])
# lstat_minmax = scaler.fit_transform(df['LSTAT'])
rm_minmax.shape



(508,)

In [36]:
df2 = df.copy()
df2['RM_Normalized'] = 0
df2.columns

Index([u'CRIM', u'ZN', u'INDUS', u'CHAS', u'NOX', u'RM', u'AGE', u'DIS',
       u'RAD', u'TAX', u'PTRATIO', u'BLACK', u'LSTAT', u'MEDV',
       u'MEDV_Category', u'RM_Normalized'],
      dtype='object')

In [37]:
print df2['RM_Normalized'].shape
print rm_minmax.shape

# admit.loc[admit.admit == 1, 'gpa'] += 1
# print df.loc['RM_NORMALIZED'].shape
# rm_minmax
# print rm_minmax

# rm_reshape = (df['RM']).reshape(0,1)

(508,)
(508,)


In [42]:
df2.loc[df2['RM_Normalized'] == 0, 'RM_Normalized'] = rm_minmax

In [43]:
df2.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BLACK,LSTAT,MEDV,MEDV_Category,RM_Normalized
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,3.0,0.748861
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,3.0,0.731321
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,4.0,0.818337
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,4.0,0.797039
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,4.0,0.814009


## Question 3.  

+ Run a k-NN classifier with 5 nearest neighbors and report your misclassification error; set weights to uniform
+ Calculate your misclassification error on the training set

In [65]:
X_cols = ['RM_Normalized']

In [60]:
y=df['MEDV_Category'].values
# y.head()

In [62]:
y[:5]

array([ 3.,  3.,  4.,  4.,  4.])

In [63]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [66]:
trainX, testX, trainY, testY = train_test_split(df[X_cols], y, train_size=0.75)
print trainX.shape, testX.shape
print trainY.shape, testY.shape

(381, 1) (127, 1)
(381,) (127,)


In [72]:
knn = KNeighborsClassifier(n_neighbors=3, weights='uniform')

knn_model = knn.fit(df2[X_cols], y)
knn_predict = knn_model.predict(df2[X_cols])
knn_score = knn_model.score(df2[X_cols], y)
knn_score

0.69094488188976377

Answer: TODO

## Question 4. 
+ Is this error reliable? 
+ What could we do to make it better?

Answer: TODO

## Question 5.  
+ Now use 10-fold cross-validation to choose the most efficient `k`

In [None]:
# TODO (from lesson-7-and-8)
estimator = GridSearchCV(knn2, search_parameters, cv=5, verbose=1, n_jobs=4)

# Fit the data from our train_test_split
results = estimator.fit(trainX, trainY)

## Question 6.  

+ Explain your findings
+ What were your best parameters?
+ What was the best k?
+ What was the best model?

Answer: TODO

## Question 7.  

+ Train your model with the optimal `k` you found above 
+ (don't worry if it changes from time to time - if that is the case use the one that is usually the best)

In [None]:
# TODO

Answer: TODO

## Question 8.  

+ After training your model with that `k`, 
+ use it to *predict* the class of a neighborhood with `RM = 2`, `PRATIO = 19`, and `LSTAT = 3.5`
+ If you are confused, check out the sklearn documentation for KNN

In [None]:
# TODO

Answer: TODO