# K-Nearest Neighbors

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
%matplotlib inline

In [2]:
# read in data 
df = pd.read_csv('clean_data/2019-fec-contr-census.csv', index_col=0)

In [3]:
# set target column 
y = df.target

In [20]:
# set predictor columns
df.contbr_zip = df.contbr_zip.astype(str)
X_feats = ['contbr_zip',
           'converted_date',
           'contb_receipt_amt']
X = pd.get_dummies(df[X_feats],
                   drop_first=True)
print(X.shape)
X.head()

(11502, 23)


Unnamed: 0,converted_date,contb_receipt_amt,contbr_zip_20002,contbr_zip_20003,contbr_zip_20004,contbr_zip_20005,contbr_zip_20006,contbr_zip_20007,contbr_zip_20008,contbr_zip_20009,...,contbr_zip_20015,contbr_zip_20016,contbr_zip_20017,contbr_zip_20018,contbr_zip_20019,contbr_zip_20020,contbr_zip_20024,contbr_zip_20032,contbr_zip_20036,contbr_zip_20037
0,201906,100.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,201906,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,201906,27.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,201903,3.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,201906,27.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# split data into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=0, 
                                                    stratify=y, 
                                                    test_size=0.2
                                                   )

In [22]:
# scale train data 
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_train.shape

(9201, 23)

In [23]:
knn = KNeighborsClassifier(n_neighbors = 3, 
                           weights = 'uniform',
                           algorithm = 'auto',
                           leaf_size = 30, 
                           p = 2, 
                           metric = 'minkowski')

In [25]:
# fit knn to scaled train data
knn.fit(scaled_X_train, y_train)
y_hat_train = knn.predict(scaled_X_train)
score = accuracy_score(y_train, y_hat_train)

# print KNN train score
print('KNN:', score)

KNN: 0.597543745245082


In [26]:
# scale all X data for cross-validation 
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X.shape

(11502, 23)

In [27]:
# set param grid for GridSearchCV
param_grid = {'n_neighbors': [20, 40, 60],
              'weights': ['distance'],
              'algorithm': ['auto'], 
              'leaf_size': [3, 5],
              'p':[1]
             }

# call GridSearchCV with knn estimator 
clf = GridSearchCV(
    knn, 
    param_grid, 
    n_jobs=-1,
    scoring='accuracy',
    return_train_score=True,
    verbose=1,
    cv=3
    )

# fit model to scaled data 
clf.fit(scaled_X, y)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   47.5s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=3, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'algorithm': ['auto'], 'leaf_size': [3, 5],
                         'n_neighbors': [20, 40, 60], 'p': [1],
                         'weights': ['distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=1)

In [28]:
# print best params and score
print(clf.best_params_)
clf.best_score_

{'algorithm': 'auto', 'leaf_size': 5, 'n_neighbors': 20, 'p': 1, 'weights': 'distance'}


0.4194053208137715

In [29]:
# run split test data using knn and best params
knn = KNeighborsClassifier(n_neighbors = 20, 
                           weights = 'distance',
                           algorithm = 'auto',
                           leaf_size = 5, 
                           p = 1, 
                           metric = 'minkowski')

scaled_X_test = scaler.transform(X_test)
knn.fit(scaled_X_train, y_train)
y_hat_test = knn.predict(scaled_X_test)
score = accuracy_score(y_test, y_hat_test)

# print test score 
print('KNN:', score)

KNN: 0.46631899174272057
