**Classification Warmup**

1. Use pydataset to load the voteincome dataset.

    - from pydataset import data

    - data('voteincome', show_doc=True)

    - data('voteincome')
    
2. Drop the state and year columns.

3. Split the data into train and test datasets. We will be predicting whether or not someone votes based on the remaining features.

4. Fit a k-neighbors classifier on the training data. Use 4 for your number of neighbors. How accurate is your model? How does it perform on the test data?

5. Try out these values for k: 1, 2, 3, and 4. Which gives the best accuracy? Which gives the best accuracy on the test data set?

6. View the classification report for your best model.

    - from sklearn.metrics import classification_report

    - print(classification_report(y, predictions))
    
7. Within our problem space, what does accuracy mean? Precision? Recall?

In [32]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# show_doc=True only prints the dataset documentation and returns None, so
# there is nothing useful to bind to df here; the data itself is loaded by the
# separate data('voteincome') call in the next cell.
data('voteincome', show_doc=True)

voteincome

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Sample Turnout and Demographic Data from the 2000 Current Population Survey

### Description

This data set contains turnout and demographic data from a sample of
respondents to the 2000 Current Population Survey (CPS). The states
represented are South Carolina and Arkansas. The data represent only a sample
and results from this example should not be used in publication.

### Usage

    data(voteincome)

### Format

A data frame containing 7 variables ("state", "year", "vote", "income",
"education", "age", "female") and 1500 observations.

`state`

a factor variable with levels equal to "AR" (Arkansas) and "SC" (South
Carolina)

`year`

an integer vector

`vote`

an integer vector taking on values "1" (Voted) and "0" (Did Not Vote)

`income`

an integer vector ranging from "4" (Less than \$5000) to "17" (Greater than
\$75000) denoting family income. See the CPS codebook for more info

In [3]:
df = data('voteincome')

In [4]:
df

Unnamed: 0,state,year,vote,income,education,age,female
1,AR,2000,1,9,2,73,0
2,AR,2000,1,11,2,24,0
3,AR,2000,0,12,2,24,1
4,AR,2000,1,16,4,40,0
5,AR,2000,1,10,4,85,1
6,AR,2000,1,12,3,78,1
7,AR,2000,0,14,4,31,0
8,AR,2000,1,10,1,75,0
9,AR,2000,1,17,2,54,0
10,AR,2000,1,8,1,78,0


In [5]:
df.shape

(1500, 7)

In [6]:
df.describe()

Unnamed: 0,year,vote,income,education,age,female
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,2000.0,0.855333,12.464,2.651333,49.261333,0.559333
std,0.0,0.351882,3.915643,1.021009,17.471134,0.496633
min,2000.0,0.0,4.0,1.0,18.0,0.0
25%,2000.0,1.0,9.0,2.0,36.0,0.0
50%,2000.0,1.0,13.0,3.0,49.0,1.0
75%,2000.0,1.0,16.0,4.0,62.0,1.0
max,2000.0,1.0,17.0,4.0,85.0,1.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1500 entries, 1 to 1500
Data columns (total 7 columns):
state        1500 non-null object
year         1500 non-null int64
vote         1500 non-null int64
income       1500 non-null int64
education    1500 non-null int64
age          1500 non-null int64
female       1500 non-null int64
dtypes: int64(6), object(1)
memory usage: 93.8+ KB


In [10]:
df.columns

Index(['state', 'year', 'vote', 'income', 'education', 'age', 'female'], dtype='object')

In [13]:
df.income.mean()

12.464

In [14]:
df.age.mean()

49.26133333333333

2. Drop the state and year columns.

In [15]:
df.drop(columns=(['state', 'year']), inplace=True)

In [16]:
df.head()

Unnamed: 0,vote,income,education,age,female
1,1,9,2,73,0
2,1,11,2,24,0
3,0,12,2,24,1
4,1,16,4,40,0
5,1,10,4,85,1


3. Split the data into train and test datasets. We will be predicting whether or not someone votes based on the remaining features.

In [24]:
X = df[['income', 'education', 'age', 'female']]
y = df[['vote']]

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7, random_state=123)

In [29]:
X_train.head()

Unnamed: 0,income,education,age,female
892,17,4,38,1
1386,15,3,47,1
1187,17,2,85,1
490,11,3,76,1
1237,12,2,19,0


In [30]:
y_train.head()

Unnamed: 0,vote
892,1
1386,1
1187,0
490,1
1237,0


4. Fit a k-neighbors classifier on the training data. Use 4 for your number of neighbors. How accurate is your model? How does it perform on the test data?

In [33]:
knn = KNeighborsClassifier(n_neighbors=4, weights='uniform')
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [35]:
y_pred = knn.predict(X_train)
y_pred

array([1, 1, 0, ..., 1, 1, 1])

In [37]:
y_pred_proba = knn.predict_proba(X_train)
y_pred_proba

array([[0. , 1. ],
       [0. , 1. ],
       [0.5, 0.5],
       ...,
       [0. , 1. ],
       [0. , 1. ],
       [0. , 1. ]])

In [38]:
knn.score(X_train, y_train)

0.8819047619047619

In [42]:
confusion_matrix(y_train, y_pred)

array([[ 99,  55],
       [ 69, 827]])

In [46]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.64      0.61       154
           1       0.94      0.92      0.93       896

    accuracy                           0.88      1050
   macro avg       0.76      0.78      0.77      1050
weighted avg       0.89      0.88      0.88      1050



In [51]:
y_pred_test = knn.predict(X_test)
y_pred_test

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [56]:
y_pred_proba_test = knn.predict_proba(X_test)
y_pred_proba_test

array([[0.  , 1.  ],
       [0.25, 0.75],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.75, 0.25],
       [0.25, 0.75],
       [0.25, 0.75],
       [0.25, 0.75],
       [0.25, 0.75],
       [0.  , 1.  ],
       [0.25, 0.75],
       [0.75, 0.25],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.5 , 0.5 ],
       [0.25, 0.75],
       [0.25, 0.75],
       [0.  , 1.  ],
       [0.5 , 0.5 ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.25, 0.75],
       [0.25, 0.75],
       [0.  , 1.  ],
       [0.75, 0.25],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.5 , 0.5 ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.25, 0.75],
       [0.25, 0.75],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.25, 0.75],
       [0.25, 0.75],
       [0.25, 0.75],
       [0.5 , 0.5 ],
       [0.5 ,

In [57]:
knn.score(X_test, y_test)

0.8133333333333334

In [61]:
confusion_matrix(y_test, y_pred_test)

array([[ 17,  46],
       [ 38, 349]])

In [62]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.31      0.27      0.29        63
           1       0.88      0.90      0.89       387

    accuracy                           0.81       450
   macro avg       0.60      0.59      0.59       450
weighted avg       0.80      0.81      0.81       450



In [98]:
from sklearn.metrics import accuracy_score

In [None]:
# accuracy_score needs the true labels and the predicted labels; calling it
# with no arguments raises TypeError. This gives the fraction of test rows
# whose predicted vote matches the actual vote.
accuracy_score(y_test, y_pred_test)

In [None]:
# n_neighbors = 4 is a good model, not overfitted, with a good score

5. Try out these values for k: 1, 2, 3, and 4. Which gives the best accuracy? Which gives the best accuracy on the test data set?

In [None]:
# Compare k = 1..4: fit a fresh classifier for each k and report its accuracy
# on both the training split and the held-out test split.
for n in [1, 2, 3, 4]:
    # Assignment, not subtraction: each iteration must bind a new model.
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    print("n", n)
    print("train accuracy:", knn.score(X_train, y_train))
    print("test accuracy:", knn.score(X_test, y_test))
    print()

In [63]:
knn = KNeighborsClassifier(n_neighbors= 1)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [64]:
y_pred = knn.predict(X_train)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [66]:
y_pred_proba = knn.predict_proba(X_train)
y_pred_proba

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [67]:
knn.score(X_train, y_train)

0.9780952380952381

In [68]:
confusion_matrix(y_train, y_pred)

array([[144,  10],
       [ 13, 883]])

In [69]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       154
           1       0.99      0.99      0.99       896

    accuracy                           0.98      1050
   macro avg       0.95      0.96      0.96      1050
weighted avg       0.98      0.98      0.98      1050



In [70]:
y_pred_test = knn.predict(X_test)
y_pred_test

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [73]:
y_pred_proba_test = knn.predict_proba(X_test)
y_pred_proba_test

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.

In [74]:
confusion_matrix(y_test, y_pred_test)

array([[ 25,  38],
       [ 34, 353]])

In [75]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.42      0.40      0.41        63
           1       0.90      0.91      0.91       387

    accuracy                           0.84       450
   macro avg       0.66      0.65      0.66       450
weighted avg       0.84      0.84      0.84       450



In [76]:
knn = KNeighborsClassifier(n_neighbors= 2)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [80]:
y_pred = knn.predict(X_train)
y_pred

array([1, 1, 0, ..., 1, 1, 1])

In [82]:
# NOTE(review): this calls predict(), not predict_proba(), so despite its name
# y_pred_proba holds class labels here (the array displayed below is the same
# as y_pred). predict_proba(X_train) was presumably intended -- confirm and
# re-run the cell.
y_pred_proba = knn.predict(X_train)
y_pred_proba

array([1, 1, 0, ..., 1, 1, 1])

In [83]:
# Score against the true labels. Comparing the model with its own predictions
# (y_pred) is trivially 1.0 and says nothing about accuracy.
knn.score(X_train, y_train)

0.9285714285714286

In [84]:
confusion_matrix(y_train, y_pred)

array([[153,   1],
       [ 74, 822]])

In [85]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.99      0.80       154
           1       1.00      0.92      0.96       896

    accuracy                           0.93      1050
   macro avg       0.84      0.96      0.88      1050
weighted avg       0.95      0.93      0.93      1050



In [87]:
y_pred_test = knn.predict(X_test)
y_pred_test

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [88]:
y_pred_proba_test = knn.predict_proba(X_test)
# NOTE(review): the line below echoes y_pred_proba (computed from X_train in an
# earlier cell), not the y_pred_proba_test assigned just above -- presumably a
# slip; confirm and re-run to see the test-set probabilities.
y_pred_proba

array([1, 1, 0, ..., 1, 1, 1])

In [95]:
# Score against the true test labels. Comparing the model with its own
# predictions (y_pred_test) is trivially 1.0 and says nothing about accuracy.
knn.score(X_test, y_test)

0.7866666666666666

In [96]:
confusion_matrix(y_test, y_pred_test)

array([[ 36,  27],
       [ 69, 318]])

In [97]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.34      0.57      0.43        63
           1       0.92      0.82      0.87       387

    accuracy                           0.79       450
   macro avg       0.63      0.70      0.65       450
weighted avg       0.84      0.79      0.81       450



In [None]:
knn = KNeighborsClassifier(n_neighbors= 3)

In [102]:
from sklearn import __version__

In [103]:
__version__

'0.21.2'

In [105]:
def test_score(n_neighbors):
    """Return the test-set accuracy of a k-nearest-neighbors classifier.

    A new ``KNeighborsClassifier`` is fit on the notebook-level ``X_train`` /
    ``y_train`` and scored on ``X_test`` / ``y_test``, so those names must
    already be defined when this is called.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbors (k) the classifier uses.

    Returns
    -------
    float
        Mean accuracy of the fitted model on the test split.
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)