In [None]:
# Data available for download https://archive.ics.uci.edu/ml/datasets/Congressional+Voting+Records

How well can k Nearest Neighbors predict the party of a given congressional voter?

In [71]:
# Import neccessary packages.
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [82]:
# Begin data cleaning.

# Create a column_names variable since they cannot be inferred from the file of interest.
columns_names = ['party', 'water', 'budget', 'physician', 'salvador', 'religious', 'satellite', 'aid',  'missile',
                 'immigration', 'synfuels', 'education', 'superfund', 'crime', 'duty_free', 'eaa_rsa']

# Pass column_names explicitly to names param and specify index_col is False.
df = pd.read_csv('datasets/house-votes-84.csv', index_col=False, header=None, names=columns_names)

df.head()

Unnamed: 0,party,water,budget,physician,salvador,religious,satellite,aid,missile,immigration,synfuels,education,superfund,crime,duty_free,eaa_rsa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y


In [83]:
# Notably we have missing values encoded as '?', and our votes need to be transformed from 'y' or 'n' to 1 or 0.

# Convert '?' to NaN
df[df == '?'] = np.nan

# Let's check to see how many missing values we have in each column.
print(df.isnull().sum())

party           0
water          12
budget         48
physician      11
salvador       11
religious      15
satellite      11
aid            14
missile        15
immigration    22
synfuels        7
education      21
superfund      31
crime          25
duty_free      17
eaa_rsa        28
dtype: int64


In [84]:
# Print shape of original DataFrame
print("The original DataFrame has shape: {}".format(df.shape))

# Drop missing values and print shape of new DataFrame
df = df.dropna()

# Print shape of new DataFrame
print("After dropping all rows with missing values the dataframe has shape: {}".format(df.shape))

The original DataFrame has shape: (435, 16)
After dropping all rows with missing values the dataframe has shape: (281, 16)


In [85]:
# Replace 'y' and 'n' with 1s and 0s.
df[df == 'y'] = 1
df[df == 'n'] = 0

df.head()

Unnamed: 0,party,water,budget,physician,salvador,religious,satellite,aid,missile,immigration,synfuels,education,superfund,crime,duty_free,eaa_rsa
1,republican,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0
5,democrat,0,1,1,0,1,1,0,0,0,0,0,0,1,1,1
8,republican,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0
19,democrat,1,1,1,0,0,0,1,1,1,0,1,0,0,0,1
23,democrat,1,1,1,0,0,0,1,1,1,0,0,0,0,0,1


In [86]:
# Create arrays for the features and the response variable
y = df['party'].values
X = df.drop('party', axis=1).values

# Create training and test splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a k-NN classifier with 6 neighbors
knn = KNeighborsClassifier(n_neighbors=6)

# Fit the classifier to the data
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

In [87]:
# Predict the labels for the training data X
y_pred = knn.predict(X_test)

# Print the models accuracy
print("kNN score: {}%".format(knn.score(X_test, y_test) * 100))

kNN score: 94.7368421053%


In [88]:
# print the model predictions
print(y_pred)

['democrat' 'republican' 'democrat' 'republican' 'democrat' 'democrat'
 'republican' 'democrat' 'democrat' 'democrat' 'republican' 'democrat'
 'democrat' 'republican' 'republican' 'republican' 'republican' 'democrat'
 'republican' 'democrat' 'democrat' 'democrat' 'republican' 'democrat'
 'democrat' 'republican' 'democrat' 'republican' 'democrat' 'democrat'
 'republican' 'democrat' 'republican' 'republican' 'republican'
 'republican' 'democrat' 'democrat' 'republican' 'democrat' 'republican'
 'democrat' 'democrat' 'republican' 'democrat' 'democrat' 'democrat'
 'republican' 'democrat' 'republican' 'democrat' 'democrat' 'democrat'
 'democrat' 'republican' 'republican' 'republican']
