In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model, neighbors
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns

# Problem 2

In [63]:
#read data into a dataframe
zip_train = pd.read_csv('zip_train.txt', header=None, delimiter = ' ')

#ad hoc changes
zip_train.drop([257], axis = 1, inplace = True)
zip_train.rename({0 : 'digit'}, axis = 1, inplace = True)

#we only care about the digits 2 and 3
zip_train = zip_train[(zip_train['digit'] == 2.0) | (zip_train['digit'] == 3.0)]

In [100]:
#import the test data
zip_test = pd.read_csv('zip_test.txt', header = None, delimiter = ' ')
#rename the digit column
zip_test.rename({0 : 'digit'}, axis = 1, inplace = True)
#filter out everything but 2 and 3
zip_test = zip_test[(zip_test['digit'] == 2.0) | (zip_test['digit'] == 3.0)]

In [103]:
#create a training x and y
train_x = zip_train.drop('digit', axis = 1)
train_y = zip_train['digit'].astype(str)

#create a testing x and y
test_x = zip_test.drop('digit', axis = 1)
test_y = zip_test['digit'].astype(str)

Linear Regression

In [65]:
#create a linear regression object
regr = linear_model.LinearRegression()

In [104]:
#train the model
regr.fit(train_x, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [68]:
#this function converts from the numerical prediction into a categorical digit prediction
def convert(num):
    if num > 2.5:
        return 3
    else:
        return 2

In [109]:
# make predictions based on the training set
train_pred = np.array([convert(y) for y in regr.predict(train_x)])
test_pred = np.array([convert(y) for y in regr.predict(test_x)])

In [112]:
# The mean squared error
print('Training Set Predictions: ')
print('\tMean squared error: %.2f'
      % mean_squared_error(train_pred, train_y))
# The coefficient of determination: 1 is perfect prediction
print('\tCoefficient of determination: %.2f'
      % r2_score(train_pred, train_y))

# The mean squared error
print('Test Set Predictions: ')
print('\tMean squared error: %.2f'
      % mean_squared_error(test_pred, test_y))
# The coefficient of determination: 1 is perfect prediction
print('\tCoefficient of determination: %.2f'
      % r2_score(test_pred, test_y))

Training Set Predictions: 
	Mean squared error: 0.01
	Coefficient of determination: 0.98
Test Set Predictions: 
	Mean squared error: 0.04
	Coefficient of determination: 0.83


In [108]:
# This plot needs a lot of touching up, but it roughly shows that this model is pretty accurate
fig = plt.figure(num = 7)
fig = sns.swarmplot(x = train_pred, y = train_y,
                hue = train_pred['dig'])
plt.xticks(np.arange(1.75, 3.25, 0.25))

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

<Figure size 432x288 with 0 Axes>

K - Nearest Neighbors

In [113]:
k_preds = {}
for num in [1, 3, 5, 7, 15]:
    klass =  neighbors.KNeighborsClassifier(n_neighbors= num)
    klass.fit(train_x, zip_train['digit'].astype(str))
    k_preds[num] = {}
    k_preds[num]['train'] = klass.predict(train_x)
    k_preds[num]['test'] = klass.predict(test_x)
    
    print('K = ', num)
    print('Training Set Predictions:')
    # The mean squared error
    print('\tMean squared error: %.2f'
          % mean_squared_error(k_preds[num]['train'], train_y))
    # The coefficient of determination: 1 is perfect prediction
    print('\tCoefficient of determination: %.2f'
          % r2_score(k_preds[num]['train'], train_y))
    print('Testing Set Predictions:')
    # The mean squared error
    print('\tMean squared error: %.2f'
          % mean_squared_error(k_preds[num]['test'], test_y))
    # The coefficient of determination: 1 is perfect prediction
    print('\tCoefficient of determination: %.2f'
          % r2_score(k_preds[num]['test'], test_y))
    print('\n')

K =  1
Training Set Predictions:
	Mean squared error: 0.00
	Coefficient of determination: 1.00
Testing Set Predictions:
	Mean squared error: 0.02
	Coefficient of determination: 0.90


K =  3
Training Set Predictions:
	Mean squared error: 0.01
	Coefficient of determination: 0.98
Testing Set Predictions:
	Mean squared error: 0.03
	Coefficient of determination: 0.88


K =  5
Training Set Predictions:
	Mean squared error: 0.01
	Coefficient of determination: 0.98
Testing Set Predictions:
	Mean squared error: 0.03
	Coefficient of determination: 0.88


K =  7
Training Set Predictions:
	Mean squared error: 0.01
	Coefficient of determination: 0.97
Testing Set Predictions:
	Mean squared error: 0.03
	Coefficient of determination: 0.87


K =  15
Training Set Predictions:
	Mean squared error: 0.01
	Coefficient of determination: 0.96
Testing Set Predictions:
	Mean squared error: 0.04
	Coefficient of determination: 0.85


