## Import Python modules

In [2]:
import numpy as np # For scientific computing
import matplotlib.pyplot as plt # For generating plots

# Configure matplotlib so that every figure we create is rendered as an interactive plot,
# embedded directly in the output cell of this notebook
%matplotlib notebook

## Create a labeled training set containing points and their labels (colors)

In [3]:
# Training points: a numpy array with one two-dimensional point [x0, x1] per row
X_train = np.array([
    [1, 1],
    [2, 2.5],
    [3, 1.2],
    [5.5, 6.3],
    [6, 9],
    [7, 6],
])

# Training labels: a built-in Python list of color names, one per row of X_train
Y_train = ['red'] * 3 + ['blue'] * 3

In [4]:
# Index a single entry with [row, column]; both indices are zero-based
print(X_train[5,0]) # First coordinate (column 0) of the point at row index 5, i.e. the sixth and last training point
print(X_train[5,1]) # Second coordinate (column 1) of that same point

7.0
6.0


###  A slicing syntax that allows us to extract multiple elements in an array at once

In [5]:
# A bare ":" in the row position selects every row, so each line below prints one whole column of X_train
print(X_train[:, 0]) # First coordinate (column index 0) of every point in X_train
print(X_train[:, 1]) # Second coordinate (column index 1) of every point in X_train

[ 1.   2.   3.   5.5  6.   7. ]
[ 1.   2.5  1.2  6.3  9.   6. ]


### Plot the training set (points colored according to their labels)

In [6]:
# Scatter-plot the training points, coloring each one with its label
fig, ax = plt.subplots() # New figure holding a single set of axes
ax.scatter(X_train[:, 0], X_train[:, 1], s=170, color=Y_train) # Column 0 on the horizontal axis, column 1 on the vertical axis
ax.set_title('Training set') # Title and axis labels let the figure stand on its own
ax.set_xlabel('First coordinate')
ax.set_ylabel('Second coordinate')
plt.show() # Display plot

<IPython.core.display.Javascript object>

In [7]:
# Create a test point: the point whose label we want to predict
X_test = np.array([3,4])

# Plot the test point (green) together with the labeled training points
fig, ax = plt.subplots() # New figure holding a single set of axes
ax.scatter(X_train[:, 0], X_train[:, 1], s=170, color=Y_train) # Training points, colored by label
ax.scatter(X_test[0], X_test[1], s=170, color='green') # Test point
ax.set_title('Training set with test point (green)') # Title and axis labels let the figure stand on its own
ax.set_xlabel('First coordinate')
ax.set_ylabel('Second coordinate')
plt.show()

<IPython.core.display.Javascript object>

In [8]:
# Compute the Euclidean distance between two points x and y:
# subtract them coordinate by coordinate, square the differences, sum the squares, and take the square root
# x = [1, 1]
# y = [3, 4]
# x - y = [-2, -3]
# (x - y)**2 = [4, 9]
# np.sum((x - y)**2) = 13
# np.sqrt(np.sum((x - y)**2)) = sqrt(13) ≈ 3.606

In [9]:
# To run the Nearest Neighbor Classifier, we first need to define a distance function
def dist(x, y):
    """Return the Euclidean distance between the points x and y.

    Parameters
    ----------
    x, y : array_like
        Coordinates of the two points, given as numpy arrays, lists or
        tuples of real numbers with equal (or broadcastable) shapes.

    Returns
    -------
    numpy.float64
        sqrt(sum((x - y)**2)), where the sum runs over every element of
        the difference.
    """
    # Convert both inputs to float arrays first: this lets plain Python
    # lists/tuples be passed in and prevents unsigned-integer arrays from
    # wrapping around when subtracted.
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    # np.sqrt and np.sum are numpy functions that operate on numpy arrays
    return np.sqrt(np.sum(diff**2))

### Computing the distance of each point (in our training data) to the test point

In [15]:
# Fill an array with the distance from every training point to the test point
num = X_train.shape[0] # Number of training points (rows of X_train)
print(num)
distance = np.zeros(num) # Array of zeros with one slot per training point
print(distance)

for i, point in enumerate(X_train):
    print(i)
    distance[i] = dist(point, X_test) # Distance from the i-th training point to X_test

print(distance) # The distances, in the same order as the rows of X_train

6
[ 0.  0.  0.  0.  0.  0.]
0
1
2
3
4
5
[ 3.60555128  1.80277564  2.8         3.39705755  5.83095189  4.47213595]


In [11]:
# Alternatively, let numpy apply the distance formula to whole arrays at once ("vectorization")
offsets = X_train - X_test # X_test is subtracted from every row of X_train
squared_lengths = (offsets**2).sum(axis = 1) # Sum the squared differences within each row
distance = np.sqrt(squared_lengths) # One distance per training point
print(distance)

[ 3.60555128  1.80277564  2.8         3.39705755  5.83095189  4.47213595]


In [13]:
# The nearest neighbor is the training point with the smallest distance to the test point
min_index = distance.argmin() # Index of the smallest entry in distance
print(min_index)
print(Y_train[min_index]) # Label of that nearest training point

1
red
