In [None]:
#MNIST data set for classifying numbers by pixels in an image
# Reminder that the data is in the form of a 28 by 28 matrix which represents an image containing a number
# The matrix is presented in vector form where each row corresponds to a number with 784 dimensions (28 x 28)

def get_data(limit=None, path='train.csv'):
    """Load the MNIST training CSV and return shuffled pixel rows and labels.

    The file is expected to hold the label in its first column and the pixel
    intensities in the remaining columns (784 of them for a 28 x 28 image).

    Args:
        limit: Optional maximum number of samples to return. ``None``
            (the default) returns every row.
        path: Location of the CSV file. Defaults to ``'train.csv'`` in the
            current working directory.

    Returns:
        A tuple ``(X, Y)``: ``X`` holds one row per sample with every pixel
        column divided by 255, and ``Y`` holds the matching labels taken
        from the first column. Rows are shuffled, with each label kept
        alongside its own pixel row.
    """
    df = pd.read_csv(path)

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its
    # replacement. copy=True hands back an array owned by this function, so
    # the in-place shuffle below is always allowed to write to it.
    data = df.to_numpy(copy=True)

    # Shuffle whole rows together so labels stay attached to their pixels.
    np.random.shuffle(data)

    # Separate the pixel data from the label associated with it. Dividing by
    # 255 rescales the pixels to [0, 1], assuming 8-bit intensities (0-255).
    X = data[:, 1:] / 255
    Y = data[:, 0]

    # If there's a limit on how much data we want to use
    if limit is not None:
        X, Y = X[:limit], Y[:limit]

    return X, Y

In [None]:
# Case where kNN fails: Alternating points (train accuracy of 0)

def get_data_failure():
    """Build an 8 x 8 checkerboard of points with alternating 0/1 labels.

    Returns:
        A tuple ``(X, Y)``: ``X`` is a (64, 2) float array of integer grid
        coordinates ``[i, j]`` listed with ``j`` varying fastest, and ``Y``
        is a (64,) float array holding ``(i + j) % 2``, so every horizontal
        and vertical grid neighbour carries the opposite label.
    """
    side = 8

    # Flat index n maps to grid position (n // side, n % side), i.e. the
    # first coordinate changes slowest and the second changes fastest.
    first, second = np.divmod(np.arange(side * side), side)

    # Grid coordinates, stored as floats
    X = np.column_stack((first, second)).astype(float)

    # Parity of the coordinate sum alternates along both rows and columns
    Y = ((first + second) % 2).astype(float)
    return X, Y

In [None]:
# Looking at the XOR Problem (data is not linearly separable but kNN still works)
# 00 -> 0, 11 -> 0, 01 -> 1, 10 -> 1

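# Generate four clusters of random points, one per quadrant of the unit square:
# quadrants where both coordinates fall in the same half are class 0, the other two are class 1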
def get_xor():
    """Generate 200 random 2-D points labelled in an XOR pattern.

    Returns:
        A tuple ``(X, Y)``: ``X`` is a (200, 2) float array and ``Y`` is a
        (200,) integer array. Rows 0-99 are class 0 (both coordinates in
        the upper half, then both in the lower half); rows 100-199 are
        class 1 (the two mixed quadrants).
    """
    per_quadrant = 50

    # Lower-left corner of each quadrant, in the order the rows are laid out:
    # (0.5-1, 0.5-1), (0-0.5, 0-0.5), (0-0.5, 0.5-1), (0.5-1, 0-0.5)
    corners = [(0.5, 0.5), (0.0, 0.0), (0.0, 0.5), (0.5, 0.0)]

    # Uniform draws lie in 0-1; halving gives 0-0.5, then the corner offset
    # shifts each batch into its own quadrant.
    clusters = [
        np.random.random((per_quadrant, 2)) / 2 + np.array([corner])
        for corner in corners
    ]
    X = np.vstack(clusters)

    # First two quadrants are class 0, last two are class 1
    Y = np.repeat([0, 1], 2 * per_quadrant)
    return X, Y


In [None]:
# Donut Problem (data still isn't linearly separable)
# Small circle inside a bigger circle

def get_donut():
    """Generate 200 points forming two noisy concentric rings.

    Returns:
        A tuple ``(X, Y)``: ``X`` is a (200, 2) float array of cartesian
        coordinates and ``Y`` is a (200,) integer array. The first 100 rows
        are the inner ring (mean radius 5, label 0); the last 100 rows are
        the outer ring (mean radius 10, label 1).
    """
    per_ring = 200 // 2

    def sample_ring(mean_radius):
        # Distance from the origin is the ring radius plus standard normal
        # noise; the angle is uniform over (0, 2pi).
        radii = mean_radius + np.random.randn(per_ring)
        angles = np.random.random(per_ring) * (2 * np.pi)

        # Polar to cartesian: x = r cos(theta), y = r sin(theta),
        # paired up as one (x, y) row per point.
        return np.column_stack((radii * np.cos(angles), radii * np.sin(angles)))

    # Draw the inner ring first, then the outer ring
    inner_points = sample_ring(5)
    outer_points = sample_ring(10)

    # Combine the data into single arrays; inners are 0, outers are 1
    X = np.concatenate([inner_points, outer_points])
    Y = np.repeat([0, 1], per_ring)
    return X, Y