# The One Goal For Today

Understand how normalizing the data first can lead to better or more efficient clustering and classification models.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import scipy

# Load and Look at Your Data

The data set we will be analyzing is the dataset of car logos from https://github.com/GeneralBlockchain/vehicle-logos-dataset. I converted each logo to greyscale and downscaled them to a consistent size. I also converted the dependent variable (manufacturer name) to an int; it is the last column.

First we load the data.

In [None]:
# these will be our columns
columns = ["price", "year", "manufacturer", "model", "condition", "fuel", "odometer", "title_status", "transmission"]
# this will contain our converters: for each loaded column, the sorted list of its unique string values
colValues = {}
# for each loaded column, a {string value: integer code} table; the code is the
# value's position in colValues[colIndex], so both structures always agree
colCodes = {}

# the file and the CSV columns we read; shared by both passes so they cannot drift apart
csv_path = 'data/vehicles.csv'
csv_usecols = (1, 2, 3, 4, 5, 7, 8, 9, 11)
# CSV columns that hold text and therefore need to be mapped to integer codes
categorical_csv_columns = (3, 4, 5, 7, 9, 11)

# first we load our data as strings so we can define the converters
data = np.genfromtxt(csv_path, delimiter=',', usecols=csv_usecols, skip_header=1, dtype=str, encoding='utf-8')

# make a list of the unique values in each column of our data
for colIndex in range(data.shape[1]):
    colValues[colIndex] = np.unique(data[:, colIndex]).tolist()
    colCodes[colIndex] = {value: code for code, value in enumerate(colValues[colIndex])}
    print(colIndex, colValues[colIndex])

# map values to their indices in the list of unique values
def converter(x, colIndex):
    """Return the integer code of string `x` in loaded column `colIndex`.

    The code is the position of `x` in `colValues[colIndex]`. The lookup goes
    through a dict instead of `list.index`, so it takes constant time per value
    rather than a scan over every unique value in the column.

    Raises:
        KeyError: if `colIndex` is not a loaded column.
        ValueError: if `x` is not one of the column's known values. This matches
            what `list.index` raised, and `np.genfromtxt` expects a ValueError
            from a converter that cannot handle a value.
    """
    codes = colCodes[colIndex]
    try:
        return codes[x]
    except KeyError:
        raise ValueError(repr(x) + " is not a known value of column " + str(colIndex)) from None

# genfromtxt keys converters by the column's index in the CSV file, while
# colValues/colCodes are keyed by the column's position in the loaded array;
# csv_usecols.index() translates one into the other.
converters = {
    csvCol: (lambda x, colIndex=csv_usecols.index(csvCol): converter(x, colIndex))
    for csvCol in categorical_csv_columns
}

# second pass: load again, this time turning every categorical string into its integer code
data = np.genfromtxt(csv_path, delimiter=',', usecols=csv_usecols, converters=converters, skip_header=1, dtype=int, encoding='utf-8')

Then we get summary statistics.

In [None]:
def getSummaryStatistics(data):
    """Print a legend line and return per-column min, max, mean and std.

    Each statistic is computed along axis 0 and becomes one row of the
    returned DataFrame, in the order min, max, mean, std.
    """
    print("min, max, mean, std per variable")
    reducers = ("min", "max", "mean", "std")
    rows = [getattr(data, reducer)(axis=0) for reducer in reducers]
    return pd.DataFrame(rows)

def getShapeType(data):
    """Print a legend line and return the pair (data.shape, data.dtype)."""
    print("shape")
    shape, dtype = data.shape, data.dtype
    return shape, dtype

# Run both reports on the loaded data, printing each result right after its legend line.
for report in (getSummaryStatistics, getShapeType):
    print(report(data))

# Split the data

If we are doing supervised machine learning, we split the data into train and test. 

In [None]:
# fixed seed so that Restart Kernel -> Run All always produces the same
# train/test split (and therefore the same accuracies further down)
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)

# shuffle the data (rows are reordered in place)
rng.shuffle(data)

# split the data into train (first 80% of the rows) and test (the remaining 20%);
# integer arithmetic avoids a round trip through floating point
split_at = len(data) * 8 // 10
(train, test) = np.split(data, [split_at])
print("train, test: ", train.shape, test.shape)

Strip off the dependent variable (the labels, the classes). Let's go with trying to predict the car's **drive train**. That's the last variable.

In [None]:
# the last column is the label (y); every column before it is a feature (x)
x_train, y_train = train[:, :-1], train[:, -1]
x_test, y_test = test[:, :-1], test[:, -1]

# Normalization Review

Here we implement max-min global, max-min local, z-score and center. This code comes from day 20.

This code you can use as a **tool**.

**If you are using separate training and test data, you want to normalize to the mean (min, max, std) of the _training data_.**

In [None]:
def normalize(data, min, max, mean, std, method='center'):
    """Normalize `data` using precomputed statistics.

    Args:
        data: array of values to normalize.
        min, max: minimum and maximum to scale with. Pass scalars for
            'max-min-global' or per-feature arrays for 'max-min-local';
            the arithmetic is the same and broadcasting does the rest.
        mean: mean(s) to subtract for 'center' and 'zscore'.
        std: standard deviation(s) to divide by for 'zscore'.
        method: one of 'center', 'max-min-global', 'max-min-local', 'zscore'.

    Returns:
        The normalized data. A feature whose range (max - min) or std is zero
        is divided by 1 instead, so it maps to 0 rather than to NaN/inf.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """
    def _safe_divisor(divisor):
        # A zero divisor means the feature is constant; dividing by 1 keeps
        # the (already zero) numerator instead of producing NaN or inf.
        divisor = np.asarray(divisor, dtype=float)
        return np.where(divisor == 0, 1.0, divisor)

    if method == 'center':
        return data - mean
    if method in ('max-min-global', 'max-min-local'):
        return (data - min) / _safe_divisor(max - min)
    if method == 'zscore':
        return (data - mean) / _safe_divisor(std)
    raise ValueError("I can't do " + method)

Let's try it!

**When you are doing supervised machine learning, you always want to normalize using statistics (mean, min, max) from your training data**.

In [None]:
# "global" statistics: one scalar over every value in the training features
min_g, max_g = np.min(x_train), np.max(x_train)
# "local" statistics: one value per feature (column) of the training features
min_l, max_l = np.min(x_train, axis=0), np.max(x_train, axis=0)
mean, std = np.mean(x_train, axis=0), np.std(x_train, axis=0)

normalized_train = normalize(x_train, min_l, max_l, mean, std, method='max-min-local')
# The other methods, for comparison (all operate on the features only, never on the labels):
# normalized_train = normalize(x_train, min_g, max_g, mean, std, method='center')
# normalized_train = normalize(x_train, min_g, max_g, mean, std, method='max-min-global')
# normalized_train = normalize(x_train, min_g, max_g, mean, std, method='zscore')

# kNN Review

The code below comes from day 24.

You can use this code as a **tool**.

In [None]:
# copy over euclidean distance from Friday
def distance(a, b):
    """Return the Euclidean distance between points `a` and `b`.

    Both inputs are converted to float arrays first. This lets plain lists be
    passed in, and it keeps integer data from overflowing when the differences
    are squared (a large raw value squared no longer fits in a 64-bit integer).
    """
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

# "fits" a model to the data
def fit_knn(data, labels, k):
    """Bundle the training data, its labels and k into a kNN "model".

    kNN has nothing to learn up front, so the model is simply the tuple
    (data, labels, k) that predict_one_knn unpacks later. Asserts that there
    is exactly one label per data point.
    """
    n_points, n_labels = len(data), len(labels)
    assert n_points == n_labels
    model = (data, labels, k)
    return model

# predict the label for one datapoint
def predict_one_knn(element, model):
    """Predict the label of `element` by majority vote of its k nearest neighbours.

    Args:
        element: the data point to classify.
        model: the tuple (training_data, labels, k) built by fit_knn.

    Returns:
        The most common label among the k training points closest to
        `element`. If several labels tie, the smallest one wins.

    Raises:
        ValueError: if k is not between 1 and the number of training points.
    """
    training_data, labels, k = model[0], model[1], model[2]
    if not 1 <= k <= len(training_data):
        raise ValueError("k must be between 1 and the number of training points")

    distances = np.array([distance(element, datapoint) for datapoint in training_data])
    # argpartition with kth=k-1 moves the indices of the k smallest distances
    # into the first k slots (in no particular order). Partitioning at k-1
    # rather than k also works when k equals the size of the training set.
    nearest = np.argpartition(distances, k - 1)[:k]
    neighbor_labels = [labels[index] for index in nearest]

    # np.unique returns the labels sorted, and argmax picks the first maximum,
    # so a tie in the vote goes to the smallest label
    vals, counts = np.unique(neighbor_labels, return_counts=True)
    return vals[np.argmax(counts)]

# predict the label for a set of data points
def predict_knn(data, model):
    """Return an array with one predicted label per data point in `data`."""
    predictions = []
    for datapoint in data:
        predictions.append(predict_one_knn(datapoint, model))
    return np.array(predictions)
    
# calculate accuracy given actual labels y and predicted labels yhat
def accuracy(y, yhat):
    """Return the fraction of predictions in `yhat` that equal the labels in `y`.

    Both inputs are converted to arrays so the comparison is elementwise even
    for plain lists. The result is the mean of the boolean match array, which
    is 0.0 when nothing matches.

    Raises:
        AssertionError: if `y` and `yhat` differ in length.
        ValueError: if there are no predictions to score.
    """
    assert len(y) == len(yhat)
    if len(y) == 0:
        raise ValueError("accuracy is undefined for zero predictions")
    matches = np.asarray(y) == np.asarray(yhat)
    return np.mean(matches)

# score a model using test data
def score(model, testing_data, test_labels):
    """Predict a label for every test point and return the accuracy against `test_labels`."""
    return accuracy(test_labels, predict_knn(testing_data, model))

# Impact of normalization on kNN

Fill in this table.
1. Try all the types of normalization plus kNN classification. Use a reasonable value for $k$ in kNN classification, like 5.
2. Try at least one type of normalization (centering!) plus PCA plus kNN classification. Use the same value of $k$ for kNN classification as you have so far. Pick a number of principal components that lets you keep at least 80% of the cumulative sum of variance.

| Normalization | PCA (None or k) | kNN k | Accuracy | Time |
| ------------- | --------------- | --------- | ---------------- | ---- |
| None | None | ?? | | |
| Centering | None | ?? | | |
| Max-min global | None | ?? | | |
| Max-min local | None | ?? | | |
| Z-score | None | ?? | | |
| ?? | ?? | ?? | | |




In [None]:
# number of neighbours passed to fit_knn in each kNN run below
k = 7

In [None]:
# "global" statistics: one scalar over every value in the training features
min_g, max_g = np.min(x_train), np.max(x_train)
# "local" statistics: one value per feature (column) of the training features
min_l, max_l = np.min(x_train, axis=0), np.max(x_train, axis=0)
mean, std = np.mean(x_train, axis=0), np.std(x_train, axis=0)

In [None]:
%time 

normalized_train = normalize(x_train, min_g, max_g, mean, std, method='center')
model = fit_knn(normalized_train, y_train, k)
normalized_test = normalize(x_test,  min_g, max_g, mean, std, method='center')
score(model, normalized_test, y_test)

In [None]:
%time 

normalized_train = normalize(x_train, min_g, max_g, mean, std, method='max-min-global')
model = fit_knn(normalized_train, y_train, k)
normalized_test = normalize(x_test,  min_g, max_g, mean, std, method='max-min-global')
score(model, normalized_test, y_test)

In [None]:
%time 

normalized_train = normalize(x_train, min_l, max_l, mean, std, method='max-min-local')
model = fit_knn(normalized_train, y_train, k)
normalized_test = normalize(x_test,  min_l, max_l, mean, std, method='max-min-local')
score(model, normalized_test, y_test)

In [None]:
%time 

normalized_train = normalize(x_train, min_g, max_g, mean, std, method='zscore')
model = fit_knn(normalized_train, y_train, k)
normalized_test = normalize(x_test,  min_g, max_g, mean, std, method='zscore')
score(model, normalized_test, y_test)

In [None]:
%time 

normalized_train = x_train
model = fit_knn(normalized_train, y_train, k)
normalized_test = x_test
score(model, normalized_test, y_test)

**Bonus**: Now think about PCA. If we had a dataset with 1000 independent variables (like our car logo data), what do you think might be the impact of PCA-first on accuracy, and on time?