# The One Goal For Today

Understand how normalizing the data first can lead to better or more efficient clustering and classification models.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import scipy

# Load and Look at Your Data

The data set we will be analyzing is our usual car dataset from Craigslist. 

First we load the data.

In [None]:
# these will be our columns
# (nine names, presumably in the same order as the nine usecols entries below)
columns = ["price", "year", "manufacturer", "model", "condition", "fuel", "odometer", "title_status", "transmission"]
# maps each loaded column position (0..8) to the sorted list of unique string
# values seen in that column; the converters below look values up in these lists
colValues = {}

# first we load our data as strings so we can define the converters
# (pass 1 of 2: every selected column is read as text)
data = np.array(np.genfromtxt('data/vehicles.csv', delimiter=',', usecols=(1,2,3,4,5,7,8,9,11), skip_header=1, dtype=str, encoding='utf-8'))  

# make a list of the unique values in each column of our data
for colIndex in range(data.shape[1]):
    colValues[colIndex] = np.unique(data[:, colIndex]).tolist()
    print(colIndex, colValues[colIndex])

# map values to their indices in the list of unique values
def converter(x, colIndex):
    """Return the integer code of string ``x`` for loaded column ``colIndex``.

    The code is the position of ``x`` in ``colValues[colIndex]``; ``list.index``
    raises ``ValueError`` when ``x`` was not seen during the first pass.
    """
    return colValues[colIndex].index(x)
    
# pass 2 of 2: reload as integers. The converter keys (3, 4, 5, 7, 9, 11) are
# CSV column numbers taken from usecols; the second argument passed to
# converter() is that column's position in the loaded array (3 -> 2, 4 -> 3,
# 5 -> 4, 7 -> 5, 9 -> 7, 11 -> 8). CSV columns 1, 2 and 8 get no converter and
# are parsed directly as integers.
data = np.array(np.genfromtxt('data/vehicles.csv', delimiter=',', usecols=(1,2,3,4,5,7,8,9,11), converters={3: lambda x: converter(x, 2), 4: lambda x: converter(x, 3), 5: lambda x: converter(x, 4), 7: lambda x: converter(x,5), 9: lambda x: converter(x, 7), 11: lambda x: converter(x, 8)}, skip_header=1, dtype=int, encoding='utf-8'))  

Then we get summary statistics.

In [None]:
def getSummaryStatistics(data):
    """Print a header line and return per-column summary statistics.

    The returned DataFrame has four rows, in this order: column minimum,
    column maximum, column mean and column standard deviation.
    """
    print("min, max, mean, std per variable")
    minimum = data.min(axis=0)
    maximum = data.max(axis=0)
    average = data.mean(axis=0)
    spread = data.std(axis=0)
    return pd.DataFrame([minimum, maximum, average, spread])

def getShapeType(data):
    """Print a header line and return ``(shape, dtype)`` of the array."""
    print("shape")
    shape, dtype = data.shape, data.dtype
    return (shape, dtype)

# each helper prints its own header line first, then its return value is printed here
print(getSummaryStatistics(data))
print(getShapeType(data))

# Split the data

If we are doing supervised machine learning, we split the data into train and test. 

However, here we are doing clustering, so we don't.

In [None]:
# no train/test split: the whole data set is used for clustering
train = data

If we had a clear dependent variable (as we do with the car logo dataset) we'd strip it off. However, here we don't.

In [None]:
# Supervised variant, kept for reference: the last column would be the label
# and the remaining columns the features.
#y_train = train[:, -1]
#x_train = train[:, 0:-1]
#y_test = test[:, -1]
#x_test = test[:, 0:-1]
# clustering has no label column, so every column is a feature
x_train = train

# Normalization Review

Here we implement max-min global, max-min local, z-score and center. This code comes from day 20.

This code you can use as a **tool**.

**If you are using separate training and test data, you want to normalize to the mean (min, max, std) of the _training data_.**

In [None]:
def normalize(data, min, max, mean, std, method='center'):
    """Normalize ``data`` using precomputed statistics.

    Parameters
    ----------
    data : array to normalize.
    min, max : minimum and maximum used by the max-min methods. Pass scalars
        (statistics of the whole array) for a global rescale, or per-column
        arrays for a local rescale; the arithmetic is the same either way.
    mean : mean used by 'center' and 'zscore'.
    std : standard deviation used by 'zscore'.
    method : one of 'center', 'max-min-global', 'max-min-local', 'zscore'.

    Returns
    -------
    The normalized array. Where the range (``max - min``) or ``std`` is zero,
    the divisor is replaced by 1 so the result is the un-scaled numerator
    (0 when the statistics were computed from ``data`` itself) instead of
    NaN/inf.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    def safe_divide(numerator, denominator):
        # A constant column has zero range / zero std; dividing by it would
        # produce NaN or inf and poison every later distance computation.
        denominator = np.asarray(denominator)
        return numerator / np.where(denominator == 0, 1, denominator)

    if method == 'center':
        return data - mean
    # global and local rescaling differ only in the statistics the caller passes
    if method in ('max-min-global', 'max-min-local'):
        return safe_divide(data - min, max - min)
    if method == 'zscore':
        return safe_divide(data - mean, std)
    raise ValueError("I can't do " + method)

Let's try it!

**When you are doing supervised machine learning, you always want to normalize using statistics (mean, min, max) from your training data**.

In [None]:
# "_g" statistics are taken over the whole array (one scalar each);
# "_l" statistics, mean and std are taken per column (axis=0)
min_g = np.min(x_train)
max_g = np.max(x_train)
min_l = np.min(x_train, axis=0)
max_l = np.max(x_train, axis=0)
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)
# local rescaling needs the per-column min/max
normalized_train = normalize(x_train, min_l, max_l, mean, std, method='max-min-local')
# alternatives: uncomment one to try another method
# normalized_train = normalize(train, min_g, max_g, mean, std, method='center')
# normalized_train = normalize(train, min_g, max_g, mean, std, method='max-min-global')
# normalized_train = normalize(train, min_g, max_g, mean, std, method='zscore')

# K-means Clustering Review

The code below comes from day 22.

You can use this code as a **tool**.

In [None]:
# Euclidean distance between two vectors
def distance(a, b):
    """Return the square root of the dot product of ``a - b`` with itself."""
    delta = a - b
    squared_length = np.dot(delta.T, delta)
    return np.sqrt(squared_length)

# Distances from a single data point to every centroid, in centroid order
def get_distances(item, centroids):
    """Return a list holding ``distance(item, c)`` for each ``c`` in ``centroids``."""
    result = []
    for centroid in centroids:
        result.append(distance(item, centroid))
    return result

# Assign every data point to its nearest centroid
# input: list of data points, current list of centroids
def update_clusters(data, centroids):
    """Return ``(clusters, mappings)`` for the given centroids.

    ``clusters`` maps each centroid index to the list of data points assigned
    to it (an empty list when no point is assigned). ``mappings`` maps each
    data point's position in ``data`` to the index of its nearest centroid.
    """
    # every centroid starts with an empty member list
    clusters = {index: [] for index in range(len(centroids))}
    mappings = {}
    for position, point in enumerate(data):
        # argmin returns the first index on ties
        nearest = np.argmin(get_distances(point, centroids))
        clusters[nearest].append(point)
        mappings[position] = nearest
    return clusters, mappings

# Recompute each centroid from the points currently assigned to it
def update_centroids(clusters, oldcentroids):
    """Return the new list of centroids, one per entry of ``clusters``.

    A non-empty cluster yields the mean of its members; an empty cluster
    keeps its previous centroid from ``oldcentroids``.
    """
    return [
        np.mean(members, axis=0) if len(members) > 0 else oldcentroids[cluster_id]
        for cluster_id, members in clusters.items()
    ]

# Inertia: mean squared distance from each point to its assigned centroid
def inertia(data, centroids, clusters):
    """Return the summed squared point-to-centroid distance divided by ``len(data)``."""
    total = 0
    for cluster_id, members in clusters.items():
        for point in members:
            # squared distance between this point and its own centroid
            total += distance(point, centroids[cluster_id]) ** 2
    # average over the data
    return total / len(data)

def fit_kmeans(data, k, cutoff=1):
    """Cluster ``data`` into ``k`` groups with k-means.

    Parameters
    ----------
    data : indexable collection of data points (rows).
    k : number of clusters; the initial centroids are ``k`` distinct rows of
        ``data`` picked at random.
    cutoff : iteration stops once the inertia changes by less than this
        absolute amount between two consecutive rounds. It must be positive.
        Because the threshold is absolute, data rescaled to a small range may
        need a much smaller value than the default.

    Returns
    -------
    ``(centroids, clusters, mappings, inertia)`` from the final round.
    """
    # make some initial centroids
    seed_rows = np.random.choice(np.arange(len(data)), size=k, replace=False)
    centroids = np.array([data[x] for x in seed_rows])
    # None marks "no previous round yet". A numeric sentinel combined with a
    # "> 0" test never allows termination once the inertia is exactly 0
    # (e.g. every point coincides with its centroid).
    last_inertia = None
    while True:
        # get the clusters for these centroids
        clusters, mappings = update_clusters(data, centroids)
        # calculate the inertia for this clustering
        this_inertia = inertia(data, centroids, clusters)
        # stop when the inertia stops changing very much
        if last_inertia is not None and abs(last_inertia - this_inertia) < cutoff:
            break
        last_inertia = this_inertia
        # update the centroids
        centroids = update_centroids(clusters, centroids)
    return centroids, clusters, mappings, this_inertia

On Wednesday we talked about the Silhouette coefficient as a way to evaluate the goodness of a clustering. We will use the scikit-learn implementation today.

In [None]:
from sklearn.metrics import silhouette_score

#s = silhouette_score(data, mappings/labels, metric='euclidean')



# Impact of normalization on K-Means clustering

Fill in this table.
1. Try all the types of normalization plus k-means clustering. Use a reasonable value for $k$ in k-means clustering, like 6 (maybe it will cluster them by condition!).
2. Try at least one type of normalization (centering!) plus PCA plus k-means clustering. Use the same value of $k$ for k-means clustering as you have so far. Pick a number of principal components that lets you keep at least 80% of the cumulative sum of variance.

| Normalization | PCA (None or k) | K-means k | Silhouette score | Time |
| ------------- | --------------- | --------- | ---------------- | ---- |
| None | None | ?? | | |
| Centering | None | ?? | | |
| Max-min global | None | ?? | | |
| Max-min local | None | ?? | | |
| Z-score | None | ?? | | |
| ?? | ?? | ?? | | |




In [None]:
# number of clusters passed to fit_kmeans in every experiment below
k = 6

In [None]:
# whole-array ("global") extremes: one scalar each
min_g, max_g = x_train.min(), x_train.max()
# per-column ("local") extremes
min_l, max_l = x_train.min(axis=0), x_train.max(axis=0)
# per-column mean and standard deviation
mean, std = x_train.mean(axis=0), x_train.std(axis=0)

In [None]:
%time 

normalized_train = normalize(x_train, min_g, max_g, mean, std, method='center')
centroids, clusters, mappings, _ = fit_kmeans(normalized_train, k)
silhouette_score(normalized_train, [x[1] for x in sorted(mappings.items())], metric='euclidean')

In [None]:
%time 

normalized_train = normalize(x_train, min_g, max_g, mean, std, method='max-min-global')
centroids, clusters, mappings, _ = fit_kmeans(normalized_train, k)
silhouette_score(normalized_train, [x[1] for x in sorted(mappings.items())], metric='euclidean')

In [None]:
%time 

normalized_train = normalize(x_train, min_g, max_g, mean, std, method='max-min-local')
centroids, clusters, mappings, _ = fit_kmeans(normalized_train, k)
silhouette_score(normalized_train, [x[1] for x in sorted(mappings.items())], metric='euclidean')

In [None]:
%time 

normalized_train = normalize(x_train, min_g, max_g, mean, std, method='zscore')
centroids, clusters, mappings, _ = fit_kmeans(normalized_train, k)
silhouette_score(normalized_train, [x[1] for x in sorted(mappings.items())], metric='euclidean')

In [None]:
%time 

normalized_train = train
centroids, clusters, mappings, _ = fit_kmeans(normalized_train, k)
silhouette_score(normalized_train, [x[1] for x in sorted(mappings.items())], metric='euclidean')

**Bonus**: Now think about PCA. If we had a dataset with 1000 independent variables (like our car logo data), what do you think might be the impact of PCA-first on silhouette coefficient, and on time?