In [None]:
from datascience import *
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

### N.B. You will have to have scikit-learn installed to run this minilab.

# Introduction to Clustering
[Scikit-learn](http://scikit-learn.org/stable/) is a python package that can help you to do more advanced predictive and exploratory analysis with data. Today we are going to learn about a [clustering method](http://scikit-learn.org/stable/modules/clustering.html#k-means) used for systematically grouping similar datapoints.

In [None]:
# Synthetic demo: draw Gaussian blobs, then compare the K-means partitions
# obtained with the right, a too-small and a too-large number of clusters.
n_samples = 1500
n_blobs = 7
random_state = 33

plt.figure(figsize=(12, 12))

X, y = make_blobs(n_samples=n_samples, centers=n_blobs, random_state=random_state)

# Top-left panel: the raw points, all drawn with the same colour value.
plt.subplot(221)
plt.scatter(X[:, 0], X[:, 1], c=np.ones(n_samples))
plt.title("Data")

# Remaining panels: (subplot position, clusters requested, panel title).
panel_specs = [
    (222, n_blobs, "Correct number of clusters"),
    (223, 3, "Wrong number of clusters: too few"),
    (224, 8, "Wrong number of clusters: too many"),
]
for subplot_pos, k_clusters, panel_title in panel_specs:
    # Points are coloured by the cluster label K-means assigns them.
    y_pred = KMeans(n_clusters=k_clusters, random_state=random_state).fit_predict(X)
    plt.subplot(subplot_pos)
    plt.scatter(X[:, 0], X[:, 1], c=y_pred)
    plt.title(panel_title)

# Clustering EV users with similar driving and charging patterns

## About the dataset
'EV_soc.csv' contains data on the state of charge (SOC), meaning the % battery remaining, for 1023 Electric Vehicles (EVs). The dataset has the SOC for each car for every 5-minute interval in the day. The `driver` column indicates the driver id, and the `day` column indicates the day of the week, where 1=Sunday. The ##_soc columns each correspond to a 5-minute interval of the day.

In this lab we will see how we can use clustering to identify drivers with similar driving and charging habits.


In [None]:
# Read the EV state-of-charge CSV into a table.
# NOTE(review): `Table` presumably comes from the `datascience` star import
# in the first cell -- confirm.
soc = Table.read_table('EV_soc.csv')
# Bare last expression so the notebook displays the loaded table.
soc

# Clustering EVs with similar Tuesday Charging Habits

In the following section I grabbed the SOC data where day=3 (Tuesday). The Scikit-learn K-means package does all of the heavy lifting for us, grouping together the drivers whose SOC profiles are most similar throughout the day. I found 5 clusters works well to identify unique driving/charging habits.

The rolling_window=6 parameter is used to compute the rolling average over a half hour timespan, rather than considering each 5 min interval independently. 



In [None]:
# Cluster the Tuesday (day == 3) state-of-charge profiles with K-means and
# draw one figure per cluster: every member profile faintly, plus the
# cluster's mean profile as a bold line.
n_clusters = 5
rolling_window = 6  # number of consecutive samples averaged together
soc_tuesday = soc.where('day', 3)

# One distinct colour per cluster, evenly spaced along the 'jet' colormap.
jet = plt.get_cmap('jet')
cNorm = colors.Normalize(vmin=0, vmax=n_clusters-1)
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)
colorVals = [scalarMap.to_rgba(i) for i in range(n_clusters)]

# Rows are table rows (one per driver/day), columns are the SOC samples.
X = soc_tuesday.drop(['driver', 'day']).to_df()
# Rolling mean along time for every row; the first rolling_window-1 samples
# have an incomplete window (NaN) and are dropped.
X_rolling = X.T.rolling(window=rolling_window).mean()[rolling_window-1::].T

# Seed K-means (reusing the notebook's `random_state` from the blob demo) so
# cluster ids -- and therefore colours and figure order -- are reproducible.
estimator = KMeans(n_clusters=n_clusters, random_state=random_state)
estimator.fit(X_rolling)

# Hour of day for each smoothed sample (12 samples per hour). Derived from
# the data width instead of a hard-coded column count, and computed once
# because it does not depend on the cluster.
x_ticks = np.arange(rolling_window-1, X.shape[1])/12.

for i in range(n_clusters):
    plt.figure()
    data = X_rolling[estimator.labels_ == i]
    n = len(data)
    # A 2-D y argument draws one line per column, i.e. one line per member
    # row, without re-transposing the frame for every single vehicle.
    plt.plot(x_ticks, data.values.T, color=colorVals[i], alpha=.05)
    # Explicit column-wise mean: np.mean(DataFrame) reduces over both axes
    # (returning a scalar) on pandas >= 2.0.
    plt.plot(x_ticks, data.mean(axis=0), color=colorVals[i], linewidth=2)

    plt.xlabel('Hour')
    plt.ylabel('State of charge')
    plt.ylim(0, 1.2)
    plt.title('State of charge per EV')
    textstr = 'N = %i, %.1f%% of vehicles' % (n, 100. * n / len(X_rolling))
    plt.text(7, 1.1, textstr, fontsize=10, verticalalignment='top')



# Overlaying the clusters and plotting the derivatives


In [None]:
def get_slope(y, x):
    """Return the finite-difference slope between consecutive samples.

    Parameters
    ----------
    y : array-like
        Sampled values.
    x : array-like
        Sample positions, same length as ``y``.

    Returns
    -------
    numpy.ndarray
        ``(y[k+1] - y[k]) / (x[k+1] - x[k])`` for each consecutive pair, so
        the result is one element shorter than the inputs along axis 0.
    """
    # Coerce to plain arrays first: subtracting two slices of a pandas Series
    # aligns on index labels rather than position (giving NaN/0 instead of
    # differences), and plain lists do not support element-wise subtraction.
    y = np.asarray(y)
    x = np.asarray(x)
    dys = y[1:] - y[:-1]
    dxs = x[1:] - x[:-1]
    return dys / dxs


In [None]:
# Figure 1: overlay a sample of individual profiles (coloured by cluster)
# with each cluster's mean profile drawn on top.
plt.figure(figsize=(8, 8))

# Draw at most 100 individual profiles; fewer if the data set is smaller.
n_shown = min(100, len(X_rolling))
for i in range(n_shown):
    # Positional row access matches estimator.labels_, which is positional.
    plt.plot(x_ticks, X_rolling.iloc[i], color=colorVals[estimator.labels_[i]], alpha=.1)

# Column-wise mean profile per cluster, computed once and reused by both
# figures. Explicit axis=0: np.mean(DataFrame) reduces over both axes
# (returning a scalar) on pandas >= 2.0.
cluster_means = [X_rolling[estimator.labels_ == i].mean(axis=0)
                 for i in range(n_clusters)]

for i in range(n_clusters):
    plt.plot(x_ticks, cluster_means[i], color=colorVals[i], linewidth=2)

plt.xlabel('Hour')
plt.ylabel('State of charge')
plt.title('State of charge per EV')


# Figure 2: slope of each cluster's mean profile between consecutive samples.
plt.figure(figsize=(8, 8))
for i in range(n_clusters):
    dy_dx = get_slope(np.array(cluster_means[i]), x_ticks)
    # The slope has one fewer point than the profile it is derived from.
    plt.plot(x_ticks[:-1], dy_dx, color=colorVals[i], linewidth=2)

# Grey zero line as a visual reference for the sign of the slope.
plt.plot(x_ticks[:-1], np.zeros(len(x_ticks[:-1])), color='grey')
plt.xlabel('Hour')
plt.ylabel('Change in state of charge')
plt.title('Change in state of charge per EV')


## Exercise 
My initial thought was that there would maybe be 2 unique charging patterns, one for commuters, and another for families who use EVs as a 2nd vehicle, or non-commuter vehicle. 

**Task 1 -** In the above code blocks, adjust the number of clusters to 2 and describe the trends in the two clusters. 

**Task 2 -** Now adjust the number of clusters to 10. Do you see multiple clusters that show very similar SOC patterns? If so then these can probably be combined, and we can reduce the number of clusters.

In [None]:
# Your answers here:



# Clustering behavior for the whole workweek
In the previous section we clustered EV data for a single work day. Now we will look to cluster similar driving and charging behavior for the whole workweek. Each row in 'workweek_soc.csv' contains EV SOC data for the entire workweek rather than a single day. 

This time I found that 7 clusters (the `n_clusters` value in the code below) seemed to capture the unique charging behavior well. Run the code below to see the workweek clustering results.

In [None]:
# Cluster whole-workweek state-of-charge profiles with K-means and draw one
# figure per cluster: every member profile faintly, plus the cluster's mean
# profile as a bold line.
week_soc = Table.read_table('workweek_soc.csv')
week_df = week_soc.drop('driver').to_df()

n_clusters = 7
rolling_window = 6  # number of consecutive samples averaged together

# One distinct colour per cluster, evenly spaced along the 'jet' colormap.
jet = plt.get_cmap('jet')
cNorm = colors.Normalize(vmin=0, vmax=n_clusters-1)
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)
colorVals = [scalarMap.to_rgba(i) for i in range(n_clusters)]

# Rolling mean along time for every row; the first rolling_window-1 samples
# have an incomplete window (NaN) and are dropped.
X_rolling = week_df.T.rolling(window=rolling_window).mean()[rolling_window-1::].T

# Seed K-means (reusing the notebook's `random_state` from the blob demo) so
# cluster ids -- and therefore colours and figure order -- are reproducible.
estimator = KMeans(n_clusters=n_clusters, random_state=random_state)
estimator.fit(X_rolling)

# Elapsed hours for each smoothed sample (12 samples per hour). Derived from
# the data width instead of a hard-coded column count, and computed once
# because it does not depend on the cluster.
x_ticks = np.arange(rolling_window-1, week_df.shape[1])/12.

for i in range(n_clusters):
    plt.figure()
    data = X_rolling[estimator.labels_ == i]
    n = len(data)
    # A 2-D y argument draws one line per column, i.e. one line per member
    # row, without re-transposing the frame for every single vehicle.
    plt.plot(x_ticks, data.values.T, color=colorVals[i], alpha=.05)
    # Explicit column-wise mean: np.mean(DataFrame) reduces over both axes
    # (returning a scalar) on pandas >= 2.0.
    plt.plot(x_ticks, data.mean(axis=0), color=colorVals[i], linewidth=2)

    plt.xlabel('Hour')
    plt.ylabel('State of charge')
    plt.title('State of charge per EV for a workweek\n')
    textstr = 'N = %i, %.1f%% of vehicles' % (n, 100. * n / len(X_rolling))
    plt.text(7, 1.1, textstr, fontsize=10, verticalalignment='top')


# Overlaying workweek SOC plots

In [None]:
# Overlay a sample of individual workweek profiles (coloured by cluster)
# with each cluster's mean profile drawn on top.
plt.figure(figsize=(12, 10))

# Draw at most 100 individual profiles; fewer if the data set is smaller.
n_shown = min(100, len(X_rolling))
for i in range(n_shown):
    # Positional row access matches estimator.labels_, which is positional.
    plt.plot(x_ticks, X_rolling.iloc[i], color=colorVals[estimator.labels_[i]], alpha=.1)

for i in range(n_clusters):
    # Explicit column-wise mean: np.mean(DataFrame) reduces over both axes
    # (returning a scalar) on pandas >= 2.0.
    cluster_mean = X_rolling[estimator.labels_ == i].mean(axis=0)
    plt.plot(x_ticks, cluster_mean, color=colorVals[i], linewidth=2)


plt.xlabel('Hour')
plt.xlim(0, 24*5)
plt.ylabel('State of charge')
plt.title('State of charge per EV for a workweek')


## Exercise  

**Task -** Come up with a very brief description of the driving/charging behavior for each of the clusters shown above. For example, one such description might be 'Friday chargers' or 'evening chargers'.

In [None]:
# Your answer here