# Cluster

In [24]:
import ast
import datetime as dt
import pickle

import numpy as np
import pandas as pd
import psycopg2 as pg
import sklearn.preprocessing as preprocessing
import yaml
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

%matplotlib inline
import matplotlib.pyplot as plt

### Inputs and connection string

In [2]:
# model configurations
# Parse the pipeline arguments with the safe loader: calling yaml.load() without an
# explicit Loader is deprecated (and rejected by PyYAML >= 6) and can construct
# arbitrary Python objects. The context manager closes the file once parsing is done.
with open('./conf/pipeline_args.yml', 'r') as args_file:
    file_args = yaml.safe_load(args_file)

In [3]:
# create db connection objects

conn_str_file = './conf/db_conn_str.txt'

# Read each connection string inside a context manager so the file handle is closed
# deterministically instead of being left for the garbage collector.
with open(conn_str_file, 'r') as pg_conn_file:
    pg_conn_str = pg_conn_file.read()
conn = pg.connect(pg_conn_str)

with open('./conf/sqlalchemy_conn_str.txt', 'r') as sqlalchemy_conn_file:
    sqlalchemy_conn_str = sqlalchemy_conn_file.read()
engine = create_engine(sqlalchemy_conn_str)

## Load training data

In [4]:
# Load the preprocessed training frame. The context manager closes the file handle
# as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code — confirm this file is only
# ever produced by a trusted pipeline step.
with open('./data/train_test/processed_training_data.pkl', 'rb') as training_file:
    sampled_df = pickle.load(training_file)

# Prepare clustering data set and train test split

In [5]:
# Display the column names of the loaded training frame.
sampled_df.columns

Index(['index', 'date', 'time', 'date_idx', 'time_idx', 'day_of_week',
       'segment_id', 'lat1', 'lon1', 'lat2', 'lon2', 'level_min', 'level_max',
       'level_mean', 'level_count', 'level_binary', 'road_type_1',
       'road_type_2', 'road_type_3', 'road_type_4', 'road_type_6',
       'road_type_7', 'historical_level_max_average',
       'overall_level_max_average', 'event', 'exp_attendance', 'padres_game'],
      dtype='object')

## Remove features for cluster training

In [6]:
# Columns left out of the clustering feature matrix.
non_feature_columns = [
    'index', 'date', 'time', 'segment_id',
    'level_min', 'level_max', 'level_mean', 'level_count', 'level_binary',
]
drop_columns_df = sampled_df.drop(columns=non_feature_columns)

In [7]:
# Two alternative feature sets, each keeping exactly one of the level_max averages.
# Historical variant: level_max as a time-series average grouped by time and day of
# week (only incidents up to the current date are counted).
historical_level_df = drop_columns_df.drop('overall_level_max_average', axis='columns')
# Overall variant: level_max as a non-time-series average grouped by time and day of
# week (all incidents are grouped regardless of when they occurred).
overall_level_df = drop_columns_df.drop('historical_level_max_average', axis='columns')

## Scale features of training data

In [8]:
# Standardize every feature column, then wrap the resulting array back into a
# DataFrame. The original row index is carried over explicitly: the cluster labels
# are later merged back onto sampled_df by index, so rebuilding the frame with a
# fresh 0..n-1 index would silently misalign rows whenever sampled_df does not
# already use the default index.
cluster_data_df = pd.DataFrame(
    preprocessing.scale(overall_level_df),
    columns=overall_level_df.columns,
    index=overall_level_df.index,
)

In [9]:
# Preview the first rows of the scaled feature matrix.
cluster_data_df.head()

Unnamed: 0,date_idx,time_idx,day_of_week,lat1,lon1,lat2,lon2,road_type_1,road_type_2,road_type_3,road_type_4,road_type_6,road_type_7,overall_level_max_average,event,exp_attendance,padres_game
0,1.130568,-1.895563,-1.536475,1.23158,1.207839,1.255179,1.252538,-0.478203,1.70282,-0.375553,-0.288857,-0.399518,-0.52982,-0.677688,-0.196607,-0.184833,-0.229057
1,1.327178,-1.895563,-1.536475,1.23158,1.207839,1.255179,1.252538,-0.478203,1.70282,-0.375553,-0.288857,-0.399518,-0.52982,-0.677688,-0.196607,-0.184833,-0.229057
2,0.344127,-1.818711,-1.536475,1.23158,1.207839,1.255179,1.252538,-0.478203,1.70282,-0.375553,-0.288857,-0.399518,-0.52982,-0.677688,-0.196607,-0.184833,-0.229057
3,0.540737,-1.818711,-1.536475,1.23158,1.207839,1.255179,1.252538,-0.478203,1.70282,-0.375553,-0.288857,-0.399518,-0.52982,-0.677688,-0.196607,-0.184833,-0.229057
4,-0.638925,-1.665008,-1.536475,1.23158,1.207839,1.255179,1.252538,-0.478203,1.70282,-0.375553,-0.288857,-0.399518,-0.52982,-0.677688,-0.196607,-0.184833,-0.229057


## Train_test_split

In [10]:
print('splitting train and test data...')
# Hold out 10% of the rows. A fixed random_state makes the split (and therefore
# every downstream cluster assignment) reproducible across kernel restarts.
train_data, test_data = train_test_split(cluster_data_df, test_size=0.1, random_state=42)

splitting train and test data...


# Clustering

## Model

In [11]:
# Mini-batch k-means with k-means++ seeding. Centroid initialisation and mini-batch
# sampling are random, so random_state is pinned to make reruns give the same clusters.
# NOTE(review): the per-cluster counts recorded later in this notebook list only
# clusters 0-7, which does not match n_clusters=20 — the saved outputs look stale
# and should be refreshed by re-running.
model = MiniBatchKMeans(n_clusters=20, init='k-means++', random_state=42)

In [12]:
# Fit the model on the training rows (fit() returns the fitted estimator itself),
# then assign each training row to its nearest centroid.
predictions_train = model.fit(train_data).predict(train_data)

In [13]:
# Assign the held-out rows to clusters using the model fitted on train_data.
predictions_test = model.predict(test_data)

## Concatenate cluster results to original training data

In [14]:
# Stack the labels train-first, then test; the index built for the merge below must
# be concatenated in the same order.
predictions_all = np.concatenate((predictions_train,predictions_test))

In [15]:
# Row labels in the same train-then-test order used to build predictions_all.
row_labels = np.concatenate((train_data.index, test_data.index))
# One-column frame mapping each row label to its cluster id.
cluster_labels = pd.DataFrame(predictions_all, columns=['cluster'], index=row_labels)
# Attach the cluster id to the original (unscaled) records by matching on index.
sampled_df_clustered = sampled_df.merge(cluster_labels, left_index=True, right_index=True)

# Evaluation

### Manually sanity check some clusters

In [16]:
# Show the records assigned to cluster 0.
sampled_df_clustered.loc[sampled_df_clustered['cluster'].eq(0)]

Unnamed: 0,index,date,time,date_idx,time_idx,day_of_week,segment_id,lat1,lon1,lat2,...,road_type_3,road_type_4,road_type_6,road_type_7,historical_level_max_average,overall_level_max_average,event,exp_attendance,padres_game,cluster
87,546,2017-05-08,18:30:00,89,1110,0,18,32.762914,-117.128252,32.763334,...,0,0,0,0,0.000000,0.000000,0,0.0,1,0
172,489,2017-05-09,11:30:00,90,690,1,18,32.762914,-117.128252,32.763334,...,0,0,0,0,0.000000,0.000000,0,0.0,1,0
208,644,2017-05-02,17:30:00,83,1050,1,18,32.762914,-117.128252,32.763334,...,0,0,0,0,0.000000,0.000000,0,0.0,1,0
216,855,2017-05-02,19:00:00,83,1140,1,18,32.762914,-117.128252,32.763334,...,0,0,0,0,0.000000,0.000000,0,0.0,1,0
227,754,2017-05-16,21:30:00,97,1290,1,18,32.762914,-117.128252,32.763334,...,0,0,0,0,0.000000,0.000000,0,0.0,1,0
282,320,2017-05-31,11:00:00,112,660,2,18,32.762914,-117.128252,32.763334,...,0,0,0,0,0.000000,0.000000,0,0.0,1,0
290,471,2017-05-31,12:30:00,112,750,2,18,32.762914,-117.128252,32.763334,...,0,0,0,0,0.117647,0.500000,0,0.0,1,0
329,693,2017-05-03,18:30:00,84,1110,2,18,32.762914,-117.128252,32.763334,...,0,0,0,0,0.230769,0.750000,0,0.0,1,0
418,690,2017-05-18,12:30:00,99,750,3,18,32.762914,-117.128252,32.763334,...,0,0,0,0,0.000000,0.000000,0,0.0,1,0
423,498,2017-05-04,13:00:00,85,780,3,18,32.762914,-117.128252,32.763334,...,0,0,0,0,0.000000,0.000000,0,0.0,1,0


In [17]:
# Show the records assigned to cluster 1.
sampled_df_clustered.loc[sampled_df_clustered['cluster'].eq(1)]

Unnamed: 0,index,date,time,date_idx,time_idx,day_of_week,segment_id,lat1,lon1,lat2,...,road_type_3,road_type_4,road_type_6,road_type_7,historical_level_max_average,overall_level_max_average,event,exp_attendance,padres_game,cluster
891,1898,2017-04-17,00:00:00,68,0,0,46,32.749029,-117.205133,32.749214,...,0,0,0,1,0.000000,0.000000,0,0.0,0,1
892,1065,2017-04-24,00:00:00,75,0,0,46,32.749029,-117.205133,32.749214,...,0,0,0,1,0.000000,0.000000,0,0.0,0,1
893,1675,2017-06-12,00:00:00,124,0,0,46,32.749029,-117.205133,32.749214,...,0,0,0,1,0.000000,0.000000,0,0.0,0,1
894,1323,2017-04-17,00:30:00,68,30,0,46,32.749029,-117.205133,32.749214,...,0,0,0,1,0.000000,0.000000,0,0.0,0,1
895,1803,2017-02-13,01:00:00,5,60,0,46,32.749029,-117.205133,32.749214,...,0,0,0,1,0.000000,0.000000,0,0.0,0,1
896,1687,2017-03-20,01:00:00,40,60,0,46,32.749029,-117.205133,32.749214,...,0,0,0,1,0.000000,0.000000,0,0.0,0,1
897,1385,2017-05-08,01:00:00,89,60,0,46,32.749029,-117.205133,32.749214,...,0,0,0,1,0.000000,0.000000,0,0.0,0,1
898,1656,2017-05-22,01:00:00,103,60,0,46,32.749029,-117.205133,32.749214,...,0,0,0,1,0.000000,0.000000,0,0.0,0,1
899,1488,2017-02-13,01:30:00,5,90,0,46,32.749029,-117.205133,32.749214,...,0,0,0,1,0.000000,0.000000,0,0.0,0,1
900,1955,2017-04-03,02:00:00,54,120,0,46,32.749029,-117.205133,32.749214,...,0,0,0,1,0.000000,1.500000,0,0.0,0,1


In [18]:
# Show the records assigned to cluster 2.
sampled_df_clustered.loc[sampled_df_clustered['cluster'].eq(2)]

Unnamed: 0,index,date,time,date_idx,time_idx,day_of_week,segment_id,lat1,lon1,lat2,...,road_type_3,road_type_4,road_type_6,road_type_7,historical_level_max_average,overall_level_max_average,event,exp_attendance,padres_game,cluster
8034,8096,2017-04-03,00:00:00,54,0,0,194,32.713633,-117.248724,32.713540,...,0,0,0,0,0.000000,0.0,0,0.0,0,2
8035,8149,2017-04-24,00:00:00,75,0,0,194,32.713633,-117.248724,32.713540,...,0,0,0,0,0.000000,0.0,0,0.0,0,2
8036,8380,2017-02-13,00:30:00,5,30,0,194,32.713633,-117.248724,32.713540,...,0,0,0,0,0.000000,0.0,0,0.0,0,2
8037,8716,2017-02-27,00:30:00,19,30,0,194,32.713633,-117.248724,32.713540,...,0,0,0,0,0.000000,0.0,0,0.0,0,2
8038,8693,2017-03-27,00:30:00,47,30,0,194,32.713633,-117.248724,32.713540,...,0,0,0,0,0.000000,0.0,0,0.0,0,2
8039,8830,2017-05-08,00:30:00,89,30,0,194,32.713633,-117.248724,32.713540,...,0,0,0,0,0.000000,0.0,0,0.0,0,2
8040,8446,2017-02-13,01:30:00,5,90,0,194,32.713633,-117.248724,32.713540,...,0,0,0,0,0.000000,0.0,0,0.0,0,2
8041,8393,2017-06-12,01:30:00,124,90,0,194,32.713633,-117.248724,32.713540,...,0,0,0,0,0.000000,0.0,0,0.0,0,2
8042,8309,2017-05-15,02:00:00,96,120,0,194,32.713633,-117.248724,32.713540,...,0,0,0,0,0.000000,0.0,0,0.0,0,2
8043,8384,2017-06-05,02:00:00,117,120,0,194,32.713633,-117.248724,32.713540,...,0,0,0,0,0.000000,0.0,0,0.0,0,2


### Check number of records per cluster

An even distribution of records across clusters would be good: it would mean each cluster carries more meaning, rather than most incidents falling into a single cluster, which doesn't give us enough information.

In [19]:
# Count rows per cluster in a single grouped pass instead of re-filtering the whole
# frame once per cluster. sort=False keeps the clusters in order of first appearance,
# the same order the previous unique()-driven loop printed them in.
cluster_sizes = sampled_df_clustered.groupby('cluster', sort=False).size()
for cluster_id, row_count in cluster_sizes.items():
    print('Cluster {} has {} elements'.format(cluster_id, row_count))

Cluster 6 has 1853263 elements
Cluster 0 has 296714 elements
Cluster 4 has 256120 elements
Cluster 1 has 1383265 elements
Cluster 3 has 867360 elements
Cluster 2 has 1009208 elements
Cluster 5 has 490450 elements
Cluster 7 has 725680 elements


# Write processed dataframe to pkl and sql

In [20]:
# Persist the clustered training data to the database when the pipeline
# configuration enables it.
if file_args['write_sql_db']:
    sampled_df_clustered.to_sql(
        name='clustered_training_data',
        con=engine,
        if_exists='replace',  # overwrite an existing table of the same name
        index=False,          # do not write the DataFrame index as a column
        chunksize=1000,       # insert in batches of 1000 rows
    )

In [22]:
# write clustered dataframe to pickle file
if file_args['write_pickle_file']:
    # The context manager guarantees the output file is flushed and closed even if
    # pickling fails part-way; a bare open() left the write handle dangling.
    with open('./data/train_test/clustered_training_data.pkl', 'wb') as clustered_file:
        pickle.dump(sampled_df_clustered, clustered_file, protocol=4)