# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [83]:
import pandas as pd
import numpy as np
import folium

In [84]:
#check if notebook runs in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: False


In [85]:
# Load the data set that was saved after wrangling and pre-processing in block I.
train = pd.read_csv(filepath_or_buffer=path + '/DATA/train_cleaned.csv')

In [86]:
# Keep only the columns with the ride coordinates
# (pickup latitude/longitude followed by dropoff latitude/longitude).
coordinates = train[
    [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
]

## Clustering approach from the lecture
We will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [87]:
from sklearn.cluster import KMeans

In [88]:
# Define the number of clusters and create the model instance.
clusters=100
# NOTE: the former `n_jobs=-1` argument was deprecated in scikit-learn 0.23 and
# removed in 1.0, where passing it raises a TypeError. KMeans parallelizes
# internally, so the argument is simply dropped.
# A fixed random_state makes the cluster assignment reproducible across runs.
myKMeans=KMeans(n_clusters=clusters, random_state=42)

In [89]:
# Train the model on the first 100000 rides only, to keep fitting fast.
# fit() returns the fitted estimator, so `train_data` refers to the model itself.
train_data = myKMeans.fit(X=coordinates.to_numpy()[:100000])

In [90]:
# Fetch the fitted cluster centers: one row per cluster, columns in the same
# order as `coordinates` (pickup lat, pickup lon, dropoff lat, dropoff lon).
centers = myKMeans.cluster_centers_
    

In [91]:
# Draw the overview map: for every cluster center a green marker at the pickup
# location, a red marker at the dropoff location and a black line between them.
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
# Iterate over the fitted centers directly so the loop cannot go out of sync
# with the model if `clusters` is changed without refitting.
for center in centers:
    start = [center[0], center[1]]  # pickup latitude / longitude
    end = [center[2], center[3]]    # dropoff latitude / longitude
    # fill=True is required for fill_opacity to have any effect; without it
    # folium draws only the outline of the circle.
    folium.CircleMarker(start, radius=3,
                        color="green",
                        fill=True,
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker(end, radius=3,
                        color="red",
                        fill=True,
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([start, end], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [92]:
# Bare last expression: renders the folium map as the cell output.
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [93]:
# Assign every ride in the full data set (not only the training subset)
# to a cluster of the fitted model; `pred` holds one cluster index per ride.
pred = myKMeans.predict(X=coordinates.to_numpy())

In [94]:
def show_cluster(cluster_number, pred, centers, coords=None):
    """Draw all rides of one cluster, plus its center, on a folium map.

    Parameters
    ----------
    cluster_number : int
        Index of the cluster to display.
    pred : array-like of int
        Cluster index of every ride, one entry per row of ``coords``.
    centers : numpy.ndarray
        Cluster centers, one row per cluster, with columns
        pickup lat, pickup lon, dropoff lat, dropoff lon.
    coords : array-like or DataFrame, optional
        Ride coordinates in the same column order as ``centers``.
        Defaults to the notebook-level ``coordinates`` frame.

    Returns
    -------
    folium.Map
        Map with a green marker per pickup, a red marker per dropoff and,
        on top, the cluster center drawn as two larger markers joined by a
        black line.
    """
    if coords is None:
        # Fall back to the notebook-level frame the original version used.
        coords = coordinates

    coord = np.asarray(coords)[np.asarray(pred) == cluster_number]

    entries = np.shape(coord)[0]
    print('entries:' , entries)

    cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)

    for ride in coord:
        # fill=True is required for fill_opacity to take effect.
        folium.CircleMarker([ride[0], ride[1]], radius=3,
                        color="green",
                        fill=True,
                        fill_opacity=0.9
                       ).add_to(cluster_map)
        folium.CircleMarker([ride[2], ride[3]], radius=3,
                        color="red",
                        fill=True,
                        fill_opacity=0.9
                       ).add_to(cluster_map)

    # Draw the cluster center last so it is not hidden below the ride markers.
    center_start = [centers[cluster_number,0], centers[cluster_number,1]]
    center_end = [centers[cluster_number,2], centers[cluster_number,3]]
    folium.PolyLine([center_start, center_end], color="black", weight=2.5, opacity=1).add_to(cluster_map)
    folium.CircleMarker(center_start, radius=6,
                        color="darkgreen",
                        fill=True,
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker(center_end, radius=6,
                        color="darkred",
                        fill=True,
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    return cluster_map

In [95]:
# Show cluster 10 with all of its member rides.
show_cluster(cluster_number=10, pred=pred, centers=centers)

entries: 925


## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra-cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10 (i.e. for two models, one trained with 100 clusters and one with 10).

In [96]:
def cluster_var(cluster_number, k, data=None):
    """Print the per-coordinate intra-cluster variance of clusters 0..k-1.

    Only the variance inside each cluster is computed; no extra-cluster
    variance is derived here.

    Parameters
    ----------
    cluster_number : array-like of int, or scalar
        The notebook calls this function with the array of cluster labels
        (one per ride) as first argument, and those labels are used. The
        previous version ignored this argument and always read the
        notebook-level ``pred``. A scalar is still accepted for backward
        compatibility; it is ignored and ``pred`` is used instead.
    k : int
        Number of clusters to report (cluster indices ``0 .. k-1``).
    data : array-like or DataFrame, optional
        Ride coordinates, one row per label. Defaults to the notebook-level
        ``coordinates`` frame.

    Returns
    -------
    None
        One line ``<cluster index> <variance per column>`` is printed per
        cluster.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of data rows.
    """
    labels = np.asarray(cluster_number)
    if labels.ndim == 0:
        # Legacy call style with a scalar: fall back to the notebook-level labels.
        labels = np.asarray(pred)

    if data is None:
        data = coordinates
    np_coord = np.asarray(data)

    if labels.shape[0] != np_coord.shape[0]:
        raise ValueError('labels and data must have the same number of rows')

    for i in range(k):
        coord = np_coord[labels == i]
        if coord.shape[0] == 0:
            # np.var of an empty selection yields NaN plus a RuntimeWarning;
            # report the NaN explicitly without triggering the warning.
            var_all = np.full(np_coord.shape[1], np.nan)
        else:
            var_all = np.var(coord, axis=0, dtype=np.float64)
        print(i, var_all)

In [81]:
# `predictions` is never defined in this notebook; the cluster labels of all
# rides are stored in `pred`, so that is what has to be passed here.
cluster_var(pred,100)

0 [0.00011177 0.00010909 0.00012249 0.00010509]
1 [2.94536499e-05 2.57962458e-05 3.28426025e-05 2.90745699e-05]
2 [0.00020235 0.00010051 0.00024515 0.00016217]
3 [5.68612464e-05 6.38296365e-05 3.82006953e-05 2.97119909e-05]
4 [4.72120966e-05 5.84722085e-05 3.17686285e-04 3.53982607e-04]
5 [9.68515389e-05 7.90329567e-04 1.18990733e-06 4.81022113e-04]
6 [2.35971593e-04 1.68194950e-04 5.19799796e-05 6.28451854e-05]
7 [4.32775037e-05 4.86173901e-05 4.61996226e-05 3.53847671e-05]
8 [0.000987   0.0018086  0.00140689 0.00238192]
9 [0.00293911 0.00454012 0.00098063 0.00242336]
10 [0.00033417 0.00016836 0.00011839 0.00016284]
11 [2.95243437e-05 2.53077762e-05 3.42517976e-05 3.53368669e-05]
12 [1.17124627e-04 8.96981403e-05 8.01123814e-05 1.08117847e-04]
13 [0.00029598 0.00011044 0.00021003 0.0002751 ]
14 [0.00044086 0.00022885 0.00042423 0.0003533 ]
15 [9.09007649e-05 7.93911336e-05 6.11052434e-05 4.25157817e-05]
16 [5.53434640e-05 3.29045746e-05 5.97370558e-05 3.42010594e-05]
17 [4.91814983e-0

In [82]:
# `predictions` is never defined in this notebook; use the labels in `pred`.
# NOTE: `pred` comes from the 100-cluster model, so this only lists the first
# 10 clusters of that clustering. A real k=10 comparison needs the labels of a
# separate KMeans model fitted with n_clusters=10.
cluster_var(pred,10)

0 [0.00011177 0.00010909 0.00012249 0.00010509]
1 [2.94536499e-05 2.57962458e-05 3.28426025e-05 2.90745699e-05]
2 [0.00020235 0.00010051 0.00024515 0.00016217]
3 [5.68612464e-05 6.38296365e-05 3.82006953e-05 2.97119909e-05]
4 [4.72120966e-05 5.84722085e-05 3.17686285e-04 3.53982607e-04]
5 [9.68515389e-05 7.90329567e-04 1.18990733e-06 4.81022113e-04]
6 [2.35971593e-04 1.68194950e-04 5.19799796e-05 6.28451854e-05]
7 [4.32775037e-05 4.86173901e-05 4.61996226e-05 3.53847671e-05]
8 [0.000987   0.0018086  0.00140689 0.00238192]
9 [0.00293911 0.00454012 0.00098063 0.00242336]
