# Exercise: Geographical Cluster Analysis of Taxi Rides
Using the NY Taxi data set (see Use Case Block I) and the use case from the lecture...

In [13]:
import pandas as pd
import numpy as np
import folium


In [14]:
#check if notebook runs in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: False


In [15]:
# we load the data we have saved after wrangling and pre-processing in block I
train=pd.read_csv(path+'/DATA/train_cleaned.csv')

In [16]:
#select only the culumns with the ride coordinates
coordinates = train[ ['pickup_latitude','pickup_longitude','dropoff_latitude' , 'dropoff_longitude' ] ]

## Clustering approach from the lecture
we will be using simple K-Means:
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [5]:
from sklearn.cluster import KMeans

In [6]:
#define number of clusters and create instance
clusters=100
myKMeans=KMeans(n_clusters=clusters, n_jobs=-1)#parallelize to all cores

In [7]:
#train model
myKMeans.fit(coordinates.to_numpy()[:100000,:])#use only subset of the data to make it faster

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=100, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [8]:
#get cluster centers
centers=myKMeans.cluster_centers_
    

In [9]:
#draw map: green: start, red: end
cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
for i in range(clusters):
    folium.CircleMarker([centers[i,0], centers[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.CircleMarker([centers[i,2], centers[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    folium.PolyLine([ [centers[i,0],centers[i,1]] , [centers[i,2],centers[i,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)

In [10]:
cluster_map

## Exercise 1
Write a function ```show_cluster(cluster_number,...)``` that draws the cluster centers and all start and end points of a given cluster in the map.
* use the ```predict()``` method to map all data in ```train_data``` to a cluster center
* use ```folium.CircleMarker``` to draw all members of a given cluster


In [17]:
predictions = myKMeans.predict(coordinates.to_numpy())
predictions[:100]

array([ 2, 29, 71,  8, 46, 56, 32, 11, 49,  8, 20,  1, 98, 34, 76, 44,  3,
       32, 22, 82, 12, 72, 51, 22, 71, 30, 44, 47, 99,  6, 44, 99, 18, 22,
       96, 25, 61,  1, 56, 64, 19, 34, 76, 47, 71, 71,  1,  1, 15, 69,  1,
       18, 38, 26, 87, 89, 20, 72, 49, 34, 61, 34, 16,  1, 39, 87, 15, 98,
       87, 82,  3, 64, 86, 72, 18, 71, 72, 61, 16, 15, 29, 27,  1, 48, 56,
       72, 96, 56, 51,  3,  1, 82, 25, 61, 71, 98, 20, 72, 12, 39],
      dtype=int32)

In [19]:

def show_clusters(cluster_number, predictions, center):
    cluster_coord = coordinates.to_numpy()[predictions==cluster_number]
    entries = np.shape(cluster_coord)[0]
    print("number of cluster enries:" , entries)
    
    #create map
    cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
    folium.PolyLine([ [centers[cluster_number,0],centers[cluster_number,1]] , [centers[cluster_number,2],centers[cluster_number,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)
    
    for i in range(entries):
        folium.CircleMarker([cluster_coord[i,0], cluster_coord[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
        folium.CircleMarker([cluster_coord[i,2], cluster_coord[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    return cluster_map
show_clusters(50,predictions,centers)
    

number of cluster enries: 274


In [58]:

def show_cluster(cluster_number, predictions, centers):
    #get coordinates of the target cluster
    #coordinates were all coordinate collumns from the train data
    cluster_coord = coordinates.to_numpy()[predictions==cluster_number]
    
    #get number of entries:
    entries = np.shape(cluster_coord)[0]
    print("number of cluster enries:" , entries)
    
    #create map
    cluster_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 12,)
    
    #plot centers
    folium.PolyLine([ [centers[cluster_number,0],centers[cluster_number,1]] , [centers[cluster_number,2],centers[cluster_number,3]]  ], color="black", weight=2.5, opacity=1).add_to(cluster_map)
    
    for i in range(entries):
        folium.CircleMarker([cluster_coord[i,0], cluster_coord[i,1]], radius=3,                
                        color="green", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
        folium.CircleMarker([cluster_coord[i,2], cluster_coord[i,3]], radius=3,                
                        color="red", 
                        fill_opacity=0.9
                       ).add_to(cluster_map)
    return cluster_map

## Exercise 2
Write a function ```cluster_var(cluster_number,...)``` that computes the intra- and extra cluster variance for a given cluster. Apply it to all clusters and compare the results for k=100 and k=10.

In [22]:
def cluster_var(predictions, k):
    #get coordinates of the target cluster
    #coordinates were all coordinate collumns from the train data
    np_coord = coordinates.to_numpy()
    
    #compute min and max coordiantes for normalization
    min_cor = np.min(np_coord, axis=0)
    max_cor = np.max(np_coord, axis=0)
    dist = max_cor - min_cor
    print ("max distances:", dist)
    for i in range(k):
        cluster_coord = np_coord[predictions==i]
    
        #compute variances
        cluster_var = np.var(cluster_coord, axis=0)
        print(i, cluster_var)

In [23]:
cluster_var(predictions,100)

max distances: [1.073698 1.177225 0.970017 1.169471]
0 [0.00015087 0.00017611 0.00019781 0.00026693]
1 [2.79500396e-05 2.91059441e-05 2.86315589e-05 2.55981212e-05]
2 [0.00046704 0.00057133 0.00059779 0.00072685]
3 [5.39506853e-05 3.18660495e-05 5.26410353e-05 3.04108172e-05]
4 [9.68515389e-05 7.90329567e-04 1.18990733e-06 4.81022113e-04]
5 [5.38134378e-05 7.03779506e-05 1.43607193e-04 1.84409262e-04]
6 [4.90679968e-05 6.32777164e-05 8.96108668e-05 8.74191709e-05]
7 [0.00040137 0.00067449 0.00067842 0.0011069 ]
8 [3.78930557e-05 2.91254774e-05 3.67792268e-05 4.31750940e-05]
9 [5.29664465e-05 3.27885591e-05 6.13086435e-05 3.60472656e-05]
10 [0.00293911 0.00454012 0.00098063 0.00242336]
11 [5.08057132e-05 5.90970669e-05 5.80457312e-05 6.62904801e-05]
12 [3.02545099e-05 2.40476935e-05 2.96009796e-05 3.48840112e-05]
13 [2.35405424e-04 1.87787254e-04 5.71303683e-05 9.55818030e-05]
14 [0.00015541 0.00016155 0.00020983 0.00023276]
15 [5.38270318e-05 4.14816684e-05 6.42899527e-05 4.32012989e-0