## Import necessary packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
import os
from sklearn.metrics import pairwise_distances

## Implement k-means

Let us implement the k-means algorithm. First, we choose an initial set of centroids. A common practice is to choose randomly from the data points.


In [2]:
def get_initial_centroids(data, k, seed=None):
    '''Randomly choose k data points as initial centroids'''
    if seed is not None: 
        np.random.seed(seed)
    n = data.shape[0] 
    rand_indices = np.random.randint(0, n, k)

    centroids = data[rand_indices,:].toarray()
    
    return centroids

In [3]:

def assign_clusters(data, centroids):
    
   
    distances_from_centroids = pairwise_distances(data,centroids)
    
 
    cluster_assignment = np.argmin(distances_from_centroids,axis=1)
    
    return cluster_assignment

In pseudocode, we iteratively do the following:
```
cluster_assignment = assign_clusters(data, centroids)
centroids = revise_centroids(data, k, cluster_assignment)
```

### Assigning clusters

In [4]:
data = np.array([[1., 2., 0.],
                 [0., 0., 0.],
                 [2., 2., 0.]])
centroids = np.array([[0.5, 0.5, 0.],
                      [0., -0.5, 0.]])

Let's assign these data points to the closest centroid.

In [5]:
cluster_assignment = assign_clusters(data, centroids)
print (cluster_assignment)

[0 1 0]


In [6]:
def revise_centroids(data, k, cluster_assignment):
    new_centroids = []
    for i in range(k):
       
        member_data_points = data.loc[data.cluster_assignment==i,:]
        
        centroid = np.mean(member_data_points)
        
       
        centroid = centroid.A1
        new_centroids.append(centroid)
    new_centroids = np.array(new_centroids)
    
    return new_centroids

**The Basic objective of k-means is to reduce overall euclidean distance inside the cluster **

In [7]:
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    
    heterogeneity = 0.0
    for i in range(k):
        
        
        member_data_points = data[cluster_assignment==i, :]
        
        if member_data_points.shape[0] > 0: 
           
            distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
            squared_distances = distances**2
            heterogeneity += np.sum(squared_distances)
        
    return heterogeneity

Let's apply the above algorithm in our GPS  data. **Note :We can't use euclidean distance here **

### Loading data
** Here we  will load only 10,000 rows of our 'all.csv' file  After doing this we will generalize our result to whole data**

In [9]:
data = pd.read_csv('all.csv',nrows=10000)

## Finding the number of cluster 

**If a user is spending matore than 10 minutes at a particular location . That location is assigned as centroid **

In [10]:
data.head()

Unnamed: 0,index,lat,long,altitude,trajectory_id,subfolder,labels,datetime,distance,timedelta,velocity,acceleration
0,0,39.984702,116.318417,492.0,20081023025304,0,,2008-10-23 02:53:04,3.520694,0 days 00:00:06.000000000,0.586782,-0.003189
1,1,39.984683,116.31845,492.0,20081023025304,0,,2008-10-23 02:53:10,2.838241,0 days 00:00:05.000000000,0.567648,-0.003841
2,2,39.984686,116.318417,492.0,20081023025304,0,,2008-10-23 02:53:15,2.74222,0 days 00:00:05.000000000,0.548444,0.332144
3,3,39.984688,116.318385,492.0,20081023025304,0,,2008-10-23 02:53:20,11.045822,0 days 00:00:05.000000000,2.209164,0.39113
4,4,39.984655,116.318263,492.0,20081023025304,0,,2008-10-23 02:53:25,20.824082,0 days 00:00:05.000000000,4.164816,0.072513


In [84]:
centroid_index=[]
def find_cluster(data,spent_time):
    """
    
    Trying to find the number of cluster based upon time spent 
    
    """
    a=np.array(data['timedelta'])
    for i in range(len(a)):
        centroid_index.append(a[i][13:15]>spent_time)
    return centroid_index

In [85]:
find_cluster(data,'10')

TypeError: 'float' object is not subscriptable

In [91]:
data['cluster_check']=data['timedelta'].apply(lambda x : x[13:15] >'10')

TypeError: '>' not supported between instances of 'float' and 'str'

In [97]:
data.timedelta[0][13:15]

'06'