In [1]:
import sys
sys.path.append('..')

import attack
import data_loader as dl

In [2]:
import numpy as np
import scipy as sp
from sklearn.cluster import KMeans
import utils
import tslearn
import geopandas as gp
from tqdm import tqdm

In [3]:
# Load preprocessed data 
raw_full_trip_gdf, raw_trip_sp_gdf, raw_trip_ep_gdf, tesselation_gdf = dl.load_geolife()
assert len(raw_full_trip_gdf) == len(raw_trip_sp_gdf) == len(raw_trip_ep_gdf)

Reading splitted geolife geojson file...
Done.
Number of trajectories in loaded dataset: 26668
Number of users in loaded dataset: 172


In [4]:
print("\nMatching start and end points with tessellation...")
gdf_sp, gdf_ep = attack.match_boundary_points_with_tessellation(raw_trip_sp_gdf, raw_trip_ep_gdf, tesselation_gdf)
print("Done.")

# Extract Full Trips that Start and End within Tessellation Area
print("\nExtracting trips that start and end within tessellation area...")
full_trip_gdf, trip_sp_gdf, trip_ep_gdf, gdf_sp, gdf_ep = attack.extract_trips_that_start_end_in_tessellation(raw_full_trip_gdf, raw_trip_sp_gdf, raw_trip_ep_gdf, gdf_sp, gdf_ep)
print("Done.")


Matching start and end points with tessellation...
Done.

Extracting trips that start and end within tessellation area...
Number of trips that start and end wihin tessellation area: 23095
Number of trips outside and therefore dropped: 3573
Done.


In [None]:
# Write separate geojson files for each user
print("\nWriting separate geojson files for each user...")
for user_id in tqdm(full_trip_gdf['PERSON_ID'].unique()):
    utils.write_geojson(full_trip_gdf[full_trip_gdf['PERSON_ID'] == 134], 'geolife_data/user_' + str(user_id) + '.geojson')


In [5]:
A = attack.cdist(full_trip_gdf.geometry)
Q = np.zeros(A.shape)

for i in tqdm(range(len(A))):
    for j in range(len(A)):
        if A[i,j] > 0.5:
            Q[i,j] = 1

100%|██████████| 290/290 [06:59<00:00,  1.45s/it]
100%|██████████| 290/290 [00:00<00:00, 12657.90it/s]


In [3]:
full_trip_gdf

NameError: name 'full_trip_gdf' is not defined

In [6]:
def spectral_clustering(A):    
    D = np.diag(np.sum(A, axis=1))
    L = D - A
    eigvals, eigvecs = np.linalg.eig(L)

    n_dim = eigvecs.shape[0]
    p = np.zeros(n_dim)
    p[eigvecs[:,1] > 0] = 1.0

    return p

In [14]:
full_trip_gdf = full_trip_gdf.reset_index(drop=True)

In [15]:
X = full_trip_gdf.geometry.apply(lambda x: x)

In [19]:
X

0       LINESTRING (442818.279 4425260.053, 442808.220...
1       LINESTRING (442586.020 4425413.497, 442576.267...
2       LINESTRING (441711.057 4428615.035, 441726.734...
3       LINESTRING (451459.757 4419372.843, 451459.662...
4       LINESTRING (441421.774 4435745.239, 441437.450...
                              ...                        
1817    LINESTRING (442684.957 4425548.723, 442678.504...
1818    LINESTRING (442917.943 4425549.563, 442947.311...
1819    LINESTRING (443008.567 4425525.945, 443028.544...
1820    LINESTRING (442890.929 4425572.150, 442890.752...
1821    LINESTRING (442883.510 4425588.670, 442883.238...
Name: geometry, Length: 1822, dtype: geometry

In [18]:
attack.LCSS(X[0], X[3])

0.0

In [7]:
def constrained_spectral_clustering(A, Q):
    D = np.diag(np.sum(A, axis=1))
    vol = np.sum(A)

    D_norm = np.linalg.inv(np.sqrt(D))
    L_norm = np.eye(*A.shape) - D_norm.dot(A.dot(D_norm))
    Q_norm = D_norm.dot(Q.dot(D_norm))

    # alpha < max eigenval of Q_norm
    alpha = 0.6 * sp.linalg.svdvals(Q_norm)[0]
    Q1 = Q_norm - alpha * np.eye(*Q_norm.shape)

    val, vec = sp.linalg.eig(L_norm, Q1)

    vec = vec[:,val >= 0]
    vec_norm = (vec / np.linalg.norm(vec, axis=0)) * np.sqrt(vol)

    costs = np.multiply(vec_norm.T.dot(L_norm), vec_norm.T).sum(axis=1)
    ids = np.where(costs > 1e-10)[0]
    min_idx = np.argmin(costs[ids])
    min_v = vec_norm[:,ids[min_idx]]

    u = D_norm.dot(min_v)

    n_dim = u.shape[0]
    p = np.zeros(n_dim)
    p[u > 0] = 1.0

    return p

In [8]:
def constrained_spectral_clustering_K(A, Q, K):
    D = np.diag(np.sum(A, axis=1))
    vol = np.sum(A)

    D_norm = np.linalg.inv(np.sqrt(D))
    L_norm = np.eye(*A.shape) - D_norm.dot(A.dot(D_norm))
    Q_norm = D_norm.dot(Q.dot(D_norm))

    # alpha < K-th eigenval of Q_norm
    alpha = 0.6 * sp.linalg.svdvals(Q_norm)[K]
    Q1 = Q_norm - alpha * np.eye(*Q_norm.shape)

    val, vec = sp.linalg.eig(L_norm, Q1)

    vec = vec[:,val >= 0]
    vec_norm = (vec / np.linalg.norm(vec, axis=0)) * np.sqrt(vol)

    costs = np.multiply(vec_norm.T.dot(L_norm), vec_norm.T).sum(axis=1)
    ids = np.where(costs > 1e-10)[0]
    min_idx = np.argsort(costs[ids])[0:K]
    min_v = vec_norm[:,ids[min_idx]]

    u = D_norm.dot(min_v)

    model = KMeans(n_clusters=K).fit(u)
    labels = model.labels_

    return labels

In [20]:
from sklearn.cluster import SpectralClustering, DBSCAN

# sklearn_spec = SpectralClustering(n_clusters=5, affinity='precomputed', n_init=100, n_jobs=-1).fit(A).labels_

sklearn_dbscan = DBSCAN(eps=0.1, min_samples=1, metric=attack.LCSS).fit(X).labels_



TypeError: float() argument must be a string or a real number, not 'LineString'

In [68]:
DELTA = 0.6

attack.evaluate(sklearn_spec, full_trip_gdf)
attack.evaluate(sklearn_dbscan, full_trip_gdf)

attack.evaluate(spectral_clustering(np.exp(- (1-A) ** 2 / (2. * DELTA ** 2))), full_trip_gdf)
attack.evaluate(constrained_spectral_clustering(np.exp(- (1-A) ** 2 / (2. * DELTA ** 2)), Q), full_trip_gdf)
attack.evaluate(constrained_spectral_clustering_K(np.exp(- (1-A) ** 2 / (2. * DELTA ** 2)), Q, len(full_trip_gdf.PERSON_ID.unique())), full_trip_gdf)

Homogeneity: 0.416
Completeness: 0.540
V-measure: 0.470
Rand index: 0.722
ARI: 0.450
MI: 0.400
NMI: 0.470
AMI: 0.457
Cluster accuracy: 0.779
Homogeneity: 0.979
Completeness: 0.415
V-measure: 0.583
Rand index: 0.808
ARI: 0.606
MI: 0.942
NMI: 0.583
AMI: 0.507
Cluster accuracy: 0.748
Homogeneity: 0.022
Completeness: 0.031
V-measure: 0.026
Rand index: 0.495
ARI: -0.009
MI: 0.021
NMI: 0.026
AMI: 0.016
Cluster accuracy: 0.455
Homogeneity: 0.137
Completeness: 0.237
V-measure: 0.173
Rand index: 0.454
ARI: -0.076
MI: 0.132
NMI: 0.173
AMI: 0.165
Cluster accuracy: 0.462
Homogeneity: 0.598
Completeness: 0.421
V-measure: 0.494
Rand index: 0.656
ARI: 0.297
MI: 0.575
NMI: 0.494
AMI: 0.482
Cluster accuracy: 0.648


