## AIS Anomaly Detection

In [33]:
# from IPython.display import Image, HTML
import os
import numpy as np
import math
import pandas as pd
import datetime
from glob import glob
import geopy.distance
import folium
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()

import warnings
warnings.filterwarnings("ignore")           # Suppress Warning

### Global Variables

In [10]:
# Folder holding the consolidated AIS CSV files.
# Override per machine with the AIS_DATA_DIR environment variable instead of
# editing the notebook; the original local path is kept as the default.
# os.path.join(..., "") guarantees a trailing separator, because file names
# are appended to WorkingFolder by plain string concatenation.
WorkingFolder = os.path.join(
    os.environ.get(
        "AIS_DATA_DIR",
        "/Users/cv0361/Desktop/TechChallenge/Data/csv/ConsolidatedAIS/",
    ),
    "",
)
# Outputs are written next to the inputs.
OutputDir = WorkingFolder
# Number of k-means clusters fitted per vessel.
MAX_CLUSTER = 5

### Load Broadcast Data

In [13]:
# Read the Broadcast_2014.csv file from the working folder, parsing the
# date_time column into datetimes, then preview the first rows.
broadcast_csv = WorkingFolder + "Broadcast_2014.csv"
Broadcast = pd.read_csv(
    broadcast_csv,
    sep=",",
    parse_dates=['date_time'],
)
Broadcast.head()

Unnamed: 0,mmsi_id,date_time,lat,lon,speed_over_ground,course_over_ground,voyage_id,heading,status
0,366025993,2013-12-31 23:57:44,47.581332,-122.361145,0.0,39.599998,1,511,0
1,367160890,2013-12-31 23:57:44,45.835737,-123.990592,6.7,355.39999,2,359,15
2,366490600,2013-12-31 23:57:44,47.631067,-122.382117,0.0,192.10001,3,180,7
3,338000406,2013-12-31 23:57:44,48.123443,-123.444115,0.0,14.2,4,511,0
4,367840001,2013-12-31 23:57:44,48.121267,-122.726412,11.4,55.400002,5,57,0


In [14]:
# Number of rows loaded, before any filtering.
print("Raw Count:", len(Broadcast))

Raw Count: 21517764


In [15]:
# Replace missing voyage ids with 0, then store the column as integers.
Broadcast = (
    Broadcast
    .fillna({'voyage_id': 0})
    .astype({"voyage_id": int})
)

### Clustering via K-means
* Reference for the clustering approach: https://github.com/JosephMagiya/Clustering-GPS-Co-ordinates--Forming-Regions./blob/master/Clustering-GPS-Co-ordinates--Forming-Regions.ipynb

In [27]:
# Identify Anomaly
#
# For each vessel (MMSI), cluster its pings by (lat, lon) with k-means and
# treat clusters that hold no more than a small, ping-count-dependent number
# of pings as anomalous positions. The scan stops at the first vessel for
# which such positions are found.

# MMSI to analyse on its own. Set to None to scan every vessel in Broadcast.
TEST_MMSI = 366985110
# The anomaly threshold grows by one for every PINGS_PER_ANOMALY pings.
PINGS_PER_ANOMALY = 15000
# Fixed seed so that cluster labels are reproducible between runs.
KMEANS_SEED = 0

mmsi_candidates = Broadcast.mmsi_id.unique() if TEST_MMSI is None else [TEST_MMSI]

for mmsi in mmsi_candidates:

    # .copy() so that adding cluster_label below works on an independent frame
    # rather than on a slice of Broadcast.
    df = Broadcast.loc[Broadcast.mmsi_id == mmsi, ['date_time', 'lat', 'lon']].copy()

    # KMeans raises ValueError when there are fewer samples than clusters.
    if df.shape[0] < MAX_CLUSTER:
        if TEST_MMSI is not None:
            print("MMSI: {} skipped, only {} pings (need at least {})".format(mmsi, df.shape[0], MAX_CLUSTER))
        continue

    # Dynamically determine the anomaly threshold based on total pings
    Anomaly_THRESHOLD = math.ceil(df.shape[0] / PINGS_PER_ANOMALY)

    X = df[['lat', 'lon']]

    kmeans = KMeans(n_clusters=MAX_CLUSTER, init='k-means++', random_state=KMEANS_SEED)
    df['cluster_label'] = kmeans.fit_predict(X)     # Fit and label each point in one step
    centers = kmeans.cluster_centers_               # Coordinates of cluster centers.

    # Cluster, ping counts
    PingCluster = df.groupby('cluster_label')['date_time'].count().reset_index()

    # Clusters whose ping count does not exceed the threshold are anomalies
    AnomalyCluster = PingCluster.loc[PingCluster.date_time <= Anomaly_THRESHOLD, 'cluster_label'].values

    # Locate anomaly data points
    AnaCoordinate = df.loc[df.cluster_label.isin(AnomalyCluster)]

    if AnaCoordinate.shape[0] > 0:
        print("MMSI: {} \tPings: {} \tThreshold: {}".format(mmsi, df.shape[0], Anomaly_THRESHOLD))

        for index, row in AnaCoordinate.sort_values('cluster_label').iterrows():
            print("\tCluster:{} ({}, {}) {}".format(row.cluster_label, row.lat, row.lon, row.date_time))

        # Stop at the first vessel with anomalies so the output stays short.
        break

MMSI: 366985110 	Pings: 33766 	Threshold: 3
	Cluster:1 (46.199397999999995, -123.93110700000001) 2014-01-21 16:28:50
	Cluster:1 (46.197055, -123.924967) 2014-01-21 16:29:50
	Cluster:1 (46.19781500000001, -123.92719199999999) 2014-01-21 16:30:52
	Cluster:3 (46.202828000000004, -123.939575) 2014-01-21 16:26:45
	Cluster:3 (46.201563, -123.937022) 2014-01-21 16:27:46
	Cluster:3 (46.20244, -123.934802) 2014-01-21 16:31:51
	Cluster:4 (46.211372, -123.948995) 2014-01-21 16:33:59
	Cluster:4 (46.211338, -123.95091799999999) 2014-01-21 16:34:59
	Cluster:4 (46.20995300000001, -123.95029299999999) 2014-01-21 16:36:00


In [28]:
# Preview the per-vessel frame left over from the anomaly cell, including the
# cluster_label column it added.
df.head(5)

Unnamed: 0,date_time,lat,lon,cluster_label
64,2013-12-31 23:57:44,46.204338,-123.950212,0
823,2014-01-01 00:00:44,46.204383,-123.950225,0
1439,2014-01-01 00:01:54,46.204358,-123.950217,0
1946,2014-01-01 00:02:53,46.204418,-123.950237,0
2667,2014-01-01 00:04:24,46.204395,-123.950195,0
