## AIS Anomaly Detection

In [19]:
# from IPython.display import Image, HTML
import os
import numpy as np
import math
import pandas as pd
import datetime
from glob import glob
import geopy.distance
import folium
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()

import warnings
warnings.filterwarnings("ignore")           # Suppress Warning

### Global Variables

In [20]:
WorkingFolder = "/Users/cv0361/Desktop/TechChallenge/Data/csv/ConsolidatedAIS/"
OutputDir = WorkingFolder

PROC_YEAR = '2017'
MAX_CLUSTER = 5
MIN_PROCESS_ROW = 500          # Require min number of rows available to calc cluster

### Load Broadcast Data

In [21]:
Broadcast = pd.read_csv(WorkingFolder + "Broadcast_{}.csv".format(PROC_YEAR), sep=",", parse_dates=['date_time'])
Broadcast.head()

Unnamed: 0,mmsi_id,date_time,lat,lon,speed_over_ground,course_over_ground,voyage_id,heading,status
0,366940480,2017-01-04 11:39:36,52.4873,-174.02316,10.0,-140.7,,267.0,undefined
1,366940480,2017-01-04 11:40:45,52.48718,-174.02835,10.0,-141.6,,266.0,undefined
2,366940480,2017-01-04 11:42:26,52.48705,-174.03608,10.0,-142.3,,267.0,undefined
3,366940480,2017-01-04 13:51:07,52.41575,-174.60041,9.1,-154.0,,251.0,undefined
4,366940480,2017-01-04 13:55:17,52.41311,-174.61718,9.1,-157.3,,251.0,undefined


In [22]:
print("Raw Count:", Broadcast.shape[0])

Raw Count: 3125152


In [23]:
# Make sure voyage id is not null
Broadcast['voyage_id'] = Broadcast['voyage_id'].fillna(0)
Broadcast = Broadcast.astype({"voyage_id": int})           # cast type to int

### Clustering via K-means
* https://github.com/JosephMagiya/Clustering-GPS-Co-ordinates--Forming-Regions./blob/master/Clustering-GPS-Co-ordinates--Forming-Regions.ipynb

In [24]:
Header = ['mmsi_id', 'PingRecStart', 'PingRecEnd', 'TotalPing', 'LatStd', 'LonStd', 
          'MaxSOG', 'MinSOG', 'MeanSOG', 'MedianSOG', 'StdSOG', 
          'MaxCOG', 'MinCOG', 'MeanCOG', 'MedianCOG', 'StdCOG', 'AnoThreshold', 'AnoClusterCount']
arrData = list()

# Identify Anomaly
for mmsi in Broadcast.mmsi_id.unique():
#     mmsi = 366985110           #**************** Test data
    
    df = Broadcast.loc[Broadcast.mmsi_id==mmsi, ['date_time', 'lat', 'lon', 'speed_over_ground', 'course_over_ground']] 
    
    if df.shape[0] < MIN_PROCESS_ROW: continue              # Skip, not enough ping records for clustering
        
    Anomaly_THRESHOLD = math.ceil(df.shape[0]/15000)        # dynamically determine the anomaly threshold based on total pings
    
#     # **** for testing - Fudge some anomaly
#     df.loc[df.date_time=='2017-01-01 00:06:50', 'lat'] = 70.25
#     # *************************************

    # Extract Date and Hour
    df['PingDate'] = df['date_time'].dt.date
    # df['PingHour'] = df['date_time'].dt.hour

    # Extract Stats
    Stat = df.agg(['count', 'max', 'min', 'mean', 'median', 'std']) 
    X = df[['lat', 'lon']]
    
    kmeans = KMeans(n_clusters=MAX_CLUSTER, init='k-means++')
    kmeans.fit(X)                                   # Compute k-means clustering. 
    df['cluster_label'] = kmeans.predict(X)         # Labels of each point

    # Cluster, ping counts
    PingCluster = df.groupby('cluster_label')['date_time'].count().reset_index()
    
    # Identify Anomaly in coordinate, if any
    AnomalyCluster = PingCluster.loc[PingCluster.date_time <= Anomaly_THRESHOLD, 'cluster_label'].values
    
    # Construct Vessel Stat Record
    arrData.append([
        mmsi, Stat.PingDate[2], Stat.PingDate[1], Stat.PingDate[0], Stat.lat[5], Stat.lon[5],
        Stat.speed_over_ground[1], Stat.speed_over_ground[2], Stat.speed_over_ground[3], Stat.speed_over_ground[4], Stat.speed_over_ground[5], 
        Stat.course_over_ground[1], Stat.course_over_ground[2], Stat.course_over_ground[3], Stat.course_over_ground[4], Stat.course_over_ground[5],
        Anomaly_THRESHOLD, len(AnomalyCluster)
    ])
    
#     # ********************** Alert Code for identify 1st Anomaly case ********************************
#     # Locate anomaly data points
#     AnaCoordinate = df.loc[df.cluster_label.isin(AnomalyCluster)]
    
#     if AnaCoordinate.shape[0] > 0:
#         print("MMSI: {} \tPings: {} \tThreshold: {}".format(mmsi, df.shape[0], Anomaly_THRESHOLD))
#         for index, row in AnaCoordinate.sort_values('cluster_label').iterrows():
#             print("\tCluster:{} ({}, {}) {}".format(row.cluster_label, row.lat, row.lon, row.date_time))
    
#         break   # For Testing - get out early
#     # ***********************************************************************************************
    
#     break   # For Testing - get out early
    
dfStat = pd.DataFrame(arrData, columns=Header) 

dfStat.head()

Unnamed: 0,mmsi_id,PingRecStart,PingRecEnd,TotalPing,LatStd,LonStd,MaxSOG,MinSOG,MeanSOG,MedianSOG,StdSOG,MaxCOG,MinCOG,MeanCOG,MedianCOG,StdCOG,AnoThreshold,AnoClusterCount
0,366940480,2017-01-01,2017-01-28,8723,0.831555,4.678157,11.7,0.0,4.248298,0.1,4.452731,204.7,-204.8,-51.057996,-101.3,120.439669,1,0
1,273898000,2017-01-06,2017-01-09,1161,0.218489,0.392926,12.2,0.8,4.520413,4.0,1.916563,204.6,-204.8,-24.691645,28.5,131.95501,1,0
2,477027500,2017-01-15,2017-01-17,515,0.781598,4.62364,13.4,9.1,12.433981,12.7,0.831874,109.9,19.2,83.752039,88.1,19.687598,1,0
3,367390380,2017-01-20,2017-01-31,4598,0.973165,4.971375,13.4,0.0,3.481883,0.1,4.759571,204.7,-204.8,-62.143823,-102.55,119.397636,1,0
4,352844000,2017-01-27,2017-01-31,2042,0.947121,4.543238,15.8,0.0,4.441332,0.3,5.756986,204.7,-204.8,-10.677718,-65.1,141.127641,1,0


In [25]:
# Output Stat Data
dfStat.to_csv(OutputDir + "Statistic_{}.csv".format(PROC_YEAR), index=None, header = True)

In [26]:
Stat

Unnamed: 0,date_time,lat,lon,speed_over_ground,course_over_ground,PingDate
count,709,709.0,709.0,709.0,709.0,709
max,2017-01-31 23:59:03,54.52018,-164.75863,10.2,198.1,2017-01-31
min,2017-01-31 04:40:41,54.31477,-167.93358,0.2,-167.8,2017-01-31
mean,2017-01-31 13:57:18.871650304,54.425945,-166.766963,6.385331,-85.107898,
median,,54.4051,-166.88498,8.4,-129.9,
std,,0.057818,0.977964,3.398842,88.937824,


In [27]:
df.head()

Unnamed: 0,date_time,lat,lon,speed_over_ground,course_over_ground
3123814,2017-01-31 23:54:13,54.12723,-162.37597,12.9,-124.2
