## Objective
This script clusters the stationary ship locations to infer port coordinates and count the number of distinct ship arrivals per day.
The HDBSCAN algorithm is used for clustering. As discussed previously, the clustering model is fitted to each geohash independently.

The only issue with this approach is that port locations lying on geohash boundaries (if any) will not be inferred correctly, because their points are split across neighbouring geohashes. This is a tradeoff that must be made, since it is computationally infeasible to apply clustering on a global scale.

#### Libraries used
pandas, NumPy, hdbscan, sklearn, shapely

In [1]:
import pandas as pd
import numpy as np
import pygeohash as gh
from tqdm import tqdm
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from geopy.distance import great_circle
from sklearn.cluster import DBSCAN
from shapely.geometry import MultiPoint
from glob import glob
import hdbscan
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
def get_centermost_point(group):
    """Return the centroid of one cluster's ship positions.

    Despite the name, the result is the geometric centroid of the points
    (shapely ``MultiPoint.centroid``), not the observed point nearest to it.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single cluster; must contain ``longitude`` and
        ``latitude`` columns.

    Returns
    -------
    pandas.Series
        Two entries, ``centroid_lon`` and ``centroid_lat``.
    """
    # shapely works in (x, y) order, i.e. (longitude, latitude).
    lon_lat = group[["longitude", "latitude"]].values
    center = MultiPoint(lon_lat).centroid
    return pd.Series([center.x, center.y],
                     index=["centroid_lon", "centroid_lat"])

In [3]:
def get_ships_per_port(group):
    """Summarise one (cluster, date) group as a port position and ship count.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain ``centroid_lat``,
        ``centroid_lon`` and ``lrimoshipno`` columns.

    Returns
    -------
    pandas.Series
        ``port_lat`` / ``port_lon`` (mean of the centroid columns) and
        ``number_ships`` (number of distinct ``lrimoshipno`` values).
    """
    # dropna=False keeps a missing ship number counted as its own value,
    # exactly like len(series.unique()).
    distinct_ships = group["lrimoshipno"].nunique(dropna=False)
    return pd.Series(
        [group["centroid_lat"].mean(),
         group["centroid_lon"].mean(),
         distinct_ships],
        index=["port_lat", "port_lon", "number_ships"],
    )


In [4]:
# Earth radius in km: dividing a distance in km by it gives the angle in
# radians, which is the unit the haversine metric works in.
kms_per_radian = 6371.0088
# 10 km expressed in radians; passed to HDBSCAN as cluster_selection_epsilon.
epsilon = 10.0 / kms_per_radian


def fit_db_scan(ports):
    """Cluster ship positions with HDBSCAN and attach each cluster's centroid.

    Parameters
    ----------
    ports : pandas.DataFrame
        One row per observed position; must contain ``latitude`` and
        ``longitude`` columns in degrees. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows that were assigned to a cluster (negative labels, which
        HDBSCAN uses for noise, are dropped) with three extra columns:
        ``cluster_labels``, ``centroid_lon`` and ``centroid_lat``. If no row
        is clustered the result is empty but still carries those columns.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    ports = ports.copy()

    # The haversine metric expects (latitude, longitude) pairs in radians.
    coords = ports[['latitude', 'longitude']].values
    db = hdbscan.HDBSCAN(
        min_cluster_size=100, metric='haversine', core_dist_n_jobs=4,
        cluster_selection_epsilon=epsilon, algorithm='prims_balltree',
        cluster_selection_method='eom').fit(np.radians(coords))

    # Assign the raw label array so it is matched to rows by position.
    # Wrapping it in pd.Series would align on the index and produce NaN /
    # misassigned labels for any frame whose index is not 0..n-1.
    ports["cluster_labels"] = db.labels_
    ports = ports.loc[ports.cluster_labels >= 0]

    if len(ports) == 0:
        # Nothing was clustered: skip the groupby/merge (whose behaviour on
        # empty input varies between pandas versions) and return a
        # consistently shaped empty frame.
        return ports.assign(centroid_lon=np.nan, centroid_lat=np.nan)

    centroids = ports.groupby("cluster_labels").apply(get_centermost_point)
    centroids = centroids.reset_index()
    return ports.merge(centroids, how="left", on="cluster_labels")

In [5]:
# One sub-directory per geohash; the name is expected to look like
# "<key>=<geohash>" (presumably a partitioned parquet layout - confirm against
# the script that writes data/port_locations).
# Sorted so that runs process the geohashes in a reproducible order.
geohashes = sorted(glob("data/port_locations/*/"))
for path in geohashes:
    # The "*/" pattern makes glob return paths that end with a separator.
    # Strip it once here so the log lines and the output file name all use the
    # bare geohash.
    geohash = path.rstrip("/\\").split("=")[1]
    ports = pd.read_parquet(path)
    # fit_db_scan uses min_cluster_size=100, so smaller inputs cannot form a
    # cluster and are skipped without fitting.
    if ports.shape[0] < 100:
        continue
    # Give the frame a clean 0..n-1 index before clustering.
    ports.reset_index(drop=True, inplace=True)
    print(f"Fit HDBScan for {geohash} {ports.shape[0]} data points")
    ports = fit_db_scan(ports)
    if len(ports) == 0:
        # Every point was classified as noise: nothing to write.
        continue

    # One row per (port cluster, day) with the port position and the number
    # of distinct ships seen there that day.
    stats = ports.groupby(["cluster_labels", "date"]
                          ).apply(get_ships_per_port).reset_index()
    stats.to_csv(f"data/port_counts/geohash_{geohash}_ports.csv", index=False)

    print(
        f"generated {stats.cluster_labels.nunique()} port counts for geohash {geohash}")

Fit HDBScan for 4qt/ 104 data points
Fit HDBScan for 66t/ 1752 data points
generated 2 port counts for geohash 66t
Fit HDBScan for 66v/ 5942 data points
generated 2 port counts for geohash 66v
Fit HDBScan for 67j/ 5401 data points
generated 2 port counts for geohash 67j
Fit HDBScan for 67v/ 242 data points
Fit HDBScan for 6m6/ 18061 data points
generated 2 port counts for geohash 6m6
Fit HDBScan for 6pw/ 223 data points
Fit HDBScan for 6q2/ 105 data points
Fit HDBScan for 75b/ 40723 data points
generated 2 port counts for geohash 75b
Fit HDBScan for 75c/ 440 data points
generated 3 port counts for geohash 75c
Fit HDBScan for 75f/ 106 data points
Fit HDBScan for 7h4/ 2620 data points
generated 2 port counts for geohash 7h4
Fit HDBScan for 7h5/ 156 data points
Fit HDBScan for 7h7/ 27710 data points
generated 2 port counts for geohash 7h7
Fit HDBScan for 7p8/ 69855 data points
generated 2 port counts for geohash 7p8
Fit HDBScan for 9ep/ 5267 data points
generated 2 port counts for geohash

generated 2 port counts for geohash thu
Fit HDBScan for thx/ 170 data points
Fit HDBScan for tk2/ 2587 data points
generated 2 port counts for geohash tk2
Fit HDBScan for tk8/ 176 data points
Fit HDBScan for ts1/ 11810 data points
generated 3 port counts for geohash ts1
Fit HDBScan for u0b/ 148 data points
Fit HDBScan for u11/ 1613 data points
generated 2 port counts for geohash u11
Fit HDBScan for u13/ 746 data points
generated 2 port counts for geohash u13
Fit HDBScan for u14/ 12017 data points
generated 3 port counts for geohash u14
Fit HDBScan for u15/ 166 data points
Fit HDBScan for u16/ 1606 data points
generated 2 port counts for geohash u16
Fit HDBScan for u17/ 2129 data points
generated 2 port counts for geohash u17
Fit HDBScan for u18/ 1470 data points
generated 2 port counts for geohash u18
Fit HDBScan for u1t/ 712 data points
generated 2 port counts for geohash u1t
Fit HDBScan for u1x/ 467 data points
generated 2 port counts for geohash u1x
Fit HDBScan for u1y/ 317 data poi