In [1]:
import pandas as pd
from shapely.geometry import Point, Polygon
import numpy as np
from time import time
import os


In [2]:
file_path = 'data/AIS_2019_01_01/AIS_2019_01_01.csv'
df = pd.read_csv(file_path)
df = df[['MMSI','VesselType','BaseDateTime','LAT', 'LON']]
df = df.dropna()

In [3]:
polygon = pd.read_csv('Gulf_polygon.csv')
coords = list(zip(polygon['LAT'],polygon['LON']))
polygon = Polygon(coords)

In [None]:
lats = df['LAT'].values
lons = df['LON'].values

start = time()
# Vectorized operation
from shapely import points, contains

# build all Point geometries at once
# pt_geoms = points(lons, lats)        # returns a GeometryArray
pt_geoms = points(lats, lons)        # returns a GeometryArray

# test containment in one shot
inside_mask = contains(polygon, pt_geoms)

end = time()
print("Time taken for vectorized polygon containment test:", end - start)
print(sum(inside_mask))

Time taken for vectorized polygon containment test: 37.843074560165405
349880


In [5]:
df = df[inside_mask]

In [6]:
df['dt'] = pd.to_datetime(df['BaseDateTime'])

In [7]:
df['tod'] = df['dt'] - df['dt'].dt.normalize()

In [8]:
df['elapsed'] = df.groupby('MMSI')['tod'].transform(lambda x: x - x.min())

In [9]:
df['elapsed_s'] = df['elapsed'].dt.total_seconds().astype(int)

In [2]:
import os
import pandas as pd

folder = '../AIS_data'
files = os.listdir(folder)

min_data_points = 100

all_df = pd.DataFrame()
for file in files:
    if file.endswith('.pkl'):
        df = pd.read_pickle(os.path.join(folder, file))
        df = df[df['LAT'].apply(lambda x: len(x) >= min_data_points)]  # filter short elapsed rows
        all_df = pd.concat([all_df, df], ignore_index=True)


In [3]:
len(files)

365

In [4]:
all_df.head()

Unnamed: 0,MMSI,elapsed_s,LAT,LON,VesselType,Label
0,367659930,"[0, 69, 138, 210, 279, 349, 420, 488, 570, 639...","[30.4289, 30.42774, 30.42675, 30.42573, 30.424...","[-87.99302, -87.99258, -87.9919, -87.99129, -8...",31.0,0
1,367553360,"[0, 70, 149, 220, 301, 379, 449, 559, 630, 709...","[29.01648, 29.01666, 29.01664, 29.01661, 29.01...","[-91.83069, -91.83175, -91.83304, -91.83416, -...",30.0,0
2,367461560,"[0, 64, 134, 204, 274, 354, 425, 495, 45847, 4...","[29.36644, 29.36952, 29.37291, 29.3762, 29.379...","[-91.38801, -91.38525, -91.38225, -91.37948, -...",90.0,0
3,538007067,"[0, 120, 301, 483, 662, 843, 1023, 1204, 1383,...","[28.82299, 28.82325, 28.8236, 28.82388, 28.824...","[-89.33299, -89.33357, -89.3343, -89.33491, -8...",70.0,0
4,369053000,"[0, 70, 139, 209, 270, 340, 410, 473, 540, 610...","[30.18058, 30.17847, 30.17641, 30.17425, 30.17...","[-88.56405, -88.56745, -88.57083, -88.57432, -...",90.0,0


In [5]:
all_df.drop(columns=['MMSI', 'VesselType'], inplace=True)


In [6]:
all_df

Unnamed: 0,elapsed_s,LAT,LON,Label
0,"[0, 69, 138, 210, 279, 349, 420, 488, 570, 639...","[30.4289, 30.42774, 30.42675, 30.42573, 30.424...","[-87.99302, -87.99258, -87.9919, -87.99129, -8...",0
1,"[0, 70, 149, 220, 301, 379, 449, 559, 630, 709...","[29.01648, 29.01666, 29.01664, 29.01661, 29.01...","[-91.83069, -91.83175, -91.83304, -91.83416, -...",0
2,"[0, 64, 134, 204, 274, 354, 425, 495, 45847, 4...","[29.36644, 29.36952, 29.37291, 29.3762, 29.379...","[-91.38801, -91.38525, -91.38225, -91.37948, -...",0
3,"[0, 120, 301, 483, 662, 843, 1023, 1204, 1383,...","[28.82299, 28.82325, 28.8236, 28.82388, 28.824...","[-89.33299, -89.33357, -89.3343, -89.33491, -8...",0
4,"[0, 70, 139, 209, 270, 340, 410, 473, 540, 610...","[30.18058, 30.17847, 30.17641, 30.17425, 30.17...","[-88.56405, -88.56745, -88.57083, -88.57432, -...",0
...,...,...,...,...
296782,"[0, 224, 81, 301, 401, 510, 581, 651, 750, 831...","[28.91509, 28.90624, 28.91185, 28.90286, 28.89...","[-89.42226, -89.43059, -89.42518, -89.43154, -...",0
296783,"[0, 69, 130, 337, 440, 620, 811, 681, 891, 104...","[28.91911, 28.91597, 28.91333, 28.90537, 28.90...","[-89.4197, -89.422, -89.4239, -89.43129, -89.4...",0
296784,"[0, 100, 230, 309, 391, 480, 741, 811, 879, 90...","[29.07654, 29.07338, 29.06873, 29.06566, 29.06...","[-90.22863, -90.22913, -90.22987, -90.23059, -...",0
296785,"[90, 0, 161, 307, 433, 240, 565, 636, 715, 498...","[30.63707, 30.64235, 30.63251, 30.62283, 30.61...","[-88.03235, -88.03225, -88.03236, -88.03265, -...",0


In [10]:
all_df['Label'].value_counts()

Label
0    294263
1      2524
Name: count, dtype: int64

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report



# Feature extraction function
def extract_features(row):
    elapsed = np.array(row['elapsed_s'])
    lat = np.array(row['LAT'])
    lon = np.array(row['LON'])

    
    
    # Calculate duration
    duration = elapsed[-1] - elapsed[0]
    
    # Calculate simple speed approximations
    dlat = np.diff(lat)
    dlon = np.diff(lon)
    dt = np.diff(elapsed) + 1e-6  # prevent division by zero
    speeds = np.sqrt(dlat**2 + dlon**2) / dt
    
    return pd.Series({
        'duration': duration,
        'lat_mean': lat.mean(),
        'lat_std': lat.std(),
        'lon_mean': lon.mean(),
        'lon_std': lon.std(),
        'speed_mean': speeds.mean(),
        'speed_std': speeds.std(),
        # 'seq_len': len(elapsed)
    })

# Apply feature extraction
features = all_df.apply(extract_features, axis=1)

# Prepare train/test sets
X = features
y = all_df['Label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Train a Random Forest classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict and report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     58853
           1       0.88      0.44      0.59       505

    accuracy                           0.99     59358
   macro avg       0.94      0.72      0.79     59358
weighted avg       0.99      0.99      0.99     59358



In [14]:
X

Unnamed: 0,duration,lat_mean,lat_std,lon_mean,lon_std,speed_mean,speed_std
0,17389.0,30.324618,0.047122,-87.903259,0.068003,1.520286e-05,9.880400e-06
1,83029.0,28.953475,0.077996,-91.502626,0.224377,1.008802e-05,1.387394e-05
2,86234.0,29.213490,0.079397,-91.404074,0.028022,1.387749e-05,3.094376e-05
3,86349.0,28.822487,0.001178,-89.333770,0.001293,4.915938e-07,5.158963e-07
4,86361.0,30.171884,0.011064,-88.744727,0.131171,2.642389e-05,3.479417e-05
...,...,...,...,...,...,...,...
296782,12320.0,28.646663,0.170977,-89.308369,0.104131,3.982501e-05,4.308371e-05
296783,12200.0,28.624968,0.181957,-89.325142,0.079961,4.241511e-05,3.870482e-05
296784,11641.0,28.865848,0.107478,-90.096276,0.116995,3.948535e-05,2.984727e-05
296785,11442.0,30.303276,0.195597,-88.062989,0.060417,5.285400e-05,3.847035e-05


In [15]:
importances = clf.feature_importances_
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
importance_df.sort_values(by='Importance', ascending=False, inplace=True)
print(importance_df)

      Feature  Importance
3    lon_mean    0.198640
1    lat_mean    0.176354
6   speed_std    0.149770
4     lon_std    0.128213
5  speed_mean    0.127526
2     lat_std    0.119704
0    duration    0.099794
