In [1]:
import pandas as pd
from shapely.geometry import Point, Polygon
import numpy as np
from time import time


In [2]:
file_path = 'data/AIS_2019_01_01/AIS_2019_01_01.csv'
df = pd.read_csv(file_path)
df = df[['MMSI','VesselType','BaseDateTime','LAT', 'LON']]
df = df.dropna()

In [3]:
polygon = pd.read_csv('Gulf_polygon.csv')
coords = list(zip(polygon['LAT'],polygon['LON']))
polygon = Polygon(coords)

In [None]:
lats = df['LAT'].values
lons = df['LON'].values

start = time()
# Vectorized operation
from shapely import points, contains

# build all Point geometries at once
# pt_geoms = points(lons, lats)        # returns a GeometryArray
pt_geoms = points(lats, lons)        # returns a GeometryArray

# test containment in one shot
inside_mask = contains(polygon, pt_geoms)

end = time()
print("Time taken for vectorized polygon containment test:", end - start)
print(sum(inside_mask))

Time taken for vectorized polygon containment test: 37.843074560165405
349880


In [5]:
df = df[inside_mask]

In [6]:
df['dt'] = pd.to_datetime(df['BaseDateTime'])

In [7]:
df['tod'] = df['dt'] - df['dt'].dt.normalize()

In [8]:
df['elapsed'] = df.groupby('MMSI')['tod'].transform(lambda x: x - x.min())

In [9]:
df['elapsed_s'] = df['elapsed'].dt.total_seconds().astype(int)

In [10]:
df

Unnamed: 0,MMSI,VesselType,BaseDateTime,LAT,LON,dt,tod,elapsed,elapsed_s
29,403638000,80.0,2019-01-01T00:00:01,28.52015,-94.34369,2019-01-01 00:00:01,0 days 00:00:01,0 days 00:00:00,0
75,477669300,80.0,2019-01-01T00:00:00,28.86955,-93.91747,2019-01-01 00:00:00,0 days 00:00:00,0 days 00:00:00,0
102,366991740,50.0,2019-01-01T00:00:00,29.34033,-94.70212,2019-01-01 00:00:00,0 days 00:00:00,0 days 00:00:00,0
104,367446970,90.0,2019-01-01T00:00:04,27.37317,-94.47767,2019-01-01 00:00:04,0 days 00:00:04,0 days 00:00:00,0
115,367523620,60.0,2019-01-01T00:00:01,28.93352,-90.02459,2019-01-01 00:00:01,0 days 00:00:01,0 days 00:00:00,0
...,...,...,...,...,...,...,...,...,...
7509514,370939000,70.0,2019-01-01T23:59:48,28.90883,-89.34866,2019-01-01 23:59:48,0 days 23:59:48,0 days 23:57:01,86221
7509532,366942880,90.0,2019-01-01T23:59:49,28.64266,-88.96472,2019-01-01 23:59:49,0 days 23:59:49,0 days 21:56:10,78970
7509580,366825080,30.0,2019-01-01T23:59:54,29.17062,-89.74911,2019-01-01 23:59:54,0 days 23:59:54,0 days 23:59:52,86392
7509611,367481540,31.0,2019-01-01T23:59:56,30.28625,-87.99906,2019-01-01 23:59:56,0 days 23:59:56,0 days 12:44:30,45870


In [11]:
df_collapsed = (
    df
    .groupby(['MMSI','VesselType'], sort=False)
    .agg({
        'elapsed_s':    list,
        'LAT':          list,
        'LON':          list
    })
    .reset_index()
)

In [12]:
df_collapsed

Unnamed: 0,MMSI,VesselType,elapsed_s,LAT,LON
0,403638000,80.0,"[0, 65, 132, 195, 259, 322, 384, 445, 507, 568...","[28.52015, 28.52013, 28.52012, 28.52012, 28.52...","[-94.34369, -94.34368, -94.34363, -94.34361, -..."
1,477669300,80.0,"[0, 70, 170, 240, 309, 379, 449, 530, 599, 679...","[28.86955, 28.87175, 28.8749, 28.87711, 28.879...","[-93.91747, -93.92123, -93.92656, -93.93033, -..."
2,366991740,50.0,"[0, 62, 124, 188, 270, 332, 412, 538, 474, 608...","[29.34033, 29.3375, 29.33395, 29.33023, 29.325...","[-94.70212, -94.69351, -94.68535, -94.67694, -..."
3,367446970,90.0,"[0, 61, 130, 221, 291, 370, 442, 511, 582, 651...","[27.37317, 27.373, 27.37317, 27.37317, 27.373,...","[-94.47767, -94.47767, -94.47767, -94.47767, -..."
4,367523620,60.0,"[0, 65, 185, 400, 1425, 1574, 1665, 1785, 1855...","[28.93352, 28.93409, 28.93398, 28.93347, 28.93...","[-90.02459, -90.02371, -90.02301, -90.02217, -..."
...,...,...,...,...,...
843,636017478,70.0,[0],[27.30509],[-88.32298]
844,366967490,31.0,[0],[28.37056],[-90.25273]
845,366978490,90.0,"[0, 220, 290, 379, 449, 511, 589, 679, 769, 86...","[29.0737, 29.06605, 29.06347, 29.0603, 29.0576...","[-90.23008, -90.2325, -90.23264, -90.23211, -9..."
846,367438530,60.0,"[0, 242, 176]","[29.0734, 29.05191, 29.0558]","[-90.22904, -90.22213, -90.22666]"


In [13]:
df_collapsed['Label'] = (df_collapsed['VesselType'] == 37).astype(int)

In [15]:
df_collapsed.drop(columns=['VesselType','MMSI'], inplace=True)

In [16]:
df_collapsed

Unnamed: 0,elapsed_s,LAT,LON,Label
0,"[0, 65, 132, 195, 259, 322, 384, 445, 507, 568...","[28.52015, 28.52013, 28.52012, 28.52012, 28.52...","[-94.34369, -94.34368, -94.34363, -94.34361, -...",0
1,"[0, 70, 170, 240, 309, 379, 449, 530, 599, 679...","[28.86955, 28.87175, 28.8749, 28.87711, 28.879...","[-93.91747, -93.92123, -93.92656, -93.93033, -...",0
2,"[0, 62, 124, 188, 270, 332, 412, 538, 474, 608...","[29.34033, 29.3375, 29.33395, 29.33023, 29.325...","[-94.70212, -94.69351, -94.68535, -94.67694, -...",0
3,"[0, 61, 130, 221, 291, 370, 442, 511, 582, 651...","[27.37317, 27.373, 27.37317, 27.37317, 27.373,...","[-94.47767, -94.47767, -94.47767, -94.47767, -...",0
4,"[0, 65, 185, 400, 1425, 1574, 1665, 1785, 1855...","[28.93352, 28.93409, 28.93398, 28.93347, 28.93...","[-90.02459, -90.02371, -90.02301, -90.02217, -...",0
...,...,...,...,...
843,[0],[27.30509],[-88.32298],0
844,[0],[28.37056],[-90.25273],0
845,"[0, 220, 290, 379, 449, 511, 589, 679, 769, 86...","[29.0737, 29.06605, 29.06347, 29.0603, 29.0576...","[-90.23008, -90.2325, -90.23264, -90.23211, -9...",0
846,"[0, 242, 176]","[29.0734, 29.05191, 29.0558]","[-90.22904, -90.22213, -90.22666]",0


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report



# Feature extraction function
def extract_features(row):
    elapsed = np.array(row['elapsed_s'])
    lat = np.array(row['LAT'])
    lon = np.array(row['LON'])
    
    # Calculate duration
    duration = elapsed[-1] - elapsed[0]
    
    # Calculate simple speed approximations
    dlat = np.diff(lat)
    dlon = np.diff(lon)
    dt = np.diff(elapsed) + 1e-6  # prevent division by zero
    speeds = np.sqrt(dlat**2 + dlon**2) / dt
    
    return pd.Series({
        'duration': duration,
        'lat_mean': lat.mean(),
        'lat_std': lat.std(),
        'lon_mean': lon.mean(),
        'lon_std': lon.std(),
        'speed_mean': speeds.mean(),
        'speed_std': speeds.std(),
        'seq_len': len(elapsed)
    })

# Apply feature extraction
features = df_collapsed.apply(extract_features, axis=1)

# Prepare train/test sets
X = features
y = df_collapsed['Label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Train a Random Forest classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict and report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.99      1.00      1.00       169
           1       0.00      0.00      0.00         1

    accuracy                           0.99       170
   macro avg       0.50      0.50      0.50       170
weighted avg       0.99      0.99      0.99       170



  'speed_mean': speeds.mean(),
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
X

Unnamed: 0,duration,lat_mean,lat_std,lon_mean,lon_std,speed_mean,speed_std,seq_len
0,86211.0,28.527101,0.017315,-94.350400,0.013007,1.325314e-06,0.000004,1107.0
1,16699.0,29.133645,0.139790,-94.344262,0.226149,5.114449e-05,0.000027,188.0
2,78808.0,29.313937,0.019595,-94.642196,0.038657,7.888070e-05,0.000078,237.0
3,81281.0,27.372836,0.000339,-94.477967,0.000854,1.409945e-07,0.000001,745.0
4,86076.0,28.960557,0.021627,-90.021407,0.036006,2.711365e-05,0.000033,403.0
...,...,...,...,...,...,...,...,...
843,0.0,27.305090,0.000000,-88.322980,0.000000,,,1.0
844,0.0,28.370560,0.000000,-90.252730,0.000000,,,1.0
845,1751.0,29.041168,0.017145,-90.219438,0.011128,3.640804e-05,0.000012,21.0
846,176.0,29.060370,0.009349,-90.225943,0.002866,1.404755e-06,0.000092,3.0
