In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import inshallah as ins
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from geopy.distance import distance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [176]:
def floor_to_nearest_5th_day(date, start_date=pd.Timestamp("2024-01-01")):
    days_since_start = (date - start_date).days
    floored_days = 5 * np.floor(days_since_start / 5)
    nearest_5th_day = start_date + pd.Timedelta(days=floored_days)
    return nearest_5th_day

def normalize_lat_lon(lat, lon):
    """
    Normalizes extreme latitude and longitude values, ensuring:
    - Latitude is constrained between [-90, 90] with appropriate longitude shifts.
    - Longitude is wrapped to be within [-180, 180].
    """
    
    # Handle extreme latitude values
    while lat > 90 or lat < -90:
        if lat > 90:
            lat = 180 - lat
            lon += 180
        elif lat < -90:
            lat = -180 - lat
            lon += 180
    
    # Normalize longitude using modulo to bring it within [-180, 180]
    lon = ((lon + 180) % 360) - 180
    
    return lat, lon

In [113]:
data = pd.read_csv("../CSV/big_files/ais_train.csv", sep="|")
vessels = pd.read_csv("../CSV/vessels.csv", sep="|")
test_og = pd.read_csv("../CSV/ais_test.csv")
cleaned = pd.read_csv("cleaned.csv", sep="|")

cleaned["time"] = pd.to_datetime(cleaned["time"])

cleaned.head()

Unnamed: 0.1,Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,...,time_cum,delta_lat,delta_lon,delta_lat_cum,delta_lon_cum,speed_from_prev,dist_from_prev,dist_cum,delta_lat_lag_1,delta_lon_lag_1
0,83929,2024-01-01 00:37:39,0.778417,0.0,0,1.518436,5,12-31 13:00,39.21009,9.10328,...,1438.0,2e-05,-2e-05,4e-05,-4e-05,0.057871,0.002813,0.005626,2e-05,-2e-05
1,274461,2024-01-01 00:38:37,3.591888,10.1,0,0.069813,0,01-02 01:00,41.44703,-71.38101,...,1397.0,-0.02108,-0.0173,-0.07102,-0.04084,18.653912,2.751452,8.635989,-0.04994,-0.02354
2,219988,2024-01-01 00:43:14,1.019272,11.8,0,2.199115,0,01-01 05:00,50.51531,1.00915,...,1719.0,0.01129,0.03266,0.02888,0.15068,22.587815,2.635245,11.235098,0.01759,0.11802
3,19824,2024-01-01 00:44:36,1.998402,0.0,0,2.024582,5,01-03 13:30,43.4424,-3.82321,...,2161.0,3e-05,-0.00012,2e-05,-5e-05,0.025658,0.01027,0.016045,-1e-05,7e-05
4,35745,2024-01-01 00:46:32,4.53262,0.0,-1,4.171337,5,12-25 15:00,40.72221,29.48062,...,2161.0,-1e-05,-1e-05,0.0,-1e-05,0.004651,0.001395,0.002506,1e-05,0.0


In [None]:
for i in range(2,6):
    cleaned[f"delta_lat_lag_{i}"] = cleaned[f"delta_lat_lag_{i-1}"].shift(1)
    cleaned[f"delta_lon_lag_{i}"] = cleaned[f"delta_lon_lag_{i-1}"].shift(1)

cleaned["delta_lat_cum_lag_1"] = cleaned["delta_lat_cum"].shift(1)
cleaned["delta_lon_cum_lag_1"] = cleaned["delta_lon_cum"].shift(1)

cleaned["day"] = cleaned["time"].apply(lambda t: t.day)
cleaned["month"] = cleaned["time"].apply(lambda t: t.month)
cleaned["hour"] = cleaned["time"].apply(lambda t: t.hour)

cleaned["v_lon"] = cleaned["sog"] * np.cos(cleaned["cog"])
cleaned["v_lat"] = cleaned["sog"] * np.sin(cleaned["cog"])


cleaned.dropna(inplace=True)
cleaned.drop(columns=["Unnamed: 0"], inplace=True)
cleaned.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,...,delta_lon_lag_3,delta_lat_lag_4,delta_lon_lag_4,delta_lat_lag_5,delta_lon_lag_5,delta_lat_cum_lag_1,delta_lon_cum_lag_1,day,month,hour
12,2024-01-01 00:53:54,5.539675,12.5,0,0.994838,0,10-31 11:30,18.91427,-66.46253,61e9f3aab937134a3c4bfe13,...,2e-05,1e-05,-1e-05,4e-05,-2e-05,0.09805,-0.14146,1,1,0
13,2024-01-01 00:54:19,4.452335,0.0,-1,3.595378,5,12-31 04:00,50.88489,-1.39689,61e9f3adb937134a3c4bfe35,...,-0.07757,0.0,2e-05,1e-05,-1e-05,0.10795,-0.0988,1,1,0
14,2024-01-01 00:54:29,0.270526,0.0,-2,3.787364,1,12-31 15:00,8.89782,-79.46933,61e9f3d3b937134a3c4bff1b,...,-0.04734,0.05337,-0.07757,0.0,2e-05,1e-05,0.0,1,1,0
15,2024-01-01 00:54:29,0.628319,0.0,0,4.834562,5,12-30 19:55,35.46923,139.68347,61e9f3e6b937134a3c4bff6d,...,1e-05,0.05194,-0.04734,0.05337,-0.07757,2e-05,2e-05,1,1,0
16,2024-01-01 00:54:31,6.021386,0.0,0,3.577925,5,12-28 17:00,54.37932,18.65969,clh6aqawa0004gh0z12aogec9,...,0.0,0.0,1e-05,0.05194,-0.04734,1e-05,4e-05,1,1,0


In [168]:
n = 50
small = cleaned[cleaned["vesselId"].isin(cleaned["vesselId"].unique()[0:n+1])].copy()

small["last_5_day"] = small["time"].map(floor_to_nearest_5th_day)
small["delta_last_5th"] = (small["time"] - small["last_5_day"]).dt.total_seconds()
small.drop(columns=["last_5_day"], inplace=True)

#Velocity
small["v_lon"] = small["sog"] * np.cos(small["cog"])
small["v_lat"] = small["sog"] * np.sin(small["cog"])

small.dropna(inplace=True)

small.tail()


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,...,delta_lat_lag_5,delta_lon_lag_5,delta_lat_cum_lag_1,delta_lon_cum_lag_1,day,month,hour,delta_last_5th,v_lon,v_lat
1402281,2024-04-28 08:35:52,2.146755,0.1,0,4.101524,1,04-28 03:30,35.43092,139.73178,61e9f430b937134a3c4c010d,...,1e-05,1e-05,8.61126,89.68975,28,4,8,290152.0,-0.054464,0.083867
1402322,2024-04-28 08:37:53,0.947714,20.5,0,5.061455,0,04-29 03:30,56.07076,16.74527,61e9f464b937134a3c4c0265,...,0.00477,0.00861,44.73489,88.01285,28,4,8,290273.0,11.962595,16.647712
1402323,2024-04-28 08:37:55,1.989675,0.0,0,1.989675,5,04-22 19:25,39.45465,-0.32439,61e9f467b937134a3c4c0281,...,0.06042,-0.03239,0.74051,2.17483,28,4,8,290275.0,-0.0,0.0
1402326,2024-04-28 08:38:04,4.323181,0.0,0,5.794493,5,04-28 04:00,47.29326,-2.1604,61e9f45eb937134a3c4c0237,...,0.05993,0.18078,-5.08816,-11.15579,28,4,8,290284.0,-0.0,-0.0
1402335,2024-04-28 08:38:23,2.523746,13.0,0,1.204277,0,05-10 06:50,35.26703,139.7646,61e9f46cb937134a3c4c02b7,...,3e-05,-5e-05,1.42258,6.0416,28,4,8,290303.0,-10.596661,7.530655


In [None]:
LAT_FEATURES = ["delta_last_5th", "v_lat", "v_lon", "delta_lat_lag_1",
                "delta_time", "speed_from_prev"]
LON_FEATURES = ["delta_last_5th", "v_lat", "v_lon", "delta_lon_lag_1",
                "delta_time", "dist_from_prev", "hour", "day"]

for i in range(2,6):
    LAT_FEATURES.append(f"delta_lat_lag_{i}")
    LON_FEATURES.append(f"delta_lon_lag_{i}")

X_lat = small[LAT_FEATURES].copy()
X_lon = small[LAT_FEATURES].copy()

X_lat_train, X_lat_test, y_lat_train, y_lat_test = train_test_split(X_lat, small[["delta_lat"]], test_size=0.05, shuffle=False)
X_lon_train, X_lon_test, y_lon_train, y_lon_test = train_test_split(X_lon, small[["delta_lon", "delta_lon_cum"]], test_size=0.05, shuffle=False)

latModel = RandomForestRegressor(n_estimators=100, max_depth=6, verbose=1)
lonModel = RandomForestRegressor(n_estimators=100, max_depth=6, verbose=1)

latModel.fit(X_lat_train, y_lat_train)
lonModel.fit(X_lon_train, y_lon_train)

lat_pred = latModel.predict(X_lat_test)
lon_pred = lonModel.predict(X_lon_test)

y_lat_test = y_lat_test["delta_lat"].copy()
y_lon_test = y_lon_test["delta_lon"].copy()

print(f"Latitude MSE: {mean_squared_error(y_lat_test, lat_pred)}")
print(f"Longitude MSE: {mean_squared_error(y_lon_test, lon_pred[:,0])}")

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   39.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   44.0s


Latitude MSE: 1.075287605249802
Longitude MSE: 21.69644024278613


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [185]:
stats = {}
test = test_og.copy()

for i in test["vesselId"].unique()[0:5]:
    boat = cleaned[cleaned["vesselId"] == i]

    boat_stats = {}

    boat_stats["last_pos"] = boat.iloc[-1][["latitude", "longitude"]]
    boat_stats["last_5th_day"] = floor_to_nearest_5th_day(boat.iloc[-1]["time"])
    boat_stats["v_lat"], boat_stats["v_lon"] = boat.iloc[-1]["v_lat"], boat.iloc[-1]["v_lon"]
    boat_stats["delta_lat_cum_lag_1"] = boat.iloc[-1]["delta_lat_cum_lag_1"]

    for j in range(2,6):
        boat_stats[f"delta_lat_lag_{j}"] = boat.iloc[-j]["delta_lat"]

    stats[i] = boat_stats

print(stats)


KeyError: 'v_lat'

['delta_last_5th', 'v_lat', 'v_lon', 'delta_lat_lag_1', 'delta_time', 'delta_lat_cum_lag_1', 'delta_lat_lag_2', 'delta_lat_lag_3', 'delta_lat_lag_4', 'delta_lat_lag_5']
