In [271]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import inshallah as ins
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from geopy.distance import distance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
data = pd.read_csv("../CSV/big_files/ais_train.csv", sep="|")
test = pd.read_csv("../CSV/ais_test.csv", sep=",")

test["time"] = pd.to_datetime(test["time"])

data["time"] = pd.to_datetime(data["time"])
data["sog"] = data["sog"]*1.944 #Knots to m/s
data["cog"] = np.pi * data["cog"] / 180

test.tail()

Unnamed: 0,ID,vesselId,time,scaling_factor
51734,51734,61e9f3a8b937134a3c4bfdf3,2024-05-12 23:59:58,0.1
51735,51735,61e9f3b4b937134a3c4bfe77,2024-05-12 23:59:58,0.1
51736,51736,61e9f46cb937134a3c4c02b7,2024-05-12 23:59:58,0.1
51737,51737,61e9f465b937134a3c4c0269,2024-05-12 23:59:58,0.1
51738,51738,61e9f3adb937134a3c4bfe39,2024-05-12 23:59:58,0.1


In [532]:
def floor_to_nearest_5th_day(date, start_date=pd.Timestamp("2024-01-01")):
    days_since_start = (date - start_date).days
    floored_days = 5 * np.floor(days_since_start / 5)
    nearest_5th_day = start_date + pd.Timedelta(days=floored_days)
    return nearest_5th_day

def calc_vx(row):
    return row["sog"]*np.cos(row["cog"])

def calc_vy(row):
    return row["sog"]*np.sin(row["cog"])

def normalize_lat_lon(lat, lon):
    """
    Normalizes extreme latitude and longitude values, ensuring:
    - Latitude is constrained between [-90, 90] with appropriate longitude shifts.
    - Longitude is wrapped to be within [-180, 180].
    """
    
    # Handle extreme latitude values
    while lat > 90 or lat < -90:
        if lat > 90:
            lat = 180 - lat
            lon += 180
        elif lat < -90:
            lat = -180 - lat
            lon += 180
    
    # Normalize longitude using modulo to bring it within [-180, 180]
    lon = ((lon + 180) % 360) - 180
    
    return lat, lon

In [None]:
pro = []
count = 0

for i in data["vesselId"].unique():
    count += 1

    boat = data[data["vesselId"] == i].copy().reset_index(drop=True)
    
    #Deltas
    boat["delta_time"] = (boat["time"] - boat["time"].shift(1)).dt.total_seconds()

    boat["delta_lat"] = boat["latitude"] - boat["latitude"].shift(1)
    boat["delta_lon"] = boat["longitude"] - boat["longitude"].shift(1)

    #5 Day Intervals
    boat["5d_interval"] = boat["time"].map(floor_to_nearest_5th_day)
    boat["delta_5d"] = (boat["time"] - boat["5d_interval"]).dt.total_seconds()
    boat["new_int"] = (boat["5d_interval"] != boat["5d_interval"].shift(-1))

    # Velocities and Wierd ahh lags
    boat["v_x"] = boat["sog"] * np.cos(boat["cog"])
    boat["v_y"] = boat["sog"] * np.sin(boat["cog"])
    boat["delta_lat_lag"] = np.nan
    boat["delta_lon_lag"] = np.nan
    boat["lat_lag"] = np.nan
    boat["lon_lag"] = np.nan

    for j,row in boat.iterrows():
        if row["new_int"] and j != 0:
            boat.at[j, "v_x"] = calc_vx(row)
            boat.at[j, "v_y"] = calc_vy(row)

            boat.at[j, "delta_lat_lag"] = boat.at[j-1, "delta_lat"]
            boat.at[j, "delta_lon_lag"] = boat.at[j-1, "delta_lon"]

            boat.at[j, "lat_lag"] = boat.at[j-1, "latitude"]
            boat.at[j, "lon_lag"] = boat.at[j-1, "longitude"]

    boat["v_x"].ffill(inplace=True)
    boat["v_y"].ffill(inplace=True)
    boat["delta_lat_lag"].ffill(inplace=True)
    boat["delta_lon_lag"].ffill(inplace=True)
    boat["lat_lag"].ffill(inplace=True)
    boat["lon_lag"].ffill(inplace=True)

    boat["v_x"].bfill(inplace=True)
    boat["v_y"].bfill(inplace=True)
    boat["delta_lat_lag"].bfill(inplace=True)
    boat["delta_lon_lag"].bfill(inplace=True)
    boat["lat_lag"].bfill(inplace=True)
    boat["lon_lag"].bfill(inplace=True)

    #Cleaning
    boat.dropna(inplace=True)
    boat.drop(columns=["5d_interval", "new_int", "heading", "rot", "navstat", "etaRaw", "portId"], inplace=True)

    #Recreate OG dataframe
    for _,rows in boat.iterrows():
        pro.append(rows.to_dict())


processed = pd.DataFrame(pro)
processed.sort_values("time", inplace=True)
processed.to_csv("processed.csv", sep="|")
processed.head()

KeyboardInterrupt: 

In [None]:
processed = pd.read_csv("processed.csv", sep="|")
processed["time"] = pd.to_datetime(processed["time"])
processed.drop(columns=["Unnamed: 0"], inplace=True)

processed[processed["vesselId"] == "61e9f3aeb937134a3c4bfe3d"][50:100].head()

Unnamed: 0,time,cog,sog,latitude,longitude,vesselId,delta_time,delta_lat,delta_lon,delta_5d,v_x,v_y,delta_lat_lag,delta_lon_lag,lat_lag,lon_lag
208776,2024-01-19 14:57:07,3.996804,26.8272,-17.33788,-149.58229,61e9f3aeb937134a3c4bfe3d,1261.0,-0.01565,-0.07867,313027.0,-17.600227,-20.246745,-0.01434,0.01265,8.80894,-79.53719
208963,2024-01-19 15:18:20,3.246312,22.7448,-17.4063,-149.62208,61e9f3aeb937134a3c4bfe3d,1273.0,-0.06842,-0.03979,314300.0,-22.620202,-2.377479,-0.01434,0.01265,8.80894,-79.53719
209060,2024-01-19 15:25:20,3.263766,12.2472,-17.42397,-149.62541,61e9f3aeb937134a3c4bfe3d,420.0,-0.01767,-0.00333,314720.0,-12.155911,-1.492558,-0.01434,0.01265,8.80894,-79.53719
216451,2024-01-20 05:56:50,4.014257,30.132,-17.70921,-149.87776,61e9f3aeb937134a3c4bfe3d,52290.0,-0.28524,-0.25235,367010.0,-19.368476,-23.082451,-0.01434,0.01265,8.80894,-79.53719
216590,2024-01-20 06:16:56,4.101524,28.9656,-17.76261,-149.94901,61e9f3aeb937134a3c4bfe3d,1206.0,-0.0534,-0.07125,368216.0,-16.613986,-23.72723,-0.01434,0.01265,8.80894,-79.53719


In [529]:
FEATURES = ["delta_time", "delta_5d", "delta_lat_lag", "delta_lon_lag", "lat_lag", "lon_lag", "v_x", "v_y"]
print(FEATURES)

n = 36129
n_rows = processed[FEATURES].shape[0]
train, test = processed.head(n_rows-n), processed.tail(n)
y_test = test[["delta_lat", "delta_lon"]]

print("Starting XGBoost")
xgb = XGBRegressor(n_estimators=3000)
xgb.fit(processed[FEATURES], processed[["delta_lat", "delta_lon"]])

# xgb_pred = xgb.predict(test[FEATURES])
# print(f"MSE: {mean_squared_error(y_test, xgb_pred)}")

['delta_time', 'delta_5d', 'delta_lat_lag', 'delta_lon_lag', 'lat_lag', 'lon_lag', 'v_x', 'v_y']
Starting XGBoost


In [409]:
# nga[["longitude_predicted", "latitude_predicted"]].to_csv("res.csv")
print(FEATURES)

['delta_time', 'delta_5d', 'delta_lat_lag', 'delta_lon_lag', 'lat_lag', 'lon_lag', 'v_x', 'v_y']


In [546]:
#Iterative setup
test = pd.read_csv("../CSV/ais_test.csv", sep=",")
test["time"] = pd.to_datetime(test["time"])

stats = {}
n = 3

for i in test["vesselId"].unique():
    trained = processed[processed["vesselId"] == i]
    boat = test[test["vesselId"] == i].reset_index(drop=True)
    
    last_5d = floor_to_nearest_5th_day(boat.iloc[0]["time"])
    last_time = trained.iloc[-1]["time"]

    delta_lat_lag = trained.iloc[-1]["delta_lat_lag"]
    delta_lon_lag = trained.iloc[-1]["delta_lon_lag"]

    lat_lag = trained.iloc[-1]["lat_lag"]
    lon_lag = trained.iloc[-1]["lon_lag"]

    v_x = trained.iloc[-1]["sog"] * np.cos(trained.iloc[-1]["cog"])
    v_y = trained.iloc[-1]["sog"] * np.sin(trained.iloc[-1]["cog"])

    stats[i] = {"last_5d": last_5d, "last_time": last_time, "delta_lat_lag": delta_lat_lag,
                "delta_lon_lag": delta_lon_lag, "lat_lag": lat_lag, "lon_lag": lon_lag,
                "v_x": v_x, "v_y": v_y}

print(stats)


{'61e9f3aeb937134a3c4bfe3d': {'last_5d': Timestamp('2024-05-05 00:00:00'), 'last_time': Timestamp('2024-05-07 23:48:16'), 'delta_lat_lag': 0.0, 'delta_lon_lag': 0.0, 'lat_lag': 31.14648, 'lon_lag': -81.49789, 'v_x': -0.0, 'v_y': 0.0}, '61e9f473b937134a3c4c02df': {'last_5d': Timestamp('2024-05-05 00:00:00'), 'last_time': Timestamp('2024-05-07 23:57:16'), 'delta_lat_lag': 6.000000000128125e-05, 'delta_lon_lag': -6.000000000483397e-05, 'lat_lag': 14.81694, 'lon_lag': 120.29624, 'v_x': 0.0, 'v_y': 0.0}, '61e9f469b937134a3c4c029b': {'last_5d': Timestamp('2024-05-05 00:00:00'), 'last_time': Timestamp('2024-05-07 23:59:08'), 'delta_lat_lag': 1.3375500000000002, 'delta_lon_lag': 0.4577999999999989, 'lat_lag': 38.14875, 'lon_lag': 10.75635, 'v_x': 35.99901704934856, 'v_y': 5.059331904581065}, '61e9f45bb937134a3c4c0221': {'last_5d': Timestamp('2024-05-05 00:00:00'), 'last_time': Timestamp('2024-05-07 23:52:34'), 'delta_lat_lag': -0.0001500000000049, 'delta_lon_lag': -0.0009200000000078, 'lat_lag

In [None]:
print(FEATURES)

def get_velocities(x_0, x_1, delta_time) -> tuple:
    dist = distance((x_0[0], x_0[1]), (x_1[0], x_1[1])).km * 1000
    speed = dist / delta_time

    adj, opp = x_1[1] - x_1[0], x_0[1] - x_0[0]

    v_x = speed * (adj / dist)
    v_y = speed * (opp / dist)

    return (v_x, v_y)

['delta_time', 'delta_5d', 'delta_lat_lag', 'delta_lon_lag', 'lat_lag', 'lon_lag', 'v_x', 'v_y']


In [547]:
final = []
test = pd.read_csv("../CSV/ais_test.csv", sep=",")
test["time"] = pd.to_datetime(test["time"])

for i in test["vesselId"].unique()[0:10]:
    boat = test[test["vesselId"] == i].reset_index(drop=True)

    #Delta time
    boat["delta_time"] = boat["time"] - boat["time"].shift(1)
    boat.at[0, "delta_time"] = boat.at[0, "time"] - stats[i]["last_time"]
    boat["delta_time"] = boat["delta_time"].dt.total_seconds()

    boat["delta_5d"] = boat["time"] - stats[i]["last_5d"]
    boat["delta_5d"] = boat["delta_5d"].dt.total_seconds()

    #Rest of initial conditions
    for stat in stats[i].keys():
        if stat != "last_5d" and stat != "last_time":
            boat.at[0, stat] = stats[i][stat]

    boat["latitude"] = np.nan
    boat["longitude"] = np.nan

    #Iterative
    for j in range(1, boat.shape[0]):
        current_input = boat[FEATURES][j-1:j]
        current_pred = xgb.predict(current_input)[0]

        new_lat, new_lon = current_input["lat_lag"] + current_pred[0], current_input["lon_lag"] + current_pred[1]
        boat.at[j-1, "latitude"] = new_lat
        boat.at[j-1, "longitude"] = new_lon

        #Create next input
        boat.at[j, "lat_lag"] = new_lat
        boat.at[j, "lon_lag"] = new_lon

        boat.at[j, "delta_lat_lag"] = current_pred[0]
        boat.at[j, "delta_lon_lag"] = current_pred[1]

        prev_pos = (float(current_input["lat_lag"]), float(current_input["lon_lag"]))
        new_pos = (float(new_lat), float(new_lon))
        delta_time = boat.at[j, "delta_time"]

        v = get_velocities(prev_pos, new_pos, delta_time)

        boat.at[j, "v_x"] = v[0]
        boat.at[j, "v_y"] = v[1]

    for _,row in boat.iterrows():
        final.append(row.to_dict())


In [548]:
df = pd.DataFrame(final).sort_values("ID")

df.head()

Unnamed: 0,ID,vesselId,time,scaling_factor,delta_time,delta_5d,delta_lat_lag,delta_lon_lag,lat_lag,lon_lag,v_x,v_y,latitude,longitude
0,0,61e9f3aeb937134a3c4bfe3d,2024-05-08 00:03:16,0.3,900.0,259396.0,0.0,0.0,31.14648,-81.49789,-0.0,0.0,31.146461,-81.497669
233,1,61e9f473b937134a3c4c02df,2024-05-08 00:06:17,0.3,541.0,259577.0,6e-05,-6e-05,14.81694,120.29624,0.0,0.0,14.817718,120.295603
410,2,61e9f469b937134a3c4c029b,2024-05-08 00:10:02,0.3,654.0,259802.0,1.33755,0.4578,38.14875,10.75635,35.999017,5.059332,38.207781,10.751034
547,3,61e9f45bb937134a3c4c0221,2024-05-08 00:10:34,0.3,1080.0,259834.0,-0.00015,-0.00092,-43.53815,172.83516,0.151716,-0.121547,-43.537599,172.83551
697,4,61e9f38eb937134a3c4bfd8d,2024-05-08 00:12:27,0.3,1258.0,259947.0,0.0,-0.00055,48.53133,-6.1075,0.209,-0.544464,48.530652,-6.109547


In [550]:
vessel = "61e9f469b937134a3c4c029b"

fig = (
    ins.visualize_vessel_movements(processed[processed["vesselId"] == vessel])
)
fig.show()

fig = (
    ins.visualize_vessel_movements(df[df["vesselId"] == vessel])
)
fig.show()