In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import preprocess as pre

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import sktime


In [17]:
import sktime.datatypes

def sktimeProcess(data: pd.DataFrame):
    """
    Split a flat frame into feature and target frames indexed for sktime.

    The input frame is not modified: all work happens on a derived frame, so
    the function can be called repeatedly on the same ``data`` (for example
    when a notebook cell is re-run) without failing on columns that an
    earlier call would have removed.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns ``vesselId``, ``time``, ``latitude`` and
        ``longitude``. Every other column is kept as a feature.

    Returns
    -------
    X : pd.DataFrame
        All remaining feature columns, indexed by
        ``(numericalVesselId, time)`` and sorted on those two levels.
    y : pd.DataFrame
        The ``latitude`` and ``longitude`` columns on the same index.

    Raises
    ------
    Whatever ``sktime.datatypes.check_raise`` raises when ``X`` does not
    pass its ``pd-multiindex`` check.
    """

    # Add a numeric vessel ID: codes follow the order of first appearance.
    ids = data["vesselId"].unique()
    id_dict = {vessel: code for code, vessel in enumerate(ids)}

    # Build a new multi-indexed frame instead of editing the caller's frame
    # in place; the chained calls each return a fresh DataFrame.
    panel = (
        data
        .assign(numericalVesselId=data["vesselId"].map(id_dict))
        .drop(columns=["vesselId"])
        .set_index(["numericalVesselId", "time"])
        .sort_values(["numericalVesselId", "time"])
    )

    # Produce X, y
    targets = ["latitude", "longitude"]
    y = panel[targets]
    X = panel.drop(columns=targets)

    # Fail early if X is not in the layout sktime calls "pd-multiindex".
    sktime.datatypes.check_raise(X, mtype="pd-multiindex")

    return X, y



In [41]:
# Load the preprocessed AIS dump and build the train/test panels.
data = pd.read_csv("../CSV/big_files/ais_train_processed.csv", sep="|")
data.drop(columns=["portId"], inplace=True)

X, y = sktimeProcess(data)

# shuffle=False keeps the rows in their existing order, so the last 20 % of
# rows become the test set.
# NOTE(review): with rows ordered by vessel first, this looks like it holds
# out whole vessels rather than a later time window - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.2,
)

# Latitude-only target as a one-column frame carrying X_train's index.
y_train_lat = y_train["latitude"].to_frame()
y_train_lat.index = X_train.index

print(y_train_lat)


                              latitude
numericalVesselId time                
0                 1704067225 -34.74370
                  1704089348 -35.16787
                  1704091099 -35.16863
                  1704092335 -35.16805
                  1704093356 -35.16715
...                                ...
473               1711661889   1.18471
                  1711663094   1.20012
                  1711664055   1.21734
                  1713162956 -34.68903
                  1713163130 -34.68007

[1214641 rows x 1 columns]


In [9]:
def evaluate(model: str, y_pred):
    """
    Print MSE, R-squared and adjusted R-squared for each predicted coordinate.

    Scores are computed against the notebook-level ``y_test``; the predictor
    count used in the adjusted R-squared comes from the notebook-level
    ``X_test``. Each coordinate is scored on its own column, so latitude and
    longitude get separate numbers, and a coordinate that is missing from
    ``y_pred`` is skipped instead of being compared against the wrong column.

    Parameters
    ----------
    model : str
        Label printed in the report header.
    y_pred : DataFrame, Series or array-like
        Predictions for the rows of ``y_test``, in the same row order.
        Columns named ``latitude`` / ``longitude`` are matched by name;
        input without those names is matched to ``y_test``'s columns by
        position.
    """

    n = len(y_test)
    p = X_test.shape[1]

    # Normalise the predictions to a frame so columns can be looked up by name.
    pred = y_pred if isinstance(y_pred, pd.DataFrame) else pd.DataFrame(y_pred)
    if not set(pred.columns) & set(y_test.columns):
        # Unlabelled predictions (e.g. a bare array): assume the column order
        # follows y_test. rename() returns a new frame, so y_pred is untouched.
        pred = pred.rename(columns=dict(zip(pred.columns, y_test.columns)))

    print(f"---- {model} Metrics ----")
    for column, label in (("latitude", "Latitude"), ("longitude", "Longitude")):
        if column not in pred.columns or column not in y_test.columns:
            continue

        # Compare by position, not by index label.
        actual = y_test[column].to_numpy()
        predicted = pred[column].to_numpy()

        mse = mean_squared_error(actual, predicted)
        r2 = r2_score(actual, predicted)
        r2_adj = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

        print(f"Mean Squared Error ({label}): {mse:.4f}")
        print(f"R-squared ({label}): {r2:.4f}, Adjusted R-squared ({label}): {r2_adj: .4f}")


In [43]:
# --- Random Forest ---
from sktime.forecasting.compose import make_reduction
from sklearn.ensemble import RandomForestRegressor

start = time.time()
rfr = RandomForestRegressor()
rfr = make_reduction(rfr, strategy="recursive")
# sktime forecasters take the target series as `y` and the exogenous features
# as `X`; keyword arguments keep the two roles from being swapped.
rfr.fit(y=y_train_lat, X=X_train)
print(time.time()-start)

# NOTE(review): a forecaster's predict() takes the forecasting horizon `fh`
# as its first positional argument and exogenous data as `X=`. This call
# hands X_test in as `fh`. Which horizon is right depends on how the test
# period is defined, so it is left unchanged here - TODO: pass an explicit
# `fh` and `X=X_test`.
rfr_pred_lat = rfr.predict(X_test)

evaluate("Random Forest", rfr_pred_lat)

KeyboardInterrupt: 