In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import preprocess as pre

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import sktime


In [73]:
import sktime.datatypes

def sktimeProcess(data: pd.DataFrame):
    """
    Given dataframe data, return dataframes X,y in the format sktime uses.

    The input frame is NOT modified; all work happens on derived copies, so the
    function can be called repeatedly on the same frame (e.g. when a notebook
    cell is re-run).

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns "vesselId", "time", "latitude" and
        "longitude". Every other column is carried over into X unchanged.

    Returns
    -------
    X : pd.DataFrame
        Rows sorted by (numericalVesselId, time) and indexed by those two
        levels, with the "latitude"/"longitude" columns removed.
    y : pd.DataFrame
        The "latitude" and "longitude" columns in the same row order as X,
        with a fresh 0..n-1 index (i.e. aligned with X by position only).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    Exception
        Whatever sktime.datatypes.check_raise raises when X does not conform
        to the "pd-multiindex" mtype.
    """
    required = ["vesselId", "time", "latitude", "longitude"]
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise KeyError(f"sktimeProcess: missing required column(s): {missing}")

    #Add a numeric vessel ID: codes are handed out in order of first appearance,
    #which matches enumerating data["vesselId"].unique(). Missing IDs get
    #factorize's sentinel code (-1).
    vessel_codes, _ = pd.factorize(data["vesselId"])

    #Sort a derived frame so the caller's frame keeps its columns, order and index
    ordered = data.assign(numericalVesselId=vessel_codes).sort_values(
        ["numericalVesselId", "time"]
    )

    #Fix y after sorting so its rows line up positionally with X
    y = ordered[["latitude", "longitude"]].reset_index(drop=True)

    #Targets must not leak into the features; (vessel, time) becomes the MultiIndex.
    #NOTE(review): the original "vesselId" column is still present in X as a
    #feature next to its numeric code -- confirm that is intended.
    X = ordered.drop(columns=["latitude", "longitude"]).set_index(
        ["numericalVesselId", "time"]
    )

    #Check if this is correct format
    #NOTE(review): y has one row per time point; presumably sktime panel
    #regressors expect one target per series instance -- TODO confirm.
    sktime.datatypes.check_raise(X, mtype="pd-multiindex")

    return X, y



In [78]:
# Build the train/test frames.
# The processed AIS file is "|"-separated; the port identifier column is discarded on load.
data = pd.read_csv("../CSV/big_files/ais_train_processed.csv", sep="|").drop(
    columns=["portId"]
)

# X is indexed by (numericalVesselId, time); y holds the latitude/longitude targets.
X, y = sktimeProcess(data)

# shuffle=False keeps the sorted row order, so the final 20% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

Unnamed: 0,latitude,longitude
1214636,1.18471,103.61586
1214637,1.20012,103.53526
1214638,1.21734,103.47002
1214639,-34.68903,18.51702
1214640,-34.68007,18.504


In [79]:
def evaluate(model: str, y_pred):
    """
    Print per-coordinate regression metrics for predictions on the test split.

    Reads the notebook-level globals y_test (true targets, column 0 = latitude,
    column 1 = longitude) and X_test (only its column count is used, as the
    number of predictors in the adjusted R-squared).

    Parameters
    ----------
    model : str
        Label printed in the report header.
    y_pred : array-like of shape (len(y_test), 2)
        Predicted latitude (column 0) and longitude (column 1).

    Raises
    ------
    ValueError
        If y_test is not two-column or y_pred does not have the same shape.
    """
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.ndim != 2 or actual.shape[1] != 2 or predicted.shape != actual.shape:
        raise ValueError(
            "evaluate expects y_test and y_pred of identical shape (n, 2); "
            f"got {actual.shape} and {predicted.shape}"
        )

    n = len(y_test)
    p = X_test.shape[1]
    # Residual degrees of freedom; adjusted R-squared is undefined when this is <= 0.
    dof = n - p - 1

    def _column_metrics(col: int):
        """Return (mse, r2, adjusted r2) for a single target column."""
        mse = mean_squared_error(actual[:, col], predicted[:, col])
        r2 = r2_score(actual[:, col], predicted[:, col])
        r2_adj = 1 - (1 - r2) * ((n - 1) / dof) if dof > 0 else float("nan")
        return mse, r2, r2_adj

    # Each coordinate is scored on its own column; scoring the full two-column
    # arrays would give one averaged number reported twice.
    mse_latitude, r2_latitude, r2_latitude_adj = _column_metrics(0)
    mse_longitude, r2_longitude, r2_longitude_adj = _column_metrics(1)

    print(f"---- {model} Metrics ----")
    print(f"Mean Squared Error (Latitude): {mse_latitude:.4f}")
    print(f"R-squared (Latitude): {r2_latitude:.4f}, Adjusted R-squared (Latitude): {r2_latitude_adj: .4f}")
    print(f"Mean Squared Error (Longitude): {mse_longitude:.4f}")
    print(f"R-squared (Longitude): {r2_longitude:.4f}, Adjusted R-squared (Longitude): {r2_longitude_adj: .4f}")


In [79]:
# --- Time Series KNN ---
from sktime.regression.distance_based import KNeighborsTimeSeriesRegressor

# NOTE(review): the recorded run of this cell ended in a TypeError saying X is
# not in an sktime-compatible format ("obj must have a MultiIndex, found Index").
# Verify that X_train.index / X_test.index are still a MultiIndex when this
# cell runs -- TODO confirm on a fresh kernel (Restart & Run All).

# perf_counter is a monotonic clock meant for measuring elapsed time;
# only the fit call is timed, prediction is not.
start = time.perf_counter()
KNNTS = KNeighborsTimeSeriesRegressor(n_neighbors=3, distance="euclidean")
KNNTS.fit(X_train, y_train)
print(f"KNNTS fit time: {time.perf_counter() - start:.2f} s")

knnts_pred = KNNTS.predict(X_test)

# Prints latitude/longitude metrics against the global y_test
evaluate("KNNTS", knnts_pred)

TypeError: X must be in an sktime compatible format. Allowed scitypes for regressors are Panel mtypes, for instance a pandas.DataFrame with MultiIndex and last(-1) level an sktime compatible time index. Allowed compatible mtype format specifications are: ['nested_univ', 'numpy3D', 'numpyflat', 'pd-multiindex', 'pd-wide', 'pd-long', 'df-list', 'gluonts_ListDataset_panel', 'gluonts_PandasDataset_panel', 'polars_panel'] . See the data format tutorial examples/AA_datatypes_and_datasets.ipynb. If you think the data is already in an sktime supported input format, run sktime.datatypes.check_raise(data, mtype) to diagnose the error, where mtype is the string of the type specification you want. Error message for checked mtypes, in format [mtype: message], as follows: [df-list: obj must be list of pd.DataFrame, found <class 'pandas.core.frame.DataFrame'>]  [numpy3D: obj must be a numpy.ndarray, found <class 'pandas.core.frame.DataFrame'>]  [pd-multiindex: obj must have a MultiIndex, found <class 'pandas.core.indexes.base.Index'>]  [nested_univ: obj All columns must be object, found <class 'pandas.core.frame.DataFrame'>] 