In [1]:
from settings import *

from src.data.preprocessing import preprocess, features_input


import torch
from torch import nn

import pandas as pd
import joblib

from sklearn.preprocessing import MinMaxScaler

from typing import Dict, Union, Callable


In [2]:
from src.model.transformer_enc import EncoderModel

In [3]:
# Smoke test: build the encoder with its default arguments, compile it to
# TorchScript and write the scripted module to MODEL_FOLDER/test.pt.
# NOTE: the name `model` is rebound to a DecoderModel further down the notebook.
model = EncoderModel()

model_scripted = torch.jit.script(model)
model_scripted.save(MODEL_FOLDER.joinpath("test.pt")) 



In [3]:

# Load the raw AIS tables. The train file is pipe-delimited, the test file is
# comma-delimited; 'time' is converted to datetime in both.
ais_train = pd.read_csv(AIS_TRAIN, sep="|").assign(
    time=lambda frame: pd.to_datetime(frame["time"])
)
ais_test = pd.read_csv(AIS_TEST, sep=",").assign(
    time=lambda frame: pd.to_datetime(frame["time"])
)


In [8]:
# Scratch check: multiplying a list by 2 repeats its contents (it does not scale them).
list(range(10)) * 2

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [6]:
# Run configuration consumed by the pipeline() call and the prediction cells.
seq_len = 48  # input window length, forwarded to preprocess(seq_len=...) by pipeline()
do_preprocess = False  # False -> pipeline() loads the artefacts cached in LAST_PREPROCESS_FOLDER

In [4]:
def pipeline(seq_len = 48, do_preprocess = True):
    """Return the train/validation tensors, the test frame, the scaler and the dropped vessel ids.

    With ``do_preprocess=True`` the raw AIS CSVs are read, passed through
    ``preprocess`` and every result is cached in ``LAST_PREPROCESS_FOLDER``.
    With ``do_preprocess=False`` the cached artefacts are loaded instead; if
    loading fails, the function falls back to a full preprocessing run.

    Args:
        seq_len: input window length forwarded to ``preprocess``.
        do_preprocess: recompute everything (True) or reuse the cache (False).

    Returns:
        ``(X_train, X_val, y_train, y_val, test_set, scaler, dropped_vessel_ids)``.
    """
    if do_preprocess:
        ais_train = pd.read_csv(AIS_TRAIN, sep='|')
        ais_train['time'] = pd.to_datetime(ais_train['time'])

        ais_test = pd.read_csv(AIS_TEST, sep=",")
        ais_test['time'] = pd.to_datetime(ais_test['time'])

        X_train, X_val, y_train, y_val, test_set, scaler, dropped_vessel_ids = preprocess(
            ais_train,
            ais_test,
            seq_type="n_in_1_out",
            seq_len=seq_len,
            seq_len_out=1,
            verbose=True,
            to_torch=True,
            parallelize_seq = True,
            scaler=MinMaxScaler()
        )

        X_train = torch.Tensor(X_train)
        y_train = torch.Tensor(y_train)

        X_val = torch.Tensor(X_val)
        y_val = torch.Tensor(y_val)

        # The cache folder may not exist on a fresh checkout.
        LAST_PREPROCESS_FOLDER.mkdir(parents=True, exist_ok=True)

        torch.save(X_train, LAST_PREPROCESS_FOLDER.joinpath("X_train.pt"))
        torch.save(y_train, LAST_PREPROCESS_FOLDER.joinpath("y_train.pt"))
        torch.save(X_val, LAST_PREPROCESS_FOLDER.joinpath("X_val.pt"))
        torch.save(y_val, LAST_PREPROCESS_FOLDER.joinpath("y_val.pt"))

        joblib.dump(scaler, LAST_PREPROCESS_FOLDER.joinpath("scaler"))
        test_set.to_csv(LAST_PREPROCESS_FOLDER.joinpath("test_set.csv"))

        return X_train, X_val, y_train, y_val, test_set, scaler, dropped_vessel_ids

    try:
        X_train = torch.load(LAST_PREPROCESS_FOLDER.joinpath("X_train.pt"), weights_only=True)
        y_train = torch.load(LAST_PREPROCESS_FOLDER.joinpath("y_train.pt"), weights_only=True)
        X_val = torch.load(LAST_PREPROCESS_FOLDER.joinpath("X_val.pt"), weights_only=True)
        y_val = torch.load(LAST_PREPROCESS_FOLDER.joinpath("y_val.pt"), weights_only=True)

        # joblib.load unpickles arbitrary objects: only load caches written by this notebook.
        scaler = joblib.load(LAST_PREPROCESS_FOLDER.joinpath("scaler"))
        # test_set was written with its index as the first column; restore it
        # instead of picking up a spurious "Unnamed: 0" column.
        # NOTE(review): 'time' comes back as strings here, unlike the fresh run — confirm
        # nothing downstream needs datetimes.
        test_set = pd.read_csv(LAST_PREPROCESS_FOLDER.joinpath("test_set.csv"), index_col=0)

        # NOTE(review): the dropped ids are not cached, so they are hard-coded here;
        # they go stale if preprocessing ever drops a different set of vessels.
        dropped_vessel_ids = ['61e9f3adb937134a3c4bfe37', '61e9f3cbb937134a3c4bff09']

    except Exception as err:
        # `Exception` rather than a bare except, so Ctrl-C during loading still
        # interrupts instead of kicking off a full preprocessing run.
        print(f"ERROR: File missing in {str(LAST_PREPROCESS_FOLDER)}. Now run preprocessing...")
        print(f"Cause: {err!r}")
        # The original passed an unknown `paralize_preprocess` keyword here, which
        # turned every cache miss into a TypeError.
        return pipeline(seq_len=seq_len, do_preprocess=True)

    return X_train, X_val, y_train, y_val, test_set, scaler, dropped_vessel_ids


In [7]:
# Materialise the datasets using the values from the configuration cell.
X_train, X_val, y_train, y_val, test_set, scaler, dropped_vessel_ids = pipeline(
    seq_len=seq_len,
    do_preprocess=do_preprocess,
)

In [16]:
# Training rows of every vessel that is not listed in dropped_vessel_ids, ordered
# chronologically within each vessel. `Series.isin` is required because
# `not in` cannot be applied element-wise to a Series.
df_val = ais_train.loc[~ais_train["vesselId"].isin(dropped_vessel_ids)].sort_values(
    by=["vesselId", "time"]
)
df_val

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
131115,2024-01-12 14:07:47,308.1,17.1,-6,316,0,01-08 06:00,7.50361,77.58340,61e9f38eb937134a3c4bfd8b,61d376b393c6feb83e5eb50c
131279,2024-01-12 14:31:00,307.6,17.3,5,313,0,01-14 23:30,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546
131514,2024-01-12 14:57:23,306.8,16.9,5,312,0,01-14 23:30,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546
131696,2024-01-12 15:18:48,307.9,16.9,6,313,0,01-14 23:30,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546
131885,2024-01-12 15:39:47,307.0,16.3,7,313,0,01-14 23:30,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546
...,...,...,...,...,...,...,...,...,...,...,...
1521244,2024-05-07 22:36:16,324.1,13.5,-2,325,0,05-08 03:00,59.63337,21.43237,clh6aqawa0007gh0z9h6zi9bo,61d373b83aeaecc07011a62b
1521409,2024-05-07 22:57:05,324.2,13.3,-3,326,0,05-08 03:00,59.69588,21.34225,clh6aqawa0007gh0z9h6zi9bo,61d373b83aeaecc07011a62b
1521625,2024-05-07 23:17:54,356.5,12.2,-1,354,0,05-08 03:00,59.76388,21.35317,clh6aqawa0007gh0z9h6zi9bo,61d373b83aeaecc07011a62b
1521821,2024-05-07 23:38:13,52.6,17.3,3,50,0,05-08 03:00,59.83316,21.38489,clh6aqawa0007gh0z9h6zi9bo,61d373b83aeaecc07011a62b


In [40]:
import numpy as np


# Among the vessels present in the test set, find the one with the fewest rows in
# the training data. v_min ends up as that row count, v_id as the first vessel
# reaching it.
v_min, v_id = np.inf, ""
for vessel_id in ais_test["vesselId"].unique():
    n_train_rows = len(ais_train[ais_train["vesselId"] == vessel_id])
    if n_train_rows < v_min:
        v_id, v_min = vessel_id, n_train_rows

In [41]:
# Fewest training rows available for any vessel that appears in the test set.
v_min

328

In [44]:
import numpy as np


# Find the vessel with the most rows in the test set. v_max ends up as that row
# count, v_id as the first vessel reaching it.
v_max, v_id = -np.inf, ""
for vessel_id in ais_test["vesselId"].unique():
    n_test_rows = len(ais_test[ais_test["vesselId"] == vessel_id])
    if n_test_rows > v_max:
        v_id, v_max = vessel_id, n_test_rows

In [45]:
# Largest number of test rows belonging to a single vessel.
v_max

360

In [47]:
# Scratch check: averaging over dim=1 and reshaping keeps a length-1 middle
# dimension, i.e. (100, 48, 6) -> (100, 1, 6).
torch.empty(100, 48, 6).mean(dim=1).reshape(100,1,-1).shape

torch.Size([100, 1, 6])

In [34]:
from src.data.features import create_time_diff_feature

# Compute the time_diff feature and inspect it per vessel in chronological order.
ais_tr_dev = create_time_diff_feature(ais_train)
ais_tr_dev.sort_values(by=["vesselId", "time"]).loc[:, ["vesselId", "time", "time_diff"]]

Unnamed: 0,vesselId,time,time_diff
131115,61e9f38eb937134a3c4bfd8b,2024-01-12 14:07:47,1393.0
131279,61e9f38eb937134a3c4bfd8b,2024-01-12 14:31:00,1583.0
131514,61e9f38eb937134a3c4bfd8b,2024-01-12 14:57:23,1285.0
131696,61e9f38eb937134a3c4bfd8b,2024-01-12 15:18:48,1259.0
131885,61e9f38eb937134a3c4bfd8b,2024-01-12 15:39:47,901.0
...,...,...,...
1521244,clh6aqawa0007gh0z9h6zi9bo,2024-05-07 22:36:16,1249.0
1521409,clh6aqawa0007gh0z9h6zi9bo,2024-05-07 22:57:05,1249.0
1521625,clh6aqawa0007gh0z9h6zi9bo,2024-05-07 23:17:54,1219.0
1521821,clh6aqawa0007gh0z9h6zi9bo,2024-05-07 23:38:13,1248.0


In [36]:
# Manual check of time_diff: seconds between 15:18:48 and 15:39:47 (minutes and
# seconds only, same hour). The result, 1259, matches the time_diff shown on the
# 15:18:48 row in the table above.
39 * 60 + 47 - (18 * 60 + 48)

1259

In [24]:
# Total number of train + validation samples minus 48 per vessel, with 2 vessels
# subtracted from the vessel count.
# NOTE(review): 48 presumably stands for seq_len and 2 for the two dropped vessels — confirm.
(X_train.shape[0] + X_val.shape[0]) - (len(ais_train["vesselId"].unique()) - 2) * 48

1455491

In [8]:
# The same two vessel ids that pipeline() hard-codes for its cached branch.
# NOTE: this rebinds the value returned by pipeline().
dropped_vessel_ids = ['61e9f3adb937134a3c4bfe37', '61e9f3cbb937134a3c4bff09']

In [13]:
# Number of training rows for the first dropped vessel (31, fewer than seq_len = 48).
# NOTE(review): presumably the reason it was dropped — confirm in preprocess().
len(ais_train[ais_train["vesselId"]=="61e9f3adb937134a3c4bfe37"])

31

In [None]:
from typing import List
def make_df_val(
        train_set: pd.DataFrame,
        dropped_vessel_ids: List[str],
        idx: int,

    ) -> pd.DataFrame:
    """Return the rows of ``train_set`` whose vessel was not dropped, sorted by time.

    Args:
        train_set: AIS frame with at least the columns ``vesselId`` and ``time``.
        dropped_vessel_ids: vessel ids to exclude.
        idx: currently unused; kept so existing call sites keep working.

    Returns:
        A new DataFrame (``train_set`` is left untouched) sorted by ``time``.
    """
    # `Series not in list` raises "truth value of a Series is ambiguous";
    # element-wise membership needs `isin`. The id column is `vesselId`.
    kept_rows = ~train_set["vesselId"].isin(dropped_vessel_ids)
    return train_set.loc[kept_rows].sort_values("time")

In [None]:
# Training rows of every vessel not listed in dropped_vessel_ids, in time order.
# Fixes over the original line: `train_set` is not defined at notebook level
# (the frame is `ais_train`), the id column is `vesselId`, and element-wise
# membership needs `isin` rather than `not in`.
df_val = ais_train[~ais_train["vesselId"].isin(dropped_vessel_ids)].sort_values("time")


In [6]:
# Hyper-parameters for the transformer decoder layer.
dim_ffn = 126  # feed-forward width; NOTE(review): 126 rather than the usual 128 — confirm intentional
d_model = 32  # model width; divisible by nhead=8 (head size 4)
activation_dec: Union[str, Callable[[torch.Tensor], torch.Tensor]] = nn.SiLU()

# Keyword arguments unpacked into nn.TransformerDecoderLayer(**...).
transformer_decoder_params = {
    "d_model": d_model,
    "nhead": 8,
    # "num_encoder_layers": 6,
    # "num_decoder_layers": 2,
    "dim_feedforward": dim_ffn,
    "dropout": 0.1,
    "activation": activation_dec,
    "layer_norm_eps": 0.00001,
    "batch_first": True,  # inputs are (batch, sequence, feature)
    "norm_first": False,
    # "bias": True,
    "device": DEVICE,
}

In [7]:
class DecoderModel(nn.Module):
    def __init__(
            self,
            decoder_params: Dict[int,Union[int, float, bool]] = transformer_decoder_params, 
            num_features: int = 7, 
            num_outputs: int = 6, 
            num_layers: int = 1,
            act_out: nn.Module | None = None
        ) -> None:
        super().__init__()
        self.emb_layer = nn.Linear(num_features, d_model)
        dec_layer = nn.TransformerDecoderLayer(**decoder_params)
        self.model = nn.TransformerDecoder(dec_layer, num_layers=num_layers)
        self.ffn = nn.Linear(d_model, num_outputs)
        self.act_out = act_out # nn.Sigmoid()
        
    def forward(self, x):
        len_b, len_s, _ = x.shape
        emb = self.emb_layer(x)
        out = self.model(emb, emb)
        out = out[:, -1, :].view(len_b, 1, -1)
        
        if self.act_out:
            return self.act_out(self.ffn(out))
        return self.ffn(out)


# Rebinds `model` (previously the EncoderModel smoke test) to the decoder network.
# The sigmoid bounds every output to (0, 1).
# NOTE(review): presumably chosen because targets are MinMax-scaled — confirm in preprocess().
model = DecoderModel(act_out=nn.Sigmoid())

In [8]:
from src.train.trainer import Trainer


# Wrap the model in the project Trainer: MSE loss and AdamW with its default
# hyper-parameters (no learning rate or weight decay is passed), running on DEVICE.
trainer = Trainer(
    model=model,
    loss=nn.MSELoss(),
    optimizer=torch.optim.AdamW(params=model.parameters()),
    device=DEVICE,
)

In [9]:
# Move the datasets to the training device as float32 tensors.
# torch.as_tensor is used instead of the legacy torch.Tensor(...) constructor:
# it accepts tensors that already live on DEVICE, so this cell can be re-run
# without restarting the kernel, and it skips the copy when nothing changes.
X_train = torch.as_tensor(X_train, dtype=torch.float32, device=DEVICE)
y_train = torch.as_tensor(y_train, dtype=torch.float32, device=DEVICE)

X_val = torch.as_tensor(X_val, dtype=torch.float32, device=DEVICE)
y_val = torch.as_tensor(y_val, dtype=torch.float32, device=DEVICE)

In [10]:

# Train for a single epoch on the training tensors. The validation tensors are
# deliberately not passed (lines commented out).
# NOTE(review): the meaning of eval_on_test / k_folds is defined in
# src.train.trainer and is not visible from this notebook.
trainer.fit(
    X=X_train,
    y=y_train,
    # X_val=X_val,
    # y_val=y_val,
    epochs=1,
    eval_on_test=True,
    k_folds=0,
)


In [13]:
# Score the trained model on the held-out validation tensors.
score = trainer.eval(X_val, y_val)

In [None]:
import numpy as np

# Report the square root of the validation score.
# A torch tensor (possibly on the GPU) is converted explicitly instead of relying
# on a bare `except` to detect that np.sqrt could not handle it.
score_value = score.detach().cpu().numpy() if isinstance(score, torch.Tensor) else score
print("Score on validation set (rmse):", np.sqrt(score_value))


In [72]:
def iterative_forecast(seq, model, steps, sequence_length, device=None):
    """Autoregressively forecast ``steps`` rows of ``seq`` with a sliding window.

    Starting from the first ``sequence_length`` rows, each iteration feeds the
    current window to ``model.predict``, writes the prediction into row
    ``sequence_length + k`` of ``seq`` and slides the window forward by one row.

    Args:
        seq: 2-D array of shape (rows, features); MODIFIED IN PLACE. Must hold at
            least ``sequence_length + steps`` rows.
        model: object exposing ``predict(x)``; the result is indexed as
            ``[0, 0, :]`` and must yield ``features - 1`` values.
        steps: number of rows to forecast.
        sequence_length: window length fed to the model.
        device: torch device for the model input; ``None`` uses the global ``DEVICE``.

    Returns:
        List with one prediction per forecast step.
    """
    if device is None:
        device = DEVICE
    # Derive the feature count from the data instead of hard-coding 7.
    num_features = seq.shape[1]

    predicted = []
    current_sequence = seq[:sequence_length].reshape(1, sequence_length, num_features)
    for k in range(steps):
        x_test = torch.Tensor(current_sequence).to(device)
        y_pred = model.predict(x_test)[0, 0, :]

        predicted.append(y_pred)
        # The original indexed with the notebook global `seq_len` here and below,
        # silently ignoring the `sequence_length` argument.
        # NOTE(review): feature 0 of the new row is copied from row k + 1 rather
        # than kept from row sequence_length + k — confirm this is intended.
        seq[sequence_length + k] = np.array([seq[k + 1][0], *y_pred])

        current_sequence = seq[k + 1:k + 1 + sequence_length].reshape(
            1, sequence_length, num_features
        )
    print(np.array(predicted).shape)
    return predicted

In [None]:
# PREDICTION STEP
# For every vessel in the test set: forecast all rows after the first seq_len
# ones, write the forecasts into the frame, undo the feature scaling and collect
# the per-vessel frames into df_preds.

grouped_test = test_set.groupby("vesselId")

predictions = []

sequence_length = seq_len

# NOTE(review): `tqdm` is not imported in any visible cell; presumably it is
# re-exported by `from settings import *` — confirm.
for vessel_id, group in tqdm(grouped_test, colour="green"):
    # Work on an explicit copy: assigning into the slice handed out by groupby
    # triggers SettingWithCopyWarning and may not behave as intended.
    group = group.copy()
    forecast_steps = len(group) - sequence_length

    last_known_features = group[features_input].values

    future_preds = iterative_forecast(last_known_features, trainer, forecast_steps, sequence_length)

    group.loc[group.index[sequence_length:], ['cog', 'sog', 'rot', 'heading', 'latitude', 'longitude']] = future_preds

    group[features_input] = scaler.inverse_transform(group[features_input])
    predictions.append(group)

df_preds = pd.concat(predictions, ignore_index=True)

In [120]:
# Keep ID and coordinates, order by ID, take the first 51739 rows and renumber.
# NOTE(review): 51739 is presumably the number of rows expected in the submission — confirm.
res = (
    df_preds[["ID", "longitude", "latitude"]]
    .sort_values("ID")
    .iloc[:51739]
    .reset_index(drop=True)
)

In [None]:
# SUBMIT RESULT
import uuid
# res = pd.merge(ais_test, df_preds[["vesselId","time", "latitude", "longitude"]],on=["time", "vesselId"], how="left")
# Rename the coordinate columns to the names used by the submission file.
# `rename` ignores columns that are already renamed, so re-running this cell no
# longer raises a KeyError, and `res` is rebound rather than mutated in place.
res = res.rename(columns={"longitude": "longitude_predicted", "latitude": "latitude_predicted"})

def make_file_name() -> str:
    """Generate a random ``<uuid4>.csv`` file name, print it and return it."""
    file_name = f"{uuid.uuid4()}.csv"
    print(f"Submission file name is: {file_name}")
    return file_name

def submit(forecast: pd.DataFrame, file_name: str | None = None) -> None:
    """Write a submission CSV aligned with the sample submission's IDs.

    The IDs of the sample submission are left-joined with the predicted
    coordinates in ``forecast``, so every expected ID appears exactly as in the
    sample and IDs missing from ``forecast`` get empty predictions.

    Args:
        forecast: frame with the columns ``ID``, ``longitude_predicted`` and
            ``latitude_predicted``.
        file_name: target file name inside ``SUBMISSION_FODLER``; a random
            uuid-based name is generated when omitted.

    Raises:
        OSError: if the file cannot be written under a generated name.
    """
    sample_submission = pd.read_csv(AIS_SAMPLE_SUBMISSION)
    submission = sample_submission[["ID"]].merge(
        forecast[["ID", "longitude_predicted", "latitude_predicted"]],
        on="ID",
        how="left",
    )

    # A missing output folder is the most likely reason for a failed write.
    SUBMISSION_FODLER.mkdir(parents=True, exist_ok=True)

    target_name = file_name if file_name else make_file_name()
    try:
        submission.to_csv(SUBMISSION_FODLER.joinpath(target_name), index=False)
    except OSError:
        print("Error register file")
        # The original retried recursively inside a bare `except`, which recursed
        # without bound on a persistent failure. Retry once with a generated name,
        # and only when a caller-supplied name may have been the problem.
        if not file_name:
            raise
        submission.to_csv(SUBMISSION_FODLER.joinpath(make_file_name()), index=False)

# Write the submission; no file_name is given, so a random uuid-based name is generated.
submit(res)