# AIS Trajectory

### Importere biblioteker

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import xgboost as xgb
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Label encoding
from sklearn.preprocessing import LabelEncoder



In [2]:
# Load data
ais_evaluations = pd.read_csv('data/ais_test.csv')
ais_train = pd.read_csv('data/ais_train.csv', sep='|')

# Preprocess data
ais_train['datetime'] = pd.to_datetime(ais_train['time'])
ais_evaluations['datetime'] = pd.to_datetime(ais_evaluations['time'])


# Dropp the vessels that appears less than 100 times 
vessel_counts = ais_train['vesselId'].value_counts().reset_index()
vessel_counts.columns = ['vesselId', 'count']
vessel_counts = vessel_counts[vessel_counts['count'] > 200]

# Convert time to integer
ais_train = ais_train.sort_values("datetime")
ais_train['time_idx'] = ((ais_train['datetime'] - ais_train['datetime'].min()).dt.total_seconds() / 3600).astype(int)

ais_evaluations = ais_evaluations.sort_values("datetime")
ais_evaluations['time_idx'] = ((ais_evaluations['datetime'] - ais_evaluations['datetime'].min()).dt.total_seconds() / 3600).astype(int)

# Drop etaRaw
ais_train = ais_train.drop(columns=['etaRaw', 'portId'])

# Create time features and drop datetime
ais_train['hour'] = ais_train['datetime'].dt.hour
ais_train['weekday'] = ais_train['datetime'].dt.weekday
ais_train['month'] = ais_train['datetime'].dt.month
ais_train['year'] = ais_train['datetime'].dt.year

ais_evaluations['hour'] = ais_evaluations['datetime'].dt.hour
ais_evaluations['weekday'] = ais_evaluations['datetime'].dt.weekday
ais_evaluations['month'] = ais_evaluations['datetime'].dt.month
ais_evaluations['year'] = ais_evaluations['datetime'].dt.year

# Drop datetime
ais_train = ais_train.drop(columns=['time'])
ais_evaluations = ais_evaluations.drop(columns=['time'])

# Label encoder
label_encoder = LabelEncoder()
ais_train['vesselId'] = label_encoder.fit_transform(ais_train['vesselId'])
ais_evaluations['vesselId'] = label_encoder.transform(ais_evaluations['vesselId'])

display(ais_evaluations)
display(ais_train)

df = ais_train.copy()


Unnamed: 0,ID,vesselId,scaling_factor,datetime,time_idx,hour,weekday,month,year
0,0,84,0.3,2024-05-08 00:03:16,0,0,2,5,2024
1,1,623,0.3,2024-05-08 00:06:17,0,0,2,5,2024
2,2,596,0.3,2024-05-08 00:10:02,0,0,2,5,2024
3,3,542,0.3,2024-05-08 00:10:34,0,0,2,5,2024
4,4,1,0.3,2024-05-08 00:12:27,0,0,2,5,2024
...,...,...,...,...,...,...,...,...,...
51737,51737,574,0.1,2024-05-12 23:59:58,119,23,6,5,2024
51733,51733,332,0.1,2024-05-12 23:59:58,119,23,6,5,2024
51732,51732,187,0.1,2024-05-12 23:59:58,119,23,6,5,2024
51734,51734,48,0.1,2024-05-12 23:59:58,119,23,6,5,2024


Unnamed: 0,cog,sog,rot,heading,navstat,latitude,longitude,vesselId,datetime,time_idx,hour,weekday,month,year
0,284.0,0.7,0,88,0,-34.74370,-57.85130,50,2024-01-01 00:00:25,0,0,0,1,2024
1,109.6,0.0,-6,347,1,8.89440,-79.47939,189,2024-01-01 00:00:36,0,0,0,1,2024
2,111.0,11.0,0,112,0,39.19065,-76.47567,432,2024-01-01 00:01:45,0,0,0,1,2024
3,96.4,0.0,0,142,1,-34.41189,151.02067,110,2024-01-01 00:03:11,0,0,0,1,2024
4,214.0,19.7,0,215,0,35.88379,-5.91636,356,2024-01-01 00:03:51,0,0,0,1,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522055,25.1,2.8,2,22,0,41.33699,2.15130,37,2024-05-07 23:59:07,3071,23,1,5,2024
1522062,269.8,14.9,-1,270,0,49.71372,-5.22042,459,2024-05-07 23:59:08,3071,23,1,5,2024
1522063,8.0,18.7,0,6,0,38.27895,10.78280,596,2024-05-07 23:59:08,3071,23,1,5,2024
1522061,12.3,17.1,0,13,0,38.96142,-12.00502,85,2024-05-07 23:59:08,3071,23,1,5,2024


In [3]:
display(df)

def create_features(df):
    """
    Create time series features from datetime

    """
    df['second'] = df['datetime'].dt.second
    df['minute'] = df['datetime'].dt.minute
    df['hour'] = df['datetime'].dt.hour
    df['weekday'] = df['datetime'].dt.weekday
    df['month'] = df['datetime'].dt.month
    df['year'] = df['datetime'].dt.year
    return df

df = create_features(df)

Unnamed: 0,cog,sog,rot,heading,navstat,latitude,longitude,vesselId,datetime,time_idx,hour,weekday,month,year
0,284.0,0.7,0,88,0,-34.74370,-57.85130,50,2024-01-01 00:00:25,0,0,0,1,2024
1,109.6,0.0,-6,347,1,8.89440,-79.47939,189,2024-01-01 00:00:36,0,0,0,1,2024
2,111.0,11.0,0,112,0,39.19065,-76.47567,432,2024-01-01 00:01:45,0,0,0,1,2024
3,96.4,0.0,0,142,1,-34.41189,151.02067,110,2024-01-01 00:03:11,0,0,0,1,2024
4,214.0,19.7,0,215,0,35.88379,-5.91636,356,2024-01-01 00:03:51,0,0,0,1,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522055,25.1,2.8,2,22,0,41.33699,2.15130,37,2024-05-07 23:59:07,3071,23,1,5,2024
1522062,269.8,14.9,-1,270,0,49.71372,-5.22042,459,2024-05-07 23:59:08,3071,23,1,5,2024
1522063,8.0,18.7,0,6,0,38.27895,10.78280,596,2024-05-07 23:59:08,3071,23,1,5,2024
1522061,12.3,17.1,0,13,0,38.96142,-12.00502,85,2024-05-07 23:59:08,3071,23,1,5,2024


In [4]:
# Lag features

features = ['latitude', 'longitude', 'sog', 'cog', 'second', 'minute', 'hour', 'weekday', 'month', 'year']

def add_lags(df):
    for feature in features:
        for i in range(1, 4):
            df[f'{feature}_lag_{i}'] = df.groupby('vesselId')[feature].shift(i)

    return df

df = add_lags(df)

display(df.sort_values(['vesselId', 'time_idx']))

# Exract all the features
features_after_lag = [col for col in df.columns if col not in ['datetime', 'time_idx', 'vesselId', 'latitude', 'longitude']]

Unnamed: 0,cog,sog,rot,heading,navstat,latitude,longitude,vesselId,datetime,time_idx,...,hour_lag_3,weekday_lag_1,weekday_lag_2,weekday_lag_3,month_lag_1,month_lag_2,month_lag_3,year_lag_1,year_lag_2,year_lag_3
131115,308.1,17.1,-6,316,0,7.50361,77.58340,0,2024-01-12 14:07:47,278,...,,,,,,,,,,
131279,307.6,17.3,5,313,0,7.57302,77.49505,0,2024-01-12 14:31:00,278,...,,4.0,,,1.0,,,2024.0,,
131514,306.8,16.9,5,312,0,7.65043,77.39404,0,2024-01-12 14:57:23,278,...,,4.0,4.0,,1.0,1.0,,2024.0,2024.0,
131696,307.9,16.9,6,313,0,7.71275,77.31394,0,2024-01-12 15:18:48,279,...,14.0,4.0,4.0,4.0,1.0,1.0,1.0,2024.0,2024.0,2024.0
131885,307.0,16.3,7,313,0,7.77191,77.23585,0,2024-01-12 15:39:47,279,...,14.0,4.0,4.0,4.0,1.0,1.0,1.0,2024.0,2024.0,2024.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521244,324.1,13.5,-2,325,0,59.63337,21.43237,687,2024-05-07 22:36:16,3070,...,21.0,1.0,1.0,1.0,5.0,5.0,5.0,2024.0,2024.0,2024.0
1521409,324.2,13.3,-3,326,0,59.69588,21.34225,687,2024-05-07 22:57:05,3070,...,21.0,1.0,1.0,1.0,5.0,5.0,5.0,2024.0,2024.0,2024.0
1521625,356.5,12.2,-1,354,0,59.76388,21.35317,687,2024-05-07 23:17:54,3071,...,22.0,1.0,1.0,1.0,5.0,5.0,5.0,2024.0,2024.0,2024.0
1521821,52.6,17.3,3,50,0,59.83316,21.38489,687,2024-05-07 23:38:13,3071,...,22.0,1.0,1.0,1.0,5.0,5.0,5.0,2024.0,2024.0,2024.0


In [5]:
# Train using cross validation

tss = TimeSeriesSplit(n_splits=5, test_size=1000, gap=1)
df = df.sort_values('time_idx')

fold = 0
preds = []
scores = []


for train_index, test_index in tss.split(df):
    train = df.iloc[train_index]
    test = df.iloc[test_index]

    train = create_features(train)
    test = create_features(test)

    FEATURES = features_after_lag

    TARGETS = ['latitude', 'longitude']

    X_train = train[FEATURES]
    y_train = train[TARGETS]

    X_test = test[FEATURES]
    y_test = test[TARGETS]

    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                            n_estimators=100,
                            early_stopping_rounds=10,
                            objective='reg:linear',
                            max_depth=3,
                            learning_rate=0.01,
    )

    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)
    
    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['second'] = df['datetime'].dt.second
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['minute'] = df['datetime'].dt.minute
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df['datetime'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

[0]	validation_0-rmse:57.19422	validation_1-rmse:53.99097
[99]	validation_0-rmse:21.81415	validation_1-rmse:21.67503


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['second'] = df['datetime'].dt.second
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['minute'] = df['datetime'].dt.minute
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df['datetime'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

[0]	validation_0-rmse:57.19217	validation_1-rmse:53.93666
[99]	validation_0-rmse:21.81402	validation_1-rmse:20.54904


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['second'] = df['datetime'].dt.second
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['minute'] = df['datetime'].dt.minute
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df['datetime'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

[0]	validation_0-rmse:57.19010	validation_1-rmse:53.68280
[99]	validation_0-rmse:21.81330	validation_1-rmse:20.44725


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['second'] = df['datetime'].dt.second
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['minute'] = df['datetime'].dt.minute
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df['datetime'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

[0]	validation_0-rmse:57.18783	validation_1-rmse:54.62201
[99]	validation_0-rmse:21.81249	validation_1-rmse:20.80080


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['second'] = df['datetime'].dt.second
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['minute'] = df['datetime'].dt.minute
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df['datetime'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

[0]	validation_0-rmse:57.18621	validation_1-rmse:53.22176
[99]	validation_0-rmse:21.81180	validation_1-rmse:20.13884


In [6]:
print(f"Score across folds {np.mean(scores):0.4f}")
print(f"Fold Scores: {scores}")

Score across folds 20.7222
Fold Scores: [21.675025875830094, 20.549035211779692, 20.447252039951522, 20.800804655799407, 20.13884466546735]


In [7]:
# Predicting the future
df = create_features(df)

FEATURES = features_after_lag  # Ensure FEATURES is a flat list
TARGETS = ['latitude', 'longitude']

X_all = df[FEATURES]
y_all = df[TARGETS]

reg = xgb.XGBRegressor(base_score=0.5,
                        booster='gbtree',
                        n_estimators=500,
                        objective='reg:linear',
                        max_depth=3,
                        learning_rate=0.01,)

reg.fit(X_all, y_all,
        eval_set=[(X_all, y_all)],
        verbose=100)



[0]	validation_0-rmse:57.18368
[100]	validation_0-rmse:21.60496
[200]	validation_0-rmse:8.99063
[300]	validation_0-rmse:5.16689
[400]	validation_0-rmse:4.35318
[499]	validation_0-rmse:4.19160


In [8]:
# create future dataframe
df = ais_train.copy()
future = 3071 + np.arange(1000)
future_df = pd.DataFrame(future, columns=['time_idx'])

future_df['isFuture'] = True
df['isFuture'] = False

df_and_future = pd.concat([df, future_df]).copy()
df_and_future = create_features(df_and_future).copy()
df_and_future = add_lags(df_and_future).copy()

display(future_df)
display(df_and_future)


Unnamed: 0,time_idx,isFuture
0,3071,True
1,3072,True
2,3073,True
3,3074,True
4,3075,True
...,...,...
995,4066,True
996,4067,True
997,4068,True
998,4069,True


Unnamed: 0,cog,sog,rot,heading,navstat,latitude,longitude,vesselId,datetime,time_idx,...,hour_lag_3,weekday_lag_1,weekday_lag_2,weekday_lag_3,month_lag_1,month_lag_2,month_lag_3,year_lag_1,year_lag_2,year_lag_3
0,284.0,0.7,0.0,88.0,0.0,-34.74370,-57.85130,50.0,2024-01-01 00:00:25,0,...,,,,,,,,,,
1,109.6,0.0,-6.0,347.0,1.0,8.89440,-79.47939,189.0,2024-01-01 00:00:36,0,...,,,,,,,,,,
2,111.0,11.0,0.0,112.0,0.0,39.19065,-76.47567,432.0,2024-01-01 00:01:45,0,...,,,,,,,,,,
3,96.4,0.0,0.0,142.0,1.0,-34.41189,151.02067,110.0,2024-01-01 00:03:11,0,...,,,,,,,,,,
4,214.0,19.7,0.0,215.0,0.0,35.88379,-5.91636,356.0,2024-01-01 00:03:51,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,,,,,,,,NaT,4066,...,,,,,,,,,,
996,,,,,,,,,NaT,4067,...,,,,,,,,,,
997,,,,,,,,,NaT,4068,...,,,,,,,,,,
998,,,,,,,,,NaT,4069,...,,,,,,,,,,


In [9]:
future_w_features = df_and_future.query('isFuture == True').copy()

In [10]:
future_w_features[['longitude_predicted', 'latitude_predicted']] = reg.predict(future_w_features[FEATURES])

display(future_w_features)

Unnamed: 0,cog,sog,rot,heading,navstat,latitude,longitude,vesselId,datetime,time_idx,...,weekday_lag_2,weekday_lag_3,month_lag_1,month_lag_2,month_lag_3,year_lag_1,year_lag_2,year_lag_3,longitude_predicted,latitude_predicted
0,,,,,,,,,NaT,3071,...,,,,,,,,,22.291279,26.434549
1,,,,,,,,,NaT,3072,...,,,,,,,,,22.291279,26.434549
2,,,,,,,,,NaT,3073,...,,,,,,,,,22.291279,26.434549
3,,,,,,,,,NaT,3074,...,,,,,,,,,22.291279,26.434549
4,,,,,,,,,NaT,3075,...,,,,,,,,,22.291279,26.434549
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,,,,,,,,NaT,4066,...,,,,,,,,,22.291279,26.434549
996,,,,,,,,,NaT,4067,...,,,,,,,,,22.291279,26.434549
997,,,,,,,,,NaT,4068,...,,,,,,,,,22.291279,26.434549
998,,,,,,,,,NaT,4069,...,,,,,,,,,22.291279,26.434549
