# AIS Trajectory

### Importere biblioteker

In [79]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb

import matplotlib as plt

# split sklearn
from sklearn.model_selection import train_test_split

### Importer data

In [80]:
train_df = pd.read_csv('data/ais_train.csv', sep='|')
test_df = pd.read_csv('data/ais_test.csv')

ports_df = pd.read_csv('data/ports.csv', sep='|')  
schedule_df = pd.read_csv('data/schedules_to_may_2024.csv', sep='|')
vessels_df = pd.read_csv('data/vessels.csv', sep='|')

# rename longitude and latitude in ports
ports_df = ports_df.rename(columns={'longitude': 'port_longitude', 'latitude': 'port_latitude'})

#vessel0 = train_df['vesselId'][0]
original_train_df = train_df.copy(deep = True)
original_test_df = test_df.copy(deep = True)

display(train_df.head())

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3


### Pre-prosessering

In [81]:
train_df = pd.read_csv('data/ais_train.csv', sep='|')
test_df = pd.read_csv('data/ais_test.csv')

# Drop columns that are not in the test data
train_df = train_df.drop(columns=['cog', 'sog', 'rot', 'heading', 'etaRaw', 'navstat'])

# Label encode the vesselId
le = LabelEncoder()
train_df['vesselId'] = le.fit_transform(train_df['vesselId'])
test_df['vesselId'] = le.transform(test_df['vesselId'])

# Navstat 0 and 8 means the vessel is moving
#train_df['Moored'] = train_df['navstat'].apply(lambda x: False if x in [0, 8] else True)
#train_df = train_df.drop(columns=['navstat'])

# merge port data with train data
train_df = pd.merge(train_df, ports_df, how='left', left_on='portId', right_on='portId')

# Remove nan
train_df = train_df.dropna()


# Convert timestamps to datetime
train_df['time'] = pd.to_datetime(train_df['time'])
test_df['time'] = pd.to_datetime(test_df['time'])

display(train_df)
display(test_df)


Unnamed: 0,time,latitude,longitude,vesselId,portId,name,portLocation,port_longitude,port_latitude,UN_LOCODE,countryName,ISO
0,2024-01-01 00:00:25,-34.74370,-57.85130,50,61d371c43aeaecc07011a37f,Puerto San Antonio,San Antonio,-71.618889,-33.587500,CLSAI,Chile,CL
1,2024-01-01 00:00:36,8.89440,-79.47939,189,634c4de270937fc01c3a7689,"Panamá, Ciudad de","Panamá, Ciudad de",-79.533000,8.967000,PAPTY,Panama,PA
2,2024-01-01 00:01:45,39.19065,-76.47567,432,61d3847bb7b7526e1adf3d19,Port of Baltimore,Baltimore,-76.558889,39.232500,USBAL,United States,US
3,2024-01-01 00:03:11,-34.41189,151.02067,110,61d36f770a1807568ff9a126,Port of Port Kembla,Port Kembla,150.899444,-34.462500,AUPKL,Australia,AU
4,2024-01-01 00:03:51,35.88379,-5.91636,356,634c4de270937fc01c3a74f3,Tangier,Tangier,-5.817000,35.783000,MATNG,Morocco,MA
...,...,...,...,...,...,...,...,...,...,...,...,...
1522060,2024-05-07 23:59:07,52.19131,-5.82223,682,634c4de270937fc01c3a7417,Waterford City Quays,Waterford City Quays,-7.100000,52.250000,IEWCQ,Ireland,IE
1522061,2024-05-07 23:59:08,38.96142,-12.00502,85,634c4de270937fc01c3a76a1,Cascais,Cascais,-9.417000,38.700000,PTCAS,Portugal,PT
1522062,2024-05-07 23:59:08,49.71372,-5.22042,459,634c4de270937fc01c3a787b,Porthleven,Porthleven,-5.317000,50.083000,GBPLV,United Kingdom,GB
1522063,2024-05-07 23:59:08,38.27895,10.78280,596,61d3781293c6feb83e5eb73b,Port of Civitavecchia,Civitavecchia,11.780833,42.098889,ITCVV,Italy,IT


Unnamed: 0,ID,vesselId,time,scaling_factor
0,0,84,2024-05-08 00:03:16,0.3
1,1,623,2024-05-08 00:06:17,0.3
2,2,596,2024-05-08 00:10:02,0.3
3,3,542,2024-05-08 00:10:34,0.3
4,4,1,2024-05-08 00:12:27,0.3
...,...,...,...,...
51734,51734,48,2024-05-12 23:59:58,0.1
51735,51735,110,2024-05-12 23:59:58,0.1
51736,51736,610,2024-05-12 23:59:58,0.1
51737,51737,574,2024-05-12 23:59:58,0.1


### Feature engineering

In [82]:
def convert_time_to_features(df):
    df['hour'] = df['time'].dt.hour
    df['weekday'] = df['time'].dt.weekday
    df['month'] = df['time'].dt.month
    df['day'] = df['time'].dt.day

    df['time'] = df['time'].astype(int) / 10**9
    
    # Add sinusoidal features for seasonality
    df['sin_hour'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['cos_hour'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['sin_weekday'] = np.sin(2 * np.pi * df['weekday'] / 7)
    df['cos_weekday'] = np.cos(2 * np.pi * df['weekday'] / 7)
    df['sin_month'] = np.sin(2 * np.pi * df['month'] / 12)
    df['cos_month'] = np.cos(2 * np.pi * df['month'] / 12)
    
    df = df.drop(columns=['hour', 'weekday', 'month', 'day'])
    return df

# Apply the function to train_df and test_df
train_df = convert_time_to_features(train_df)
test_df = convert_time_to_features(test_df)


display(train_df)
display(test_df)


Unnamed: 0,latitude,longitude,vesselId,portId,name,portLocation,port_longitude,port_latitude,UN_LOCODE,countryName,ISO,sin_hour,cos_hour,sin_weekday,cos_weekday,sin_month,cos_month
0,-34.74370,-57.85130,50,61d371c43aeaecc07011a37f,Puerto San Antonio,San Antonio,-71.618889,-33.587500,CLSAI,Chile,CL,0.000000,1.000000,0.000000,1.00000,0.5,0.866025
1,8.89440,-79.47939,189,634c4de270937fc01c3a7689,"Panamá, Ciudad de","Panamá, Ciudad de",-79.533000,8.967000,PAPTY,Panama,PA,0.000000,1.000000,0.000000,1.00000,0.5,0.866025
2,39.19065,-76.47567,432,61d3847bb7b7526e1adf3d19,Port of Baltimore,Baltimore,-76.558889,39.232500,USBAL,United States,US,0.000000,1.000000,0.000000,1.00000,0.5,0.866025
3,-34.41189,151.02067,110,61d36f770a1807568ff9a126,Port of Port Kembla,Port Kembla,150.899444,-34.462500,AUPKL,Australia,AU,0.000000,1.000000,0.000000,1.00000,0.5,0.866025
4,35.88379,-5.91636,356,634c4de270937fc01c3a74f3,Tangier,Tangier,-5.817000,35.783000,MATNG,Morocco,MA,0.000000,1.000000,0.000000,1.00000,0.5,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522060,52.19131,-5.82223,682,634c4de270937fc01c3a7417,Waterford City Quays,Waterford City Quays,-7.100000,52.250000,IEWCQ,Ireland,IE,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025
1522061,38.96142,-12.00502,85,634c4de270937fc01c3a76a1,Cascais,Cascais,-9.417000,38.700000,PTCAS,Portugal,PT,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025
1522062,49.71372,-5.22042,459,634c4de270937fc01c3a787b,Porthleven,Porthleven,-5.317000,50.083000,GBPLV,United Kingdom,GB,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025
1522063,38.27895,10.78280,596,61d3781293c6feb83e5eb73b,Port of Civitavecchia,Civitavecchia,11.780833,42.098889,ITCVV,Italy,IT,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025


Unnamed: 0,ID,vesselId,scaling_factor,sin_hour,cos_hour,sin_weekday,cos_weekday,sin_month,cos_month
0,0,84,0.3,0.000000,1.000000,0.974928,-0.222521,0.5,-0.866025
1,1,623,0.3,0.000000,1.000000,0.974928,-0.222521,0.5,-0.866025
2,2,596,0.3,0.000000,1.000000,0.974928,-0.222521,0.5,-0.866025
3,3,542,0.3,0.000000,1.000000,0.974928,-0.222521,0.5,-0.866025
4,4,1,0.3,0.000000,1.000000,0.974928,-0.222521,0.5,-0.866025
...,...,...,...,...,...,...,...,...,...
51734,51734,48,0.1,-0.258819,0.965926,-0.781831,0.623490,0.5,-0.866025
51735,51735,110,0.1,-0.258819,0.965926,-0.781831,0.623490,0.5,-0.866025
51736,51736,610,0.1,-0.258819,0.965926,-0.781831,0.623490,0.5,-0.866025
51737,51737,574,0.1,-0.258819,0.965926,-0.781831,0.623490,0.5,-0.866025


In [83]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Predict the port_longitude and port_latitude for each vessel in testset
# Define features and target
features = ['vesselId', 'sin_hour', 'cos_hour', 'sin_weekday', 'cos_weekday', 'sin_month', 'cos_month']
target = ['port_longitude', 'port_latitude']

# Split the data into train and validation sets
x_train, x_val, y_train, y_val = train_test_split(train_df[features], train_df[target], test_size=0.3, random_state=42)

# Initialize and train the model
model = xgb.XGBRegressor(
    n_estimators=10000,
    n_jobs=-1,
    early_stopping_rounds=50,
    learning_rate=0.01,
    max_depth=5,
)

model.fit(
    x_train, y_train,
    verbose=1,
    eval_set=[(x_train, y_train), (x_val, y_val)],
)

[0]	validation_0-rmse:52.77242	validation_1-rmse:52.74463
[1]	validation_0-rmse:52.70222	validation_1-rmse:52.67482
[2]	validation_0-rmse:52.63332	validation_1-rmse:52.60631
[3]	validation_0-rmse:52.56560	validation_1-rmse:52.53892
[4]	validation_0-rmse:52.49708	validation_1-rmse:52.47064
[5]	validation_0-rmse:52.43176	validation_1-rmse:52.40565
[6]	validation_0-rmse:52.36563	validation_1-rmse:52.33975
[7]	validation_0-rmse:52.30293	validation_1-rmse:52.27718
[8]	validation_0-rmse:52.23909	validation_1-rmse:52.21357
[9]	validation_0-rmse:52.17859	validation_1-rmse:52.15333
[10]	validation_0-rmse:52.11696	validation_1-rmse:52.09193
[11]	validation_0-rmse:52.05859	validation_1-rmse:52.03368
[12]	validation_0-rmse:51.99909	validation_1-rmse:51.97440
[13]	validation_0-rmse:51.94277	validation_1-rmse:51.91832
[14]	validation_0-rmse:51.88525	validation_1-rmse:51.86101
[15]	validation_0-rmse:51.83090	validation_1-rmse:51.80678
[16]	validation_0-rmse:51.77538	validation_1-rmse:51.75142
[17]	va

In [84]:
# Predict the port_longitude and port_latitude for each vessel in testset
preds = model.predict(test_df[features])
test_df['port_longitude'] = preds[:, 0]
test_df['port_latitude'] = preds[:, 1]

display(test_df.head())

Unnamed: 0,ID,vesselId,scaling_factor,sin_hour,cos_hour,sin_weekday,cos_weekday,sin_month,cos_month,port_longitude,port_latitude
0,0,84,0.3,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,-46.787251,38.944469
1,1,623,0.3,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,-36.825802,34.208134
2,2,596,0.3,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,10.963854,39.355537
3,3,542,0.3,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,75.832367,12.776402
4,4,1,0.3,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,-1.798007,40.161938


In [85]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict on validation set
y_val_pred = model.predict(x_val)

# Calculate MAE and MSE
mae = mean_absolute_error(y_val, y_val_pred)
mse = mean_squared_error(y_val, y_val_pred)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')

Mean Absolute Error (MAE): 23.263406139311556
Mean Squared Error (MSE): 1589.5670679843472


### Fit model

In [86]:
features = ['vesselId', 'sin_hour', 'cos_hour', 'sin_weekday', 'cos_weekday', 'sin_month', 'cos_month', 'port_longitude', 'port_latitude']
targets = ['latitude', 'longitude']

# split the data into train and validation 70% train, 30% validation with scikit-learn

x_train, x_val, y_train, y_val = train_test_split(train_df[features], train_df[targets])


model = xgb.XGBRegressor(
    n_estimators=10000,
    n_jobs=-1,
    early_stopping_rounds=50,
    learning_rate=0.01,
    max_depth=5,
)

model.fit(
    x_train, y_train,
    verbose=1,
    eval_set=[(x_train, y_train), (x_val, y_val)],
    )



[0]	validation_0-rmse:52.24867	validation_1-rmse:52.23130
[1]	validation_0-rmse:51.75008	validation_1-rmse:51.73328
[2]	validation_0-rmse:51.25668	validation_1-rmse:51.24046
[3]	validation_0-rmse:50.76847	validation_1-rmse:50.75281
[4]	validation_0-rmse:50.28535	validation_1-rmse:50.27026
[5]	validation_0-rmse:49.80732	validation_1-rmse:49.79280
[6]	validation_0-rmse:49.33429	validation_1-rmse:49.32033
[7]	validation_0-rmse:48.86623	validation_1-rmse:48.85283
[8]	validation_0-rmse:48.40332	validation_1-rmse:48.39042
[9]	validation_0-rmse:47.94531	validation_1-rmse:47.93292
[10]	validation_0-rmse:47.49210	validation_1-rmse:47.48020
[11]	validation_0-rmse:47.04366	validation_1-rmse:47.03227
[12]	validation_0-rmse:46.59999	validation_1-rmse:46.58911
[13]	validation_0-rmse:46.16098	validation_1-rmse:46.15060
[14]	validation_0-rmse:45.72659	validation_1-rmse:45.71670
[15]	validation_0-rmse:45.29682	validation_1-rmse:45.28744
[16]	validation_0-rmse:44.87141	validation_1-rmse:44.86254
[17]	va

### Predict

In [87]:
# Give metrics on the model
print(model.score(x_train, y_train))
print(model.score(x_val, y_val))


0.9813622236251831
0.9807007312774658


In [88]:
# Predict positions
preds = model.predict(test_df[features])

In [89]:
print(preds)
test_df['latitude_predicted'] = preds[:,0]
test_df['longitude_predicted'] = preds[:,1]


display(test_df.head())

[[ 43.19946   -56.091446 ]
 [ 33.021637  -27.779867 ]
 [ 38.877605   12.559684 ]
 ...
 [ 34.126713    2.4531736]
 [ 55.258823   18.683268 ]
 [ 50.60942   -30.033234 ]]


Unnamed: 0,ID,vesselId,scaling_factor,sin_hour,cos_hour,sin_weekday,cos_weekday,sin_month,cos_month,port_longitude,port_latitude,latitude_predicted,longitude_predicted
0,0,84,0.3,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,-46.787251,38.944469,43.199459,-56.091446
1,1,623,0.3,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,-36.825802,34.208134,33.021637,-27.779867
2,2,596,0.3,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,10.963854,39.355537,38.877605,12.559684
3,3,542,0.3,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,75.832367,12.776402,3.545563,92.357574
4,4,1,0.3,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,-1.798007,40.161938,39.593746,-3.711371


### Eksporter til csv

In [90]:
submission_df = test_df[['ID', 'longitude_predicted','latitude_predicted']]

submission_df.to_csv('predictions_xgb.csv', index=False)
print('Få den jævla "predictions.csv" filen inn på Kaggle og se om det funker')

Få den jævla "predictions.csv" filen inn på Kaggle og se om det funker
