# AIS Trajectory

### Importere biblioteker

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb

import matplotlib as plt

# split sklearn
from sklearn.model_selection import train_test_split

### Importer data

In [2]:
# Raw AIS observations. The training file is pipe-delimited; the test file is
# read with pandas' default delimiter.
train_df = pd.read_csv('data/ais_train.csv', sep='|')
test_df = pd.read_csv('data/ais_test.csv')

# Lookup tables (all pipe-delimited).
ports_df = pd.read_csv('data/ports.csv', sep='|')
schedule_df = pd.read_csv('data/schedules_to_may_2024.csv', sep='|')
vessels_df = pd.read_csv('data/vessels.csv', sep='|')

# Prefix the port coordinates so they cannot be confused with the vessel
# position columns 'longitude' / 'latitude' once the frames are merged.
ports_df = ports_df.rename(columns={'longitude': 'port_longitude', 'latitude': 'port_latitude'})

# Attach the static vessel attributes to every AIS row.
train_df = train_df.merge(vessels_df, on='vesselId', how='left')
test_df = test_df.merge(vessels_df, on='vesselId', how='left')

# Vessel attributes that are not used as features.
unused_vessel_columns = [
    'NT', 'depth', 'draft', 'enginePower', 'freshWater', 'fuel',
    'homePort', 'maxHeight', 'maxSpeed', 'maxWidth', 'rampCapacity',
]
train_df = train_df.drop(columns=unused_vessel_columns)
# The test file additionally carries 'scaling_factor', which is dropped too.
test_df = test_df.drop(columns=['scaling_factor'] + unused_vessel_columns)

# Impute missing vessel attributes in the test frame: mean for the numeric
# columns, most frequent value for the vessel type.
# NOTE(review): the same columns are left un-imputed in train_df, so train and
# test are preprocessed differently - confirm this is intended.
for numeric_column in ('DWT', 'breadth'):
    test_df[numeric_column] = test_df[numeric_column].fillna(test_df[numeric_column].mean())
test_df['vesselType'] = test_df['vesselType'].fillna(test_df['vesselType'].mode()[0])

# Untouched snapshots of the merged frames, taken before any feature engineering.
original_train_df = train_df.copy(deep=True)
original_test_df = test_df.copy(deep=True)

display(train_df.head(), test_df.head())


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,shippingLineId,CEU,DWT,GT,vesselType,breadth,length,yearBuilt
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,61ec65aea8cafc0e93f0e900,6500,21214.0,57718,83.0,32.0,199.0,2004
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,61be24564ea00ae59d0fe37a,5174,18878.0,59583,83.0,32.26,199.97,2012
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,61be24564ea00ae59d0fe379,6402,18383.0,59217,83.0,32.0,199.0,2005
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,61a8e672f9cba188601e84ac,5849,15199.0,55598,83.0,32.0,199.0,1995
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,61be24564ea00ae59d0fe37a,5219,18833.0,58939,83.0,32.2,199.95,2010


Unnamed: 0,ID,vesselId,time,shippingLineId,CEU,DWT,GT,vesselType,breadth,length,yearBuilt
0,0,61e9f3aeb937134a3c4bfe3d,2024-05-08 00:03:16,61a8e672f9cba188601e84ac,7934,31143.0,74255,83.0,32.0,230.0,2011
1,1,61e9f473b937134a3c4c02df,2024-05-08 00:06:17,61be24574ea00ae59d0fe388,2500,13238.0,9984,14.0,20.0,124.0,2012
2,2,61e9f469b937134a3c4c029b,2024-05-08 00:10:02,61ec6303a8cafc0e93f0e8f3,1400,7150.0,25995,21.0,27.0,186.0,2003
3,3,61e9f45bb937134a3c4c0221,2024-05-08 00:10:34,61be24564ea00ae59d0fe37a,5007,13951.0,45959,83.0,30.2,183.0,2011
4,4,61e9f38eb937134a3c4bfd8d,2024-05-08 00:12:27,61ec94f1a8cafc0e93f0e92a,4902,12325.0,46800,83.0,31.0,182.0,2006


### Pre-prosessering

In [3]:
# AIS signal columns that exist only in the training file; the test frame has
# no counterpart, so they cannot be used as features.
train_only_columns = ['cog', 'sog', 'rot', 'heading', 'etaRaw', 'navstat']
train_df = train_df.drop(columns=train_only_columns)

# Encode vesselId as integers. The encoder is fitted on the ids seen in the
# training rows plus every id in the vessel table, then applied to all three
# frames so that they share one id space.
le_vesselId = LabelEncoder().fit(pd.concat([train_df['vesselId'], vessels_df['vesselId']]))
for frame in (train_df, test_df, vessels_df):
    frame['vesselId'] = le_vesselId.transform(frame['vesselId'])

# Bring in the port attributes (name, coordinates, country, ...) for each row.
train_df = train_df.merge(ports_df, how='left', on='portId')

# Report the number of missing values per column.
for frame in (train_df, test_df):
    print(frame.isna().sum())

# Rows without a matching port get the mean port coordinate.
for coordinate in ('port_longitude', 'port_latitude'):
    train_df[coordinate] = train_df[coordinate].fillna(train_df[coordinate].mean())

# Parse the timestamp strings into datetimes.
train_df['time'] = pd.to_datetime(train_df['time'])
test_df['time'] = pd.to_datetime(test_df['time'])

display(train_df, test_df)


time                  0
latitude              0
longitude             0
vesselId              0
portId             1615
shippingLineId        0
CEU                   0
DWT               14949
GT                    0
vesselType        20987
breadth           14949
length                0
yearBuilt             0
name               1615
portLocation       1615
port_longitude     1615
port_latitude      1615
UN_LOCODE          1615
countryName        1615
ISO                1615
dtype: int64
ID                0
vesselId          0
time              0
shippingLineId    0
CEU               0
DWT               0
GT                0
vesselType        0
breadth           0
length            0
yearBuilt         0
dtype: int64


Unnamed: 0,time,latitude,longitude,vesselId,portId,shippingLineId,CEU,DWT,GT,vesselType,breadth,length,yearBuilt,name,portLocation,port_longitude,port_latitude,UN_LOCODE,countryName,ISO
0,2024-01-01 00:00:25,-34.74370,-57.85130,51,61d371c43aeaecc07011a37f,61ec65aea8cafc0e93f0e900,6500,21214.0,57718,83.0,32.00,199.00,2004,Puerto San Antonio,San Antonio,-71.618889,-33.587500,CLSAI,Chile,CL
1,2024-01-01 00:00:36,8.89440,-79.47939,198,634c4de270937fc01c3a7689,61be24564ea00ae59d0fe37a,5174,18878.0,59583,83.0,32.26,199.97,2012,"Panamá, Ciudad de","Panamá, Ciudad de",-79.533000,8.967000,PAPTY,Panama,PA
2,2024-01-01 00:01:45,39.19065,-76.47567,450,61d3847bb7b7526e1adf3d19,61be24564ea00ae59d0fe379,6402,18383.0,59217,83.0,32.00,199.00,2005,Port of Baltimore,Baltimore,-76.558889,39.232500,USBAL,United States,US
3,2024-01-01 00:03:11,-34.41189,151.02067,114,61d36f770a1807568ff9a126,61a8e672f9cba188601e84ac,5849,15199.0,55598,83.0,32.00,199.00,1995,Port of Port Kembla,Port Kembla,150.899444,-34.462500,AUPKL,Australia,AU
4,2024-01-01 00:03:51,35.88379,-5.91636,370,634c4de270937fc01c3a74f3,61be24564ea00ae59d0fe37a,5219,18833.0,58939,83.0,32.20,199.95,2010,Tangier,Tangier,-5.817000,35.783000,MATNG,Morocco,MA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522060,2024-05-07 23:59:07,52.19131,-5.82223,705,634c4de270937fc01c3a7417,61a8e673f9cba188601e84b3,300,12502.0,21005,83.0,25.00,182.00,2000,Waterford City Quays,Waterford City Quays,-7.100000,52.250000,IEWCQ,Ireland,IE
1522061,2024-05-07 23:59:08,38.96142,-12.00502,89,634c4de270937fc01c3a76a1,61a8e672f9cba188601e84ac,6354,22160.0,61328,83.0,32.00,199.00,2009,Cascais,Cascais,-9.417000,38.700000,PTCAS,Portugal,PT
1522062,2024-05-07 23:59:08,49.71372,-5.22042,477,634c4de270937fc01c3a787b,61a8e673f9cba188601e84b9,7429,18241.0,72700,83.0,38.00,199.96,2017,Porthleven,Porthleven,-5.317000,50.083000,GBPLV,United Kingdom,GB
1522063,2024-05-07 23:59:08,38.27895,10.78280,619,61d3781293c6feb83e5eb73b,61ec6303a8cafc0e93f0e8f3,1400,7150.0,25995,21.0,27.00,186.00,2003,Port of Civitavecchia,Civitavecchia,11.780833,42.098889,ITCVV,Italy,IT


Unnamed: 0,ID,vesselId,time,shippingLineId,CEU,DWT,GT,vesselType,breadth,length,yearBuilt
0,0,86,2024-05-08 00:03:16,61a8e672f9cba188601e84ac,7934,31143.0,74255,83.0,32.00,230.00,2011
1,1,646,2024-05-08 00:06:17,61be24574ea00ae59d0fe388,2500,13238.0,9984,14.0,20.00,124.00,2012
2,2,619,2024-05-08 00:10:02,61ec6303a8cafc0e93f0e8f3,1400,7150.0,25995,21.0,27.00,186.00,2003
3,3,562,2024-05-08 00:10:34,61be24564ea00ae59d0fe37a,5007,13951.0,45959,83.0,30.20,183.00,2011
4,4,1,2024-05-08 00:12:27,61ec94f1a8cafc0e93f0e92a,4902,12325.0,46800,83.0,31.00,182.00,2006
...,...,...,...,...,...,...,...,...,...,...,...
51734,51734,49,2024-05-12 23:59:58,61a8e672f9cba188601e84ab,6000,19670.0,59705,83.0,32.00,199.00,2006
51735,51735,114,2024-05-12 23:59:58,61a8e672f9cba188601e84ac,5849,15199.0,55598,83.0,32.00,199.00,1995
51736,51736,633,2024-05-12 23:59:58,61a8e673f9cba188601e84b9,6178,18770.0,59516,83.0,32.26,199.97,2015
51737,51737,595,2024-05-12 23:59:58,61ec643ca8cafc0e93f0e8f9,840,9653.0,45923,21.0,30.00,218.00,2006


### Feature engineering

In [4]:
def convert_time_to_features(df):
    """Derive numeric and cyclical time features from the ``time`` column.

    The input frame is left untouched; all work happens on a copy. (The
    previous implementation added helper columns to, and overwrote ``time``
    in, the caller's frame as a side effect.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed column named ``time``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which

        * ``time`` is replaced by seconds since 1970-01-01 as float
          (``NaT`` becomes ``NaN``), and
        * ``sin_*`` / ``cos_*`` columns are appended for minute, hour,
          weekday, month and day of month, in that order.

    Raises
    ------
    KeyError
        If ``df`` has no ``time`` column.
    AttributeError
        If ``time`` is not datetime-typed (the ``.dt`` accessor is required).
    """
    out = df.copy()
    timestamps = out['time']

    # (component name, component values, length of one full cycle)
    cycles = (
        ('minute', timestamps.dt.minute, 60),
        ('hour', timestamps.dt.hour, 24),
        ('weekday', timestamps.dt.weekday, 7),
        ('month', timestamps.dt.month, 12),
        ('day', timestamps.dt.day, 31),
    )
    for name, values, period in cycles:
        # Map the component onto the unit circle so that the end of a cycle
        # sits next to its start (e.g. minute 59 next to minute 0).
        angle = 2 * np.pi * values / period
        out[f'sin_{name}'] = np.sin(angle)
        out[f'cos_{name}'] = np.cos(angle)

    # Epoch seconds via timedelta arithmetic rather than
    # ``astype(int) / 10**9``: this does not depend on the column being stored
    # at nanosecond resolution or on the platform's default integer width.
    # Passing the column's own timezone keeps the subtraction valid for
    # tz-aware data as well.
    epoch = pd.Timestamp(0, tz=timestamps.dt.tz)
    out['time'] = (timestamps - epoch) / pd.Timedelta(seconds=1)

    return out

# Run both frames through the same time-feature transformation so that they
# end up with identical feature columns.
train_df, test_df = (convert_time_to_features(frame) for frame in (train_df, test_df))

display(train_df, test_df)


Unnamed: 0,time,latitude,longitude,vesselId,portId,shippingLineId,CEU,DWT,GT,vesselType,...,countryName,ISO,sin_hour,cos_hour,sin_weekday,cos_weekday,sin_month,cos_month,sin_day,cos_day
0,1.704067e+09,-34.74370,-57.85130,51,61d371c43aeaecc07011a37f,61ec65aea8cafc0e93f0e900,6500,21214.0,57718,83.0,...,Chile,CL,0.000000,1.000000,0.000000,1.00000,0.5,0.866025,0.201299,0.979530
1,1.704067e+09,8.89440,-79.47939,198,634c4de270937fc01c3a7689,61be24564ea00ae59d0fe37a,5174,18878.0,59583,83.0,...,Panama,PA,0.000000,1.000000,0.000000,1.00000,0.5,0.866025,0.201299,0.979530
2,1.704067e+09,39.19065,-76.47567,450,61d3847bb7b7526e1adf3d19,61be24564ea00ae59d0fe379,6402,18383.0,59217,83.0,...,United States,US,0.000000,1.000000,0.000000,1.00000,0.5,0.866025,0.201299,0.979530
3,1.704067e+09,-34.41189,151.02067,114,61d36f770a1807568ff9a126,61a8e672f9cba188601e84ac,5849,15199.0,55598,83.0,...,Australia,AU,0.000000,1.000000,0.000000,1.00000,0.5,0.866025,0.201299,0.979530
4,1.704067e+09,35.88379,-5.91636,370,634c4de270937fc01c3a74f3,61be24564ea00ae59d0fe37a,5219,18833.0,58939,83.0,...,Morocco,MA,0.000000,1.000000,0.000000,1.00000,0.5,0.866025,0.201299,0.979530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522060,1.715126e+09,52.19131,-5.82223,705,634c4de270937fc01c3a7417,61a8e673f9cba188601e84b3,300,12502.0,21005,83.0,...,Ireland,IE,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025,0.988468,0.151428
1522061,1.715126e+09,38.96142,-12.00502,89,634c4de270937fc01c3a76a1,61a8e672f9cba188601e84ac,6354,22160.0,61328,83.0,...,Portugal,PT,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025,0.988468,0.151428
1522062,1.715126e+09,49.71372,-5.22042,477,634c4de270937fc01c3a787b,61a8e673f9cba188601e84b9,7429,18241.0,72700,83.0,...,United Kingdom,GB,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025,0.988468,0.151428
1522063,1.715126e+09,38.27895,10.78280,619,61d3781293c6feb83e5eb73b,61ec6303a8cafc0e93f0e8f3,1400,7150.0,25995,21.0,...,Italy,IT,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025,0.988468,0.151428


Unnamed: 0,ID,vesselId,time,shippingLineId,CEU,DWT,GT,vesselType,breadth,length,yearBuilt,sin_hour,cos_hour,sin_weekday,cos_weekday,sin_month,cos_month,sin_day,cos_day
0,0,86,1.715127e+09,61a8e672f9cba188601e84ac,7934,31143.0,74255,83.0,32.00,230.00,2011,0.000000,1.000000,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649
1,1,646,1.715127e+09,61be24574ea00ae59d0fe388,2500,13238.0,9984,14.0,20.00,124.00,2012,0.000000,1.000000,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649
2,2,619,1.715127e+09,61ec6303a8cafc0e93f0e8f3,1400,7150.0,25995,21.0,27.00,186.00,2003,0.000000,1.000000,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649
3,3,562,1.715127e+09,61be24564ea00ae59d0fe37a,5007,13951.0,45959,83.0,30.20,183.00,2011,0.000000,1.000000,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649
4,4,1,1.715127e+09,61ec94f1a8cafc0e93f0e92a,4902,12325.0,46800,83.0,31.00,182.00,2006,0.000000,1.000000,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51734,51734,49,1.715558e+09,61a8e672f9cba188601e84ab,6000,19670.0,59705,83.0,32.00,199.00,2006,-0.258819,0.965926,-0.781831,0.623490,0.5,-0.866025,0.651372,-0.758758
51735,51735,114,1.715558e+09,61a8e672f9cba188601e84ac,5849,15199.0,55598,83.0,32.00,199.00,1995,-0.258819,0.965926,-0.781831,0.623490,0.5,-0.866025,0.651372,-0.758758
51736,51736,633,1.715558e+09,61a8e673f9cba188601e84b9,6178,18770.0,59516,83.0,32.26,199.97,2015,-0.258819,0.965926,-0.781831,0.623490,0.5,-0.866025,0.651372,-0.758758
51737,51737,595,1.715558e+09,61ec643ca8cafc0e93f0e8f9,840,9653.0,45923,21.0,30.00,218.00,2006,-0.258819,0.965926,-0.781831,0.623490,0.5,-0.866025,0.651372,-0.758758


In [5]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from tqdm import tqdm

# Stage 1: for every vessel that occurs in the test set, fit a model that maps
# time and vessel attributes to the port coordinates recorded in the training
# rows, then predict those port coordinates for the vessel's test rows.
features_port = ['vesselId', 'sin_minute', 'cos_minute', 'sin_hour', 'cos_hour', 'sin_weekday', 'cos_weekday', 'sin_month', 'cos_month', 'time', 'CEU', 'DWT', 'GT', 'vesselType', 'breadth', 'length', 'yearBuilt']
target_port = ['port_longitude', 'port_latitude']

# Every vessel seen during training.
unique_vessels = train_df['vesselId'].unique()

# vessel id -> fitted port model
vessel_models = {}

# Computed once: membership tests against a set are constant-time, whereas
# `vessel_id in test_df['vesselId'].values` rescans the whole column on every
# iteration.
test_vessel_ids = set(test_df['vesselId'].unique())

for vessel_id in tqdm(unique_vessels, desc="Training models for each vessel"):
    # Vessels that never appear in the test set need no model.
    if vessel_id not in test_vessel_ids:
        continue

    vessel_train_df = train_df[train_df['vesselId'] == vessel_id]

    # Boolean mask of this vessel's test rows; reused for reading the features
    # and for writing the predictions back.
    test_rows = test_df['vesselId'] == vessel_id
    vessel_test_df = test_df[test_rows]

    # 70/30 split with a fixed integer seed. The previous `random_state=False`
    # was equivalent to seed 0 (Python's False is the integer 0), so the split
    # is unchanged.
    # NOTE(review): this is a random split over time-ordered rows, so the
    # validation rows are interleaved with the training rows - consider a
    # chronological hold-out if the validation score should reflect forecasting.
    x_train_port, x_val_port, y_train_port, y_val_port = train_test_split(
        vessel_train_df[features_port],
        vessel_train_df[target_port],
        test_size=0.3,
        random_state=0,
    )

    model_port = xgb.XGBRegressor(
        n_estimators=2000,
        n_jobs=-1,
        early_stopping_rounds=50,
        learning_rate=0.01,
        max_depth=10,
    )

    # The validation set is listed last in eval_set.
    model_port.fit(
        x_train_port, y_train_port,
        verbose=False,
        eval_set=[(x_train_port, y_train_port), (x_val_port, y_val_port)],
    )

    vessel_models[vessel_id] = model_port

    # Prediction columns follow target_port: column 0 is longitude, column 1
    # is latitude.
    preds = model_port.predict(vessel_test_df[features_port])
    test_df.loc[test_rows, 'predicted_port_longitude'] = preds[:, 0]
    test_df.loc[test_rows, 'predicted_port_latitude'] = preds[:, 1]

display(test_df.head())

Training models for each vessel: 100%|██████████| 688/688 [11:28<00:00,  1.00s/it]


Unnamed: 0,ID,vesselId,time,shippingLineId,CEU,DWT,GT,vesselType,breadth,length,...,sin_hour,cos_hour,sin_weekday,cos_weekday,sin_month,cos_month,sin_day,cos_day,predicted_port_longitude,predicted_port_latitude
0,0,86,1715127000.0,61a8e672f9cba188601e84ac,7934,31143.0,74255,83.0,32.0,230.0,...,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649,-81.496384,31.140429
1,1,646,1715127000.0,61be24574ea00ae59d0fe388,2500,13238.0,9984,14.0,20.0,124.0,...,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649,120.626755,34.827389
2,2,619,1715127000.0,61ec6303a8cafc0e93f0e8f3,1400,7150.0,25995,21.0,27.0,186.0,...,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649,11.777119,43.154316
3,3,562,1715127000.0,61be24564ea00ae59d0fe37a,5007,13951.0,45959,83.0,30.2,183.0,...,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649,156.670853,-43.46793
4,4,1,1715127000.0,61ec94f1a8cafc0e93f0e92a,4902,12325.0,46800,83.0,31.0,182.0,...,0.0,1.0,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649,-4.299913,43.794685


In [6]:
from geopy.distance import geodesic
from joblib import Parallel, delayed

# Prepare the lookup table used to snap each predicted port coordinate to a
# real port: one row per portId, indexed by portId, and only ports that have
# both coordinates.
port_coordinates = (
    ports_df[['portId', 'port_longitude', 'port_latitude']]
    .drop_duplicates(subset='portId')
    .set_index('portId')
    .dropna()
)

# Test rows that received no port prediction get the mean predicted coordinate.
for predicted_column in ('predicted_port_longitude', 'predicted_port_latitude'):
    test_df[predicted_column] = test_df[predicted_column].fillna(test_df[predicted_column].mean())

display(port_coordinates)

# Number of predicted longitudes still missing after the fill.
print(test_df['predicted_port_longitude'].isna().sum())

# Nearest-port lookup for a single predicted coordinate.
def find_closest_port_vectorized(predicted_lat, predicted_lon):
    """Return the ``portId`` of the port closest to a predicted coordinate.

    Despite its name the function is not vectorised: it evaluates one
    geodesic distance per port in the module-level ``port_coordinates`` table
    (indexed by ``portId``, with columns ``port_latitude`` and
    ``port_longitude``).

    The ports are walked with ``itertuples`` instead of
    ``DataFrame.apply(axis=1)``, which avoids building a pandas Series for
    every port on every call. The selected port is the same: the first port
    in table order with the smallest distance.

    Parameters
    ----------
    predicted_lat : float
        Predicted latitude.
    predicted_lon : float
        Predicted longitude.

    Returns
    -------
    The index label (``portId``) of the nearest port.

    Raises
    ------
    ValueError
        If ``port_coordinates`` is empty.
    """
    predicted_port = (predicted_lat, predicted_lon)
    closest_port, _ = min(
        (
            (port.Index, geodesic(predicted_port, (port.port_latitude, port.port_longitude)).m)
            for port in port_coordinates.itertuples()
        ),
        key=lambda candidate: candidate[1],  # compare by distance in metres
    )
    return closest_port

# Look up the nearest real port for every test row, in parallel across all
# cores. Only the two predicted coordinate columns are iterated; `iterrows()`
# would build a full Series for each row just to read these two values.
test_df['closest_port'] = Parallel(n_jobs=-1)(
    delayed(find_closest_port_vectorized)(predicted_lat, predicted_lon)
    for predicted_lat, predicted_lon in zip(
        test_df['predicted_port_latitude'], test_df['predicted_port_longitude']
    )
)


Unnamed: 0_level_0,port_longitude,port_latitude
portId,Unnamed: 1_level_1,Unnamed: 2_level_1
61d36ed80a1807568ff9a064,3.067222,36.773611
61d36ed80a1807568ff9a065,7.772500,36.900556
61d36edf0a1807568ff9a070,-0.639722,35.712222
61d36ee00a1807568ff9a072,6.905833,36.887500
61d36ee10a1807568ff9a074,-170.690556,-14.274167
...,...,...
6367c622aa0cd188707a1eae,25.029460,51.624500
6367cb14aa0cd188707a1eaf,50.667890,26.201200
6367d9afaa0cd188707a1eb3,34.992790,29.519880
663cb287244f580d97afce53,-2.648735,51.471346


0


In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Validation error of the port model.
# NOTE: `model_port`, `x_val_port` and `y_val_port` are the values left behind
# by the final iteration of the per-vessel training loop, so these numbers
# describe only the last vessel that was trained, not all vessels.
y_val_port_pred = model_port.predict(x_val_port)

mae, mse = (
    metric(y_val_port, y_val_port_pred)
    for metric in (mean_absolute_error, mean_squared_error)
)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")

Mean Absolute Error (MAE): 0.5910410530055376
Mean Squared Error (MSE): 29.80057089845544


### Fit model

In [8]:


# Give the predicted port columns in the test frame the same names as the
# corresponding training columns, so both frames expose portId,
# port_longitude and port_latitude.
test_df = test_df.rename(columns={
    'closest_port': 'portId',
    'predicted_port_longitude': 'port_longitude',
    'predicted_port_latitude': 'port_latitude',
})

# Integer-encode the remaining id columns. Each encoder is fitted on the
# values of both frames so that train and test share one code space.
encoders = {}
for id_column in ('portId', 'shippingLineId'):
    encoder = LabelEncoder().fit(pd.concat([train_df[id_column], test_df[id_column]]))
    train_df[id_column] = encoder.transform(train_df[id_column])
    test_df[id_column] = encoder.transform(test_df[id_column])
    encoders[id_column] = encoder

# Keep the fitted encoders available under their individual names.
le_portId = encoders['portId']
le_shippingLineId = encoders['shippingLineId']

display(test_df, train_df)


Unnamed: 0,ID,vesselId,time,shippingLineId,CEU,DWT,GT,vesselType,breadth,length,...,cos_hour,sin_weekday,cos_weekday,sin_month,cos_month,sin_day,cos_day,port_longitude,port_latitude,portId
0,0,86,1.715127e+09,1,7934,31143.0,74255,83.0,32.00,230.00,...,1.000000,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649,-81.496384,31.140429,420
1,1,646,1.715127e+09,18,2500,13238.0,9984,14.0,20.00,124.00,...,1.000000,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649,120.626755,34.827389,63
2,2,619,1.715127e+09,21,1400,7150.0,25995,21.0,27.00,186.00,...,1.000000,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649,11.777119,43.154316,658
3,3,562,1.715127e+09,15,5007,13951.0,45959,83.0,30.20,183.00,...,1.000000,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649,156.670853,-43.467930,464
4,4,1,1.715127e+09,25,4902,12325.0,46800,83.0,31.00,182.00,...,1.000000,0.974928,-0.222521,0.5,-0.866025,0.998717,-0.050649,-4.299913,43.794685,802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51734,51734,49,1.715558e+09,0,6000,19670.0,59705,83.0,32.00,199.00,...,0.965926,-0.781831,0.623490,0.5,-0.866025,0.651372,-0.758758,6.307422,43.939068,107
51735,51735,114,1.715558e+09,1,5849,15199.0,55598,83.0,32.00,199.00,...,0.965926,-0.781831,0.623490,0.5,-0.866025,0.651372,-0.758758,145.198303,45.081486,787
51736,51736,633,1.715558e+09,8,6178,18770.0,59516,83.0,32.26,199.97,...,0.965926,-0.781831,0.623490,0.5,-0.866025,0.651372,-0.758758,138.415833,35.163105,221
51737,51737,595,1.715558e+09,22,840,9653.0,45923,21.0,30.00,218.00,...,0.965926,-0.781831,0.623490,0.5,-0.866025,0.651372,-0.758758,21.309885,56.381481,683


Unnamed: 0,time,latitude,longitude,vesselId,portId,shippingLineId,CEU,DWT,GT,vesselType,...,countryName,ISO,sin_hour,cos_hour,sin_weekday,cos_weekday,sin_month,cos_month,sin_day,cos_day
0,1.704067e+09,-34.74370,-57.85130,51,48,23,6500,21214.0,57718,83.0,...,Chile,CL,0.000000,1.000000,0.000000,1.00000,0.5,0.866025,0.201299,0.979530
1,1.704067e+09,8.89440,-79.47939,198,772,15,5174,18878.0,59583,83.0,...,Panama,PA,0.000000,1.000000,0.000000,1.00000,0.5,0.866025,0.201299,0.979530
2,1.704067e+09,39.19065,-76.47567,450,399,14,6402,18383.0,59217,83.0,...,United States,US,0.000000,1.000000,0.000000,1.00000,0.5,0.866025,0.201299,0.979530
3,1.704067e+09,-34.41189,151.02067,114,18,1,5849,15199.0,55598,83.0,...,Australia,AU,0.000000,1.000000,0.000000,1.00000,0.5,0.866025,0.201299,0.979530
4,1.704067e+09,35.88379,-5.91636,370,693,15,5219,18833.0,58939,83.0,...,Morocco,MA,0.000000,1.000000,0.000000,1.00000,0.5,0.866025,0.201299,0.979530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522060,1.715126e+09,52.19131,-5.82223,705,648,5,300,12502.0,21005,83.0,...,Ireland,IE,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025,0.988468,0.151428
1522061,1.715126e+09,38.96142,-12.00502,89,776,1,6354,22160.0,61328,83.0,...,Portugal,PT,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025,0.988468,0.151428
1522062,1.715126e+09,49.71372,-5.22042,477,858,8,7429,18241.0,72700,83.0,...,United Kingdom,GB,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025,0.988468,0.151428
1522063,1.715126e+09,38.27895,10.78280,619,154,21,1400,7150.0,25995,21.0,...,Italy,IT,-0.258819,0.965926,0.781831,0.62349,0.5,-0.866025,0.988468,0.151428


In [12]:
from tqdm import tqdm

# Stage 2: one position model per vessel in the test set. Besides time and
# vessel attributes, the features include the port columns (recorded values in
# train_df, predicted / nearest-port values in test_df).
features_main = ['vesselId', 'sin_minute', 'cos_minute', 'sin_hour', 'cos_hour', 'sin_weekday', 'cos_weekday', 'sin_month', 'cos_month', 'time', 'CEU', 'DWT', 'GT', 'vesselType', 'breadth', 'length', 'yearBuilt', 'port_longitude', 'port_latitude', 'portId']
targets_main = ['latitude', 'longitude']

# Every vessel seen during training.
unique_vessels = train_df['vesselId'].unique()

# vessel id -> fitted position model
vessel_models = {}

# Only vessels that actually need predictions are trained.
unique_vessels_in_test = test_df['vesselId'].unique()

for vessel_id in tqdm(unique_vessels_in_test, desc="Training models for each vessel in test set"):
    vessel_train_df = train_df[train_df['vesselId'] == vessel_id]

    # 70/30 split with a fixed integer seed. The previous `random_state=False`
    # was equivalent to seed 0 (Python's False is the integer 0), so the split
    # is unchanged.
    # NOTE(review): random split over time-ordered rows - the validation score
    # is likely optimistic for forecasting; consider a chronological hold-out.
    x_train, x_val, y_train, y_val = train_test_split(
        vessel_train_df[features_main],
        vessel_train_df[targets_main],
        test_size=0.3,
        random_state=0,
    )

    model = xgb.XGBRegressor(
        n_estimators=10000,
        n_jobs=-1,
        early_stopping_rounds=50,
        learning_rate=0.01,
        max_depth=10,
    )

    # The validation set is listed last in eval_set.
    model.fit(
        x_train, y_train,
        verbose=False,
        eval_set=[(x_train, y_train), (x_val, y_val)],
    )

    vessel_models[vessel_id] = model

    # Boolean mask of this vessel's test rows, computed once and reused for
    # reading the features and writing the predictions back.
    test_rows = test_df['vesselId'] == vessel_id
    vessel_test_df = test_df[test_rows]

    # Prediction columns follow targets_main: column 0 is latitude, column 1
    # is longitude.
    preds = model.predict(vessel_test_df[features_main])
    test_df.loc[test_rows, 'latitude_predicted'] = preds[:, 0]
    test_df.loc[test_rows, 'longitude_predicted'] = preds[:, 1]



Training models for each vessel in test set: 100%|██████████| 215/215 [21:02<00:00,  5.87s/it]


### Predict

In [13]:
# Score on the training split, then on the validation split.
# NOTE: `model`, `x_train`, `y_train`, `x_val` and `y_val` are left over from
# the final iteration of the per-vessel training loop, so both numbers describe
# only the last vessel that was trained.
for split_features, split_targets in ((x_train, y_train), (x_val, y_val)):
    print(model.score(split_features, split_targets))


0.9999982118606567
0.963367223739624


In [15]:
# Re-run the stored per-vessel position models and gather the results in a
# stand-alone frame keyed by the test row ID.
all_preds = []

for vessel_id in unique_vessels:
    # Vessels without a stored model are skipped.
    if vessel_id not in vessel_models:
        continue

    vessel_test_df = test_df[test_df['vesselId'] == vessel_id]
    model = vessel_models[vessel_id]

    preds = model.predict(vessel_test_df[features_main])

    # Keep the row IDs next to the raw prediction matrix.
    all_preds.append((vessel_test_df['ID'], preds))

# The models were fitted on targets_main = ['latitude', 'longitude'], so
# prediction column 0 is latitude and column 1 is longitude. (The two were
# previously swapped here, which labelled latitudes as longitudes.)
preds_df = pd.concat([
    pd.DataFrame({
        'ID': ids,
        'longitude_predicted': pred[:, 1],
        'latitude_predicted': pred[:, 0],
    })
    for ids, pred in all_preds
])

# preds_df is not merged back: test_df already carries latitude_predicted and
# longitude_predicted, written by the training loop.

display(test_df.head())

Unnamed: 0,ID,vesselId,time,shippingLineId,CEU,DWT,GT,vesselType,breadth,length,...,cos_weekday,sin_month,cos_month,sin_day,cos_day,port_longitude,port_latitude,portId,latitude_predicted,longitude_predicted
0,0,86,1715127000.0,1,7934,31143.0,74255,83.0,32.0,230.0,...,-0.222521,0.5,-0.866025,0.998717,-0.050649,-81.496384,31.140429,420,30.611847,-81.37944
1,1,646,1715127000.0,18,2500,13238.0,9984,14.0,20.0,124.0,...,-0.222521,0.5,-0.866025,0.998717,-0.050649,120.626755,34.827389,63,25.166702,120.270493
2,2,619,1715127000.0,21,1400,7150.0,25995,21.0,27.0,186.0,...,-0.222521,0.5,-0.866025,0.998717,-0.050649,11.777119,43.154316,658,40.016697,10.661958
3,3,562,1715127000.0,15,5007,13951.0,45959,83.0,30.2,183.0,...,-0.222521,0.5,-0.866025,0.998717,-0.050649,156.670853,-43.46793,464,-43.426567,151.251053
4,4,1,1715127000.0,25,4902,12325.0,46800,83.0,31.0,182.0,...,-0.222521,0.5,-0.866025,0.998717,-0.050649,-4.299913,43.794685,802,46.922001,-5.022119


### Eksporter til csv

In [17]:
# Write the submission file: one row per test ID with the predicted position.
submission_path = 'predictions_24oct.csv'

# The columns already have their final names, so the former rename (which
# mapped each name onto itself) is not needed.
submission_df = test_df[['ID', 'longitude_predicted', 'latitude_predicted']]

submission_df.to_csv(submission_path, index=False)

# Report the file that was actually written (the message used to name
# "predictions.csv" regardless of the real output path).
print(f'Få den jævla "{submission_path}" filen inn på Kaggle og se om det funker')

Få den jævla "predictions.csv" filen inn på Kaggle og se om det funker
