In [1]:
#import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [14]:
def process_data(train_data_path, test_data_path, delimiter="|"):
    # Load the training and test data
    train_data = pd.read_csv(train_data_path, delimiter=delimiter)
    test_data = pd.read_csv(test_data_path)
    test_data = test_data.drop(columns=["ID", "scaling_factor"])

    # Convert time column to datetime
    train_data['time'] = pd.to_datetime(train_data['time'])

    # Sort the training data by vesselId and time to ensure proper alignment for shifting
    train_data_sorted = train_data.sort_values(by=["vesselId", "time"], ascending=True).copy()

    # Shift latitude and longitude for each vessel to create "previous_lat" and "previous_lon"
    train_data_sorted['previous_lat'] = train_data_sorted.groupby('vesselId')['latitude'].shift(1)
    train_data_sorted['previous_lon'] = train_data_sorted.groupby('vesselId')['longitude'].shift(1)

    # Get the last entry per vessel in the training data
    last_entries = train_data_sorted.groupby('vesselId').last().reset_index()
    
    # Merge the last row of the training data with the test data based on vesselId
    test_data_merged = test_data.merge(last_entries, on='vesselId', how='left')
    test_data_merged = test_data_merged.drop(columns=['time_y']).rename(columns={'time_x': 'time'})

    return train_data_sorted, test_data_merged

# Usage
from settings import *

train_data_path = AIS_TRAIN # '../data/ais_train.csv'
test_data_path = AIS_TEST # '../data/ais_test.csv'

processed_train_data, processed_test_data = process_data(train_data_path, test_data_path)

# To save the processed test data
processed_test_data.head()

                       vesselId                 time
0      61e9f3aeb937134a3c4bfe3d  2024-05-08 00:03:16
1      61e9f473b937134a3c4c02df  2024-05-08 00:06:17
2      61e9f469b937134a3c4c029b  2024-05-08 00:10:02
3      61e9f45bb937134a3c4c0221  2024-05-08 00:10:34
4      61e9f38eb937134a3c4bfd8d  2024-05-08 00:12:27
...                         ...                  ...
51734  61e9f3a8b937134a3c4bfdf3  2024-05-12 23:59:58
51735  61e9f3b4b937134a3c4bfe77  2024-05-12 23:59:58
51736  61e9f46cb937134a3c4c02b7  2024-05-12 23:59:58
51737  61e9f465b937134a3c4c0269  2024-05-12 23:59:58
51738  61e9f3adb937134a3c4bfe39  2024-05-12 23:59:58

[51739 rows x 2 columns]


Unnamed: 0,vesselId,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,portId,previous_lat,previous_lon
0,61e9f3aeb937134a3c4bfe3d,2024-05-08 00:03:16,179.6,0.0,0,344,5,05-06 10:45,31.14647,-81.49789,61d38499b7b7526e1adf3d54,31.14648,-81.49789
1,61e9f473b937134a3c4c02df,2024-05-08 00:06:17,24.7,0.0,0,214,5,05-01 23:00,14.81694,120.29625,61d37d5799db2ccf7339ef3b,14.81694,120.29624
2,61e9f469b937134a3c4c029b,2024-05-08 00:10:02,8.0,18.7,0,6,0,05-08 12:45,38.27895,10.7828,61d3781293c6feb83e5eb73b,38.14875,10.75635
3,61e9f45bb937134a3c4c0221,2024-05-08 00:10:34,321.3,0.1,0,70,1,05-07 01:15,-43.53785,172.83522,61d37bfe99db2ccf7339ece3,-43.53815,172.83516
4,61e9f38eb937134a3c4bfd8d,2024-05-08 00:12:27,291.0,0.3,0,275,2,05-09 04:00,48.5332,-6.12003,61d3743d3aeaecc07011a6fa,48.53133,-6.1075


In [12]:
test_data

NameError: name 'test_data' is not defined

In [3]:
processed_test_data

Unnamed: 0,vesselId,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,portId,previous_lat,previous_lon
0,61e9f3aeb937134a3c4bfe3d,2024-05-08 00:03:16,179.6,0.0,0,344,5,05-06 10:45,31.14647,-81.49789,61d38499b7b7526e1adf3d54,31.14648,-81.49789
1,61e9f473b937134a3c4c02df,2024-05-08 00:06:17,24.7,0.0,0,214,5,05-01 23:00,14.81694,120.29625,61d37d5799db2ccf7339ef3b,14.81694,120.29624
2,61e9f469b937134a3c4c029b,2024-05-08 00:10:02,8.0,18.7,0,6,0,05-08 12:45,38.27895,10.78280,61d3781293c6feb83e5eb73b,38.14875,10.75635
3,61e9f45bb937134a3c4c0221,2024-05-08 00:10:34,321.3,0.1,0,70,1,05-07 01:15,-43.53785,172.83522,61d37bfe99db2ccf7339ece3,-43.53815,172.83516
4,61e9f38eb937134a3c4bfd8d,2024-05-08 00:12:27,291.0,0.3,0,275,2,05-09 04:00,48.53320,-6.12003,61d3743d3aeaecc07011a6fa,48.53133,-6.10750
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51734,61e9f3a8b937134a3c4bfdf3,2024-05-12 23:59:58,5.7,15.6,0,10,0,05-12 20:00,26.22892,-79.40341,61d375e793c6feb83e5eb3e2,26.15166,-79.41257
51735,61e9f3b4b937134a3c4bfe77,2024-05-12 23:59:58,87.6,14.4,0,88,0,05-10 12:00,41.64055,143.29942,61d37a131366c3998241d90b,41.63254,141.92751
51736,61e9f46cb937134a3c4c02b7,2024-05-12 23:59:58,76.9,15.9,0,78,0,05-10 06:50,35.33234,142.69115,61d379911366c3998241d850,35.30791,142.56395
51737,61e9f465b937134a3c4c0269,2024-05-12 23:59:58,232.0,22.0,-13,234,0,05-08 19:00,59.26571,21.98971,61d375f793c6feb83e5eb402,59.41452,22.36688


In [4]:
# Load data
X_test = processed_test_data
X_test = X_test.drop(columns=["longitude", "latitude", "etaRaw"])

# Split training data into features and target variables
y_train = processed_train_data[["latitude", "longitude"]].copy()
X_train = processed_train_data.copy()
X_train = X_train.drop(columns=["longitude", "latitude", "etaRaw"])


# Basic preprocessing (converting time and etaRaw to numerical features)
X_train['time'] = pd.to_datetime(X_train['time'], errors='coerce')

X_train['year'] = X_train['time'].dt.year
X_train['month'] = X_train['time'].dt.month
X_train['day'] = X_train['time'].dt.day
X_train['hour'] = X_train['time'].dt.hour

# Drop the original time and etaRaw columns after extracting useful features
X_train = X_train.drop(columns=['time'])

# Apply the same preprocessing to the test set
X_test['time'] = pd.to_datetime(X_test['time'], errors='coerce')

X_test['year'] = X_test['time'].dt.year
X_test['month'] = X_test['time'].dt.month
X_test['day'] = X_test['time'].dt.day
X_test['hour'] = X_test['time'].dt.hour

X_test = X_test.drop(columns=['time'])

# Encode categorical variables 'vesselId' and 'portId' using LabelEncoder
label_encoder = LabelEncoder()

X_train['vesselId'] = label_encoder.fit_transform(X_train['vesselId'].astype(str))

#Badpractice, should be label_encoder.transform() but this gives an error so instead i use fit_transform
X_test['vesselId'] = label_encoder.fit_transform(X_test['vesselId'].astype(str))

X_train['portId'] = label_encoder.fit_transform(X_train['portId'].astype(str))

#Badpractice, should be label_encoder.transform() but this gives an error so instead i use fit_transform
X_test['portId'] = label_encoder.fit_transform(X_test['portId'].astype(str))

X_train = X_train[['cog', 'sog', 'rot', 'heading', 'navstat', 'vesselId', 'portId', 'previous_lat', 'previous_lon']]
X_test = X_test[['cog', 'sog', 'rot', 'heading', 'navstat', 'vesselId', 'portId', 'previous_lat', 'previous_lon']]

In [5]:
params = {
    # 'n_estimators': 5000,
    'gamma': 0.5,
    'subsample': 0.6,
    'n_estimators': 5000,
    'min_child_weight':  15,
    'colsample_bytree': 0.8,
    'max_depth': 4,
    'eta':  0.005,
    'refresh_leaf': 1,
}

In [6]:
# Initialize the XGBoost Regressor
xgb_reg = xgb.XGBRegressor()

# MultiOutputRegressor to handle the two outputs (latitude and longitude)
multi_regressor = MultiOutputRegressor(xgb_reg)

# Train the model
multi_regressor.fit(X_train, y_train)

In [9]:
# Make predictions over X_test
y_pred_default = multi_regressor.predict(X_test)

# Convert the NumPy array to a pandas DataFrame
y_pred_default_df = pd.DataFrame(y_pred_default, columns=['latitude_predicted', 'longitude_predicted'])

# Add the ID column, which starts from 0 and increments by 1 for each row
y_pred_default_df['ID'] = range(len(y_pred_default_df))

# Reorder the columns to ensure 'id' is the first column (if required by the submission format)
y_pred_default_df = y_pred_default_df[['ID', 'longitude_predicted', 'latitude_predicted']]

# Save the predictions to a CSV file
from utils import make_file_name
from settings import * 

output_path = SUBMISSION_FODLER.joinpath(make_file_name() + '.csv')
y_pred_default_df.to_csv(output_path, index=False)

y_pred_default_df.head()

Submission file name is: 1f27629e-be34-4f20-b36b-af9e7dc3d465


Unnamed: 0,ID,longitude_predicted,latitude_predicted
0,0,-80.89949,31.018854
1,1,116.623177,15.240631
2,2,10.669207,38.411983
3,3,157.230957,-43.2589
4,4,7.711272,46.944344
