In [1]:
#import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [None]:
def process_data(train_data_path, test_data_path, delimiter="|"):
    """Load the train/test CSVs and attach per-vessel "previous position" features.

    Parameters
    ----------
    train_data_path : path-like
        Training CSV, read with ``delimiter``. Must contain ``time``,
        ``vesselId``, ``latitude`` and ``longitude`` columns.
    test_data_path : path-like
        Test CSV, read with pandas' default separator. Must contain ``ID``,
        ``scaling_factor``, ``vesselId`` and ``time`` columns.
    delimiter : str, default "|"
        Field separator of the training file only.

    Returns
    -------
    (train_data_sorted, test_data_merged) : tuple of DataFrame
        ``train_data_sorted`` is the training data ordered by vessel and time,
        with ``previous_lat``/``previous_lon`` holding the same vessel's
        position one row earlier (NaN on each vessel's first row).
        ``test_data_merged`` is the test data (without ``ID`` and
        ``scaling_factor``) left-joined with each vessel's last training
        entry; its ``previous_lat``/``previous_lon`` hold the vessel's last
        known training position.
    """
    # Load the training and test data
    train_data = pd.read_csv(train_data_path, delimiter=delimiter)
    test_data = pd.read_csv(test_data_path)
    test_data = test_data.drop(columns=["ID", "scaling_factor"])

    # Convert time column to datetime so the sort below is chronological
    train_data['time'] = pd.to_datetime(train_data['time'])

    # Sort by vesselId and time so that shift(1) yields the chronologically previous row
    train_data_sorted = train_data.sort_values(by=["vesselId", "time"], ascending=True).copy()

    # Previous position of the same vessel, one row back
    train_data_sorted['previous_lat'] = train_data_sorted.groupby('vesselId')['latitude'].shift(1)
    train_data_sorted['previous_lon'] = train_data_sorted.groupby('vesselId')['longitude'].shift(1)

    # Last entry per vessel. Note that GroupBy.last() takes the last non-null
    # value of each column, so a row may combine values from different timestamps.
    last_entries = train_data_sorted.groupby('vesselId').last().reset_index()

    # For a test row, the "previous" position is the vessel's last known
    # training position. Keeping the shifted columns here would instead feed
    # the second-to-last position and discard the most recent one.
    last_entries['previous_lat'] = last_entries['latitude']
    last_entries['previous_lon'] = last_entries['longitude']

    # Merge the last training entry onto the test data based on vesselId;
    # keep the test timestamp (time_x) and drop the training one (time_y)
    test_data_merged = test_data.merge(last_entries, on='vesselId', how='left')
    test_data_merged = test_data_merged.drop(columns=['time_y']).rename(columns={'time_x': 'time'})

    return train_data_sorted, test_data_merged

# Usage: take the dataset locations from the project settings module.
# Import the two names explicitly instead of `import *` so it is clear
# where they come from and the notebook namespace is not polluted.
from settings import AIS_TRAIN, AIS_TEST

train_data_path = AIS_TRAIN # '../data/ais_train.csv'
test_data_path = AIS_TEST # '../data/ais_test.csv'

processed_train_data, processed_test_data = process_data(train_data_path, test_data_path)

# Preview the processed test data (nothing is written to disk here)
processed_test_data.head()

In [None]:
# Preview the raw test set. `test_data` only exists as a local variable inside
# process_data, so referencing it at notebook level raises NameError on a fresh
# kernel; re-read the file from its configured path instead.
pd.read_csv(test_data_path).head()

In [None]:
# Display the processed test frame returned by process_data
# (test rows left-joined with each vessel's last training entry)
processed_test_data

In [4]:
# Columns handed to the model, in one fixed order shared by train and test
FEATURE_COLUMNS = ['cog', 'sog', 'rot', 'heading', 'navstat', 'vesselId', 'portId', 'previous_lat', 'previous_lon']
# Identifier columns that are turned into integer codes
CATEGORICAL_COLUMNS = ['vesselId', 'portId']
# Columns removed from the feature frames: the two targets plus the unused raw ETA
DROPPED_COLUMNS = ["longitude", "latitude", "etaRaw"]


def _add_time_features(frame):
    """Return a new frame where `time` is replaced by year/month/day/hour columns.

    Unparseable timestamps become NaT (errors='coerce') and therefore NaN parts.
    """
    timestamps = pd.to_datetime(frame['time'], errors='coerce')
    return frame.drop(columns=['time']).assign(
        year=timestamps.dt.year,
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        hour=timestamps.dt.hour,
    )


# Targets: the position of each training row
y_train = processed_train_data[["latitude", "longitude"]].copy()

# Feature frames for train and test, built by the same steps so they stay aligned.
# The calendar parts are derived here but are not part of FEATURE_COLUMNS at the moment.
X_train = _add_time_features(processed_train_data.drop(columns=DROPPED_COLUMNS))
X_test = _add_time_features(processed_test_data.drop(columns=DROPPED_COLUMNS))

# Encode the categorical identifiers with ONE encoder per column, fitted on the
# union of train and test labels. Fitting separately on each frame (as before)
# gives the same vessel/port different integer codes in train and test, and
# fitting on train only makes transform() fail on labels unseen in training.
label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    train_labels = X_train[column].astype(str)
    test_labels = X_test[column].astype(str)

    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([train_labels, test_labels], ignore_index=True))

    X_train[column] = label_encoder.transform(train_labels)
    X_test[column] = label_encoder.transform(test_labels)
    label_encoders[column] = label_encoder

# Keep only the model features, in identical order for both frames
X_train = X_train[FEATURE_COLUMNS]
X_test = X_test[FEATURE_COLUMNS]

In [5]:
# Hyper-parameter set for XGBoost, written as keyword arguments.
# NOTE(review): `eta` is presumably the learning rate and `refresh_leaf` an
# updater option - confirm against the XGBoost parameter reference.
params = dict(
    gamma=0.5,
    subsample=0.6,
    n_estimators=5000,
    min_child_weight=15,
    colsample_bytree=0.8,
    max_depth=4,
    eta=0.005,
    refresh_leaf=1,
)

In [None]:
# Base learner: an XGBoost regressor constructed with no explicit arguments
xgb_reg = xgb.XGBRegressor()

# Wrap the base learner so the two targets (latitude and longitude)
# can be predicted from a single estimator object
multi_regressor = MultiOutputRegressor(estimator=xgb_reg)

# Fit on the training features and targets; the fitted wrapper is the cell output
multi_regressor.fit(X=X_train, y=y_train)

In [None]:
# Project helpers for naming and locating the submission file.
# Imported explicitly rather than via `import *` so the origin of each name is visible.
from utils import make_file_name
from settings import SUBMISSION_FODLER

# Make predictions over X_test; columns follow y_train's order: [latitude, longitude]
y_pred_default = multi_regressor.predict(X_test)

# Build the submission frame directly in its final column order.
# The ID column starts from 0 and increments by 1 for each row, which
# assumes X_test is still in the original test-file order - TODO confirm.
y_pred_default_df = pd.DataFrame({
    'ID': range(len(y_pred_default)),
    'longitude_predicted': y_pred_default[:, 1],
    'latitude_predicted': y_pred_default[:, 0],
})

# Save the predictions to a CSV file, creating the target folder first so
# a missing directory does not discard the result of a long training run
output_path = SUBMISSION_FODLER.joinpath(make_file_name() + '.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
y_pred_default_df.to_csv(output_path, index=False)

y_pred_default_df.head()