In [None]:
from settings import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
def process_data(train_data_path, test_data_path, delimiter="|"):
    """Load the AIS train/test CSVs and attach lagged-position features.

    Parameters
    ----------
    train_data_path : path-like
        Training CSV, parsed with ``delimiter``. Must contain ``time``,
        ``vesselId``, ``latitude`` and ``longitude``.
    test_data_path : path-like
        Test CSV, parsed with pandas' default separator (``delimiter`` is not
        applied to it). Must contain ``ID``, ``scaling_factor``, ``vesselId``
        and ``time``.
    delimiter : str, default "|"
        Field separator of the training file only.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        * The training rows sorted by ``vesselId`` then ``time``, with
          ``previous_lat`` / ``previous_lon`` holding the preceding row's
          position for the same vessel (NaN on each vessel's first row).
        * The test rows (without ``ID`` and ``scaling_factor``) left-joined on
          ``vesselId`` with one summary row per training vessel. Vessels that
          are absent from the training data get NaN in the joined columns.

    Notes
    -----
    NOTE(review): ``GroupBy.last`` returns the last *non-null* value of each
    column, so the per-vessel summary row can combine values from different
    timestamps when the final rows contain NaNs -- confirm this is intended.

    NOTE(review): the ``previous_lat`` / ``previous_lon`` carried into the test
    frame are the lag values of the last training row, i.e. the position
    *before* the most recent one -- confirm this is the intended test feature.
    """
    raw_train = pd.read_csv(train_data_path, delimiter=delimiter)
    raw_test = pd.read_csv(test_data_path)

    # Submission bookkeeping columns are not model inputs.
    test_rows = raw_test.drop(columns=["ID", "scaling_factor"])

    # Chronological order inside each vessel is required for the lag below.
    ordered = raw_train.assign(time=pd.to_datetime(raw_train["time"])).sort_values(
        by=["vesselId", "time"], ascending=True
    )

    # Shift both coordinates at once; the shift never crosses vessel boundaries.
    lagged = ordered.groupby("vesselId")[["latitude", "longitude"]].shift(1)
    train_data_sorted = ordered.assign(
        previous_lat=lagged["latitude"],
        previous_lon=lagged["longitude"],
    )

    last_entries = train_data_sorted.groupby("vesselId").last().reset_index()

    # Both frames have a `time` column, so the merge suffixes them; keep the
    # test timestamp (`time_x`) and discard the training one (`time_y`).
    test_data_merged = (
        test_rows.merge(last_entries, on="vesselId", how="left")
        .drop(columns=["time_y"])
        .rename(columns={"time_x": "time"})
    )

    return train_data_sorted, test_data_merged

train_data_path = AIS_TRAIN
test_data_path = AIS_TEST

processed_train_data, processed_test_data = process_data(train_data_path, test_data_path)

display(processed_test_data.head())

# Targets: the position reported on each training row.
y_train = processed_train_data[["latitude", "longitude"]].copy()

# Single source of truth for the model inputs, so the train and test matrices
# can never drift apart in column set or column order.
FEATURE_COLUMNS = ['cog', 'sog', 'previous_lat', 'previous_lon', 'portId', 'heading', 'vesselId']

# Select the inputs up front; `.copy()` keeps the encoding below from writing
# into (or warning about) the processed frames.
X_train = processed_train_data[FEATURE_COLUMNS].copy()
X_test = processed_test_data[FEATURE_COLUMNS].copy()

# Encode the categorical identifiers as integers. Each column gets its own
# encoder so that both mappings stay available afterwards (e.g. for
# `inverse_transform`); a single shared encoder would forget the vesselId
# mapping as soon as it is refitted on portId.
# The encoders are fitted on the training data only. `transform` raises
# ValueError if the test frame contains a label that never occurs in training.
# `astype(str)` turns missing values into the string 'nan', which is then
# encoded like any other label.
label_encoders = {}
for column in ('vesselId', 'portId'):
    encoder = LabelEncoder()
    X_train[column] = encoder.fit_transform(X_train[column].astype(str))
    X_test[column] = encoder.transform(X_test[column].astype(str))
    label_encoders[column] = encoder

# Backward-compatible alias: this name used to end up fitted on portId.
label_encoder = label_encoders['portId']


In [None]:
# Fixed hyper-parameters for the baseline model.
# NOTE(review): presumably the winner of an earlier grid-search run -- confirm.
best_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'n_estimators': 100,
    'subsample': 1.0,
}

# Actually apply the fixed parameters. Constructing XGBRegressor() without them
# silently trains with the library defaults and leaves `best_params` unused.
# A later GridSearchCV over this estimator is unaffected as long as its grid
# sets every one of these parameters explicitly.
xreg = xgb.XGBRegressor(**best_params)

# MultiOutputRegressor fits one independent copy of `xreg` per target column
# of y_train.
multi_regressor = MultiOutputRegressor(xreg)
multi_regressor.fit(X_train, y_train)

In [None]:
# Candidate values for the XGBRegressor wrapped by `multi_regressor`; the
# `estimator__` prefix routes each setting through the MultiOutputRegressor to
# its inner estimator.
# Search size: 3 * 3 * 3 * 2 * 2 * 2 = 216 combinations, each cross-validated
# on 5 folds (1080 fits before the final refit).
param_grid = {
    'estimator__n_estimators': [100, 200, 300],
    'estimator__max_depth': [4, 5, 6],
    'estimator__learning_rate': [0.01, 0.1, 0.005],
    'estimator__subsample': [0.9, 1.0],
    'estimator__min_child_weight': [1, 2],
    'estimator__colsample_bytree': [0.7, 0.8],
}

# No `scoring` is given, so candidates are ranked with the estimator's own
# `score` method. n_jobs=-1 uses every available core.
grid_search = GridSearchCV(
    multi_regressor,
    param_grid,
    cv=5,
    verbose=10,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

In [None]:

# Predict on X_test with the best estimator found by the grid search.
# To use the fixed-parameter model instead, swap in:
#     y_pred_default = multi_regressor.predict(X_test)
y_pred_default = grid_search.predict(X_test)

# Build the submission table in one pass:
#   * label the two prediction columns in target order (latitude, longitude),
#   * add a 0-based running ID, one per row,
#   * put the columns in the order ID, longitude, latitude.
# NOTE(review): the ID is regenerated from the row position rather than taken
# from the test file -- confirm the test file's IDs are 0..n-1 in file order.
y_pred_default_df = (
    pd.DataFrame(y_pred_default, columns=['latitude_predicted', 'longitude_predicted'])
    .assign(ID=lambda frame: range(len(frame)))
    [['ID', 'longitude_predicted', 'latitude_predicted']]
)

# Write the predictions to a CSV file in the submission folder.
from utils import make_file_name
from settings import *

output_path = SUBMISSION_FODLER.joinpath(make_file_name() + '.csv')
y_pred_default_df.to_csv(output_path, index=False)

y_pred_default_df.head()