In [152]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV


In [153]:
# Separator used by the pipe-delimited input files.
PIPE_SEP = '|'

# AIS observations: the training file is pipe-delimited, while the test file
# is read with pandas' default separator (no `sep` is passed for it).
X_train = pd.read_csv('ais_train.csv', sep=PIPE_SEP)
X_test = pd.read_csv('ais_test.csv')

# Auxiliary tables (ports, vessels, schedules), all pipe-delimited.
ports = pd.read_csv('ports.csv', sep=PIPE_SEP)
vessels = pd.read_csv('vessels.csv', sep=PIPE_SEP)
schedules = pd.read_csv('schedules_to_may_2024.csv', sep=PIPE_SEP)

## Preprocessing and feature engineering

In [154]:
def preprocess(df_train, df_test):
    """Parse timestamps and integer-encode the identifier columns.

    Works on copies, so neither input frame is modified.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain the columns 'time', 'vesselId' and 'portId'.
    df_test : pandas.DataFrame
        Must contain the columns 'time' and 'vesselId'.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        * 'time' is converted with ``pd.to_datetime`` in both frames.
        * train 'vesselId' and 'portId' are replaced by ``pd.factorize`` codes.
        * test 'vesselId' is mapped through the codes learned on train; a
          vessel that does not occur in train becomes NaN.
    """
    train, test = df_train.copy(), df_test.copy()

    # Parse the timestamp column of both frames.
    for frame in (train, test):
        frame['time'] = pd.to_datetime(frame['time'])

    # Learn the vessel encoding on train ...
    vessel_codes, known_vessels = pd.factorize(train['vesselId'])
    train['vesselId'] = vessel_codes

    # ... and reuse exactly the same encoding for test.
    code_by_vessel = dict(zip(known_vessels, range(len(known_vessels))))
    test['vesselId'] = test['vesselId'].map(code_by_vessel)

    # Ports only need encoding in train.
    port_codes, _ = pd.factorize(train['portId'])
    train['portId'] = port_codes

    # Note: a filter on speed-over-ground outliers (sog <= 40) was tried at
    # this point and is currently disabled.

    return train, test

In [155]:
def feature_engineering(df_train, df_test):
    """Build the training feature table from preprocessed AIS observations.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain 'vesselId', 'time' (datetime), 'latitude', 'longitude',
        'navstat', 'cog', 'sog', 'rot', 'heading' and 'etaRaw'.
    df_test : pandas.DataFrame
        Returned as an unmodified copy.

    Returns
    -------
    (features, test) : tuple of pandas.DataFrame
        ``features`` is ordered by ('vesselId', 'time') and holds, in order:
        vesselId, latitude, longitude, last_latitude, last_longitude,
        not_under_way, under_way, cog, sog, rot, heading, etaMonth, etaDay,
        etaHour, etaMinute, month, day, hour, minute, second.
        The first observation of every vessel is dropped because it has no
        previous position.

    Raises
    ------
    ValueError
        If a kept row has an 'etaRaw' value that does not match the
        ``MM-DD HH:MM`` pattern (the integer conversion fails on NaN).

    Notes
    -----
    NOTE(review): cog/sog/rot/heading and the eta parts are taken from the
    same row as the target position, while the prediction cell feeds the last
    observed values instead. Confirm that this is intended.
    """
    train = df_train.copy()
    test = df_test.copy()

    # Base columns, ordered chronologically within every vessel.
    features = train[['vesselId', 'time', 'latitude', 'longitude']].sort_values(['vesselId', 'time'])

    # Previous position of the same vessel. The shift is taken on the *sorted*
    # frame so that "previous" means previous in time, whatever the row order
    # of the input is.
    previous = features.groupby('vesselId')[['latitude', 'longitude']].shift()
    features['last_latitude'] = previous['latitude']
    features['last_longitude'] = previous['longitude']

    # Drop the rows without a previous position (first row of every vessel).
    features = features.dropna()

    # Navigation-status flags; assignment is index-aligned, so only rows that
    # are still in `features` are picked up.
    features['not_under_way'] = train['navstat'].isin([1, 5]).astype('int64')
    features['under_way'] = train['navstat'].isin([0, 8]).astype('int64')

    # Kinematic columns copied over as-is.
    for column in ('cog', 'sog', 'rot', 'heading'):
        features[column] = train[column]

    # Calendar parts of 'etaRaw'.
    eta_columns = ['etaMonth', 'etaDay', 'etaHour', 'etaMinute']
    eta_parts = train['etaRaw'].str.extract(r'(\d{2})-(\d{2}) (\d{2}):(\d{2})')
    for position, name in enumerate(eta_columns):
        features[name] = eta_parts[position]
    # Convert only after alignment, so just the kept rows need to parse.
    features = features.astype({name: int for name in eta_columns})

    # Calendar parts of the observation time.
    for part in ('month', 'day', 'hour', 'minute', 'second'):
        features[part] = getattr(features['time'].dt, part)

    features = features.drop(columns=['time'])

    return features, test

In [156]:
# Run both stages back to back: preprocess() returns the (train, test) pair
# that feature_engineering() takes as its two positional arguments.
features, test = feature_engineering(*preprocess(X_train, X_test))

## Modelling

In [157]:
# Targets: the position to predict. Inputs: every remaining feature column.
y = features.loc[:, ['latitude', 'longitude']]
x = features.drop(['latitude', 'longitude'], axis=1)

In [158]:
# Create a random forest regressor
#rf_model = RandomForestRegressor()

# Train the model
#rf_model.fit(x,y)

In [159]:
# XGBoost regressor with default hyper-parameters, fitted on both target
# columns at once (fit() returns the fitted estimator itself).
xgb_model = xgb.XGBRegressor().fit(x, y)

## Predictions

In [160]:
def last_observed(df):
    """Return one row per 'vesselId' holding that vessel's last values.

    Uses ``GroupBy.last()``, i.e. the last non-null entry of every column in
    the row order of ``df``; 'vesselId' is returned as a regular column.
    """
    per_vessel = df.groupby('vesselId')
    latest = per_vessel.last()
    return latest.reset_index()

In [161]:
def prepare_test_for_predictions(test, features):
    """Assemble the model input for the test rows.

    Every test row receives the last observed values of its vessel (taken
    from ``features`` via ``last_observed``); the vessel's last latitude and
    longitude become 'last_latitude' / 'last_longitude'. The calendar parts
    are derived from the test row's own 'time'.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain 'vesselId', 'time' (datetime), 'scaling_factor' and 'ID'.
    features : pandas.DataFrame
        Training feature table; must contain the lag and calendar columns
        that are removed below.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'time', 'scaling_factor' and 'ID'. The inputs are
        not modified.

    NOTE(review): the model is fitted on a DataFrame, so the column order
    produced here presumably has to match the training frame. Confirm before
    reordering any step.
    """
    # Columns of the training table that are rebuilt for the test rows.
    rebuilt_columns = ['last_longitude', 'last_latitude', 'month', 'day', 'hour', 'minute', 'second']
    latest = last_observed(features).drop(columns=rebuilt_columns)

    # Left join keeps every test row, in its original order.
    prepared = test.merge(latest, on='vesselId', how='left')

    # The vessel's last known position is the model's "previous position".
    prepared = prepared.rename(columns={'longitude': 'last_longitude', 'latitude': 'last_latitude'})

    # Calendar parts of the timestamp to predict for.
    timestamps = prepared['time']
    for part in ('month', 'day', 'hour', 'minute', 'second'):
        prepared[part] = getattr(timestamps.dt, part)

    return prepared.drop(columns=['time', 'scaling_factor', 'ID'])

In [162]:
# Build the model input for the test rows. This rebinds `test`, and the
# function drops 'time' after reading it, so the cell cannot be re-run on its
# own: re-run the preprocessing / feature-engineering cell first.
test = prepare_test_for_predictions(test,features)

In [163]:
# Predict with the fitted XGBoost model (xgb_model)
predictions = xgb_model.predict(test)

In [164]:
# Build the submission frame. The model was fitted on y = [latitude, longitude],
# so the two prediction columns come back in that order.
predictions_df = pd.DataFrame(predictions, columns=['latitude_predicted', 'longitude_predicted'])

# Take the IDs from the test set instead of assuming they are exactly 0..n-1.
# The prepared test frame comes from a left merge on unique vessel rows, so it
# has the same rows in the same order as X_test; a length mismatch raises here
# rather than silently mislabelling predictions.
predictions_df['ID'] = X_test['ID'].to_numpy()
predictions_df = predictions_df[['ID', 'longitude_predicted', 'latitude_predicted']]

# Save the predictions to a CSV file (columns are already in submission order).
predictions_df.to_csv('predictions_4.csv', index=False)