# Test 7
With ETA and distance to port \\

For some reason, the port longitude and latitude are the same.

In [34]:
import pandas as pd
import xgboost as xgb

In [35]:
# AIS position reports: the training file is pipe-separated, the test file is
# comma-separated. In both, the raw 'time' strings are parsed into datetimes.
train = pd.read_csv('ais_train.csv', sep='|').assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)

test = pd.read_csv('ais_test.csv', sep=',').assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)

# Static lookup tables (both pipe-separated).
vessels = pd.read_csv('vessels.csv', sep='|')

ports = pd.read_csv('ports.csv', sep='|')
print(ports.head())

                     portId               name portLocation   longitude  \
0  61d36ed80a1807568ff9a064    Port of Algiers      Algiers    3.067222   
1  61d36ed80a1807568ff9a065     Port of Annaba       Annaba    7.772500   
2  61d36edf0a1807568ff9a070       Port of Oran         Oran   -0.639722   
3  61d36ee00a1807568ff9a072     Port of Skikda       Skikda    6.905833   
4  61d36ee10a1807568ff9a074  Port of Pago-Pago    Pago-Pago -170.690556   

    latitude UN_LOCODE     countryName ISO  
0  36.773611     DZALG         Algeria  DZ  
1  36.900556     DZAAE         Algeria  DZ  
2  35.712222     DZORN         Algeria  DZ  
3  36.887500     DZSKI         Algeria  DZ  
4 -14.274167     ASPPG  American Samoa  AS  


In [36]:
# Mappings for IDs
# Each section replaces string identifiers with integer codes. A code is the position
# at which the identifier first appears in the column the mapping is built from.
# The frames are overwritten in place, so the order of the statements matters.

# Port codes are built from ports.csv and applied to both train and ports.
# Any train portId that is not a key of the mapping (e.g. a missing value) becomes NaN.
port_id_mapping = {port_id: idx for idx, port_id in enumerate(ports['portId'].unique())}
train['portId'] = train['portId'].map(port_id_mapping)
ports['portId'] = ports['portId'].map(port_id_mapping)

# Vessel codes are built from the training data only, so a vessel in vessels.csv or
# ais_test.csv that never occurs in train is mapped to NaN.
# The mapping must be built before train['vesselId'] is overwritten on the next line.
vessel_id_mapping = {vessel_id: idx for idx, vessel_id in enumerate(train['vesselId'].unique())}
train['vesselId'] = train['vesselId'].map(vessel_id_mapping)
vessels['vesselId'] = vessels['vesselId'].map(vessel_id_mapping)
test['vesselId'] = test['vesselId'].map(vessel_id_mapping)

# Shipping-line codes are built from, and applied to, vessels.csv only.
shipping_line_id_mapping = {shipping_line_id: idx for idx, shipping_line_id in enumerate(vessels['shippingLineId'].unique())}
vessels['shippingLineId'] = vessels['shippingLineId'].map(shipping_line_id_mapping)

In [52]:
import pandas as pd
from datetime import timedelta
import geopy.distance

def merge_with_ports(features, ports):
    """Attach port coordinates to every feature row and add the distance to that port.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain 'portId', 'latitude_1' and 'longitude_1'. Not modified.
    ports : pandas.DataFrame
        Must contain 'portId', 'longitude' and 'latitude'. Not modified.

    Returns
    -------
    pandas.DataFrame
        The feature rows joined with 'portLongitude' and 'portLatitude', plus
        'distance_to_port' in nautical miles, measured from
        ('latitude_1', 'longitude_1') to the port. Rows without a portId, and
        rows with a missing value in any column after the join, are removed.
    """
    # Keep only the join key and the coordinates, renamed to portLongitude/portLatitude.
    port_coordinates = (
        ports
        .drop(columns=['name', 'portLocation', 'UN_LOCODE', 'countryName', 'ISO'], errors='ignore')
        .rename(columns={'longitude': 'portLongitude', 'latitude': 'portLatitude'})
    )

    # Left join on portId. The final dropna() has no subset, so a NaN in any column
    # (not only in the coordinates) removes the row.
    merged = (
        features
        .copy()
        .dropna(subset=['portId'])
        .merge(port_coordinates, on='portId', how='left')
        .dropna()
    )

    def _nautical_miles_to_port(row):
        vessel_position = (row['latitude_1'], row['longitude_1'])
        port_position = (row['portLatitude'], row['portLongitude'])
        return geopy.distance.distance(vessel_position, port_position).nm

    merged['distance_to_port'] = merged.apply(_nautical_miles_to_port, axis=1)

    return merged

# Function to convert 'etaRaw' to datetime format, handling year rollovers
def convert_eta_raw(row):
    """Turn the year-less 'etaRaw' string of a row into a full timestamp.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'time' (a timestamp) and 'etaRaw' (text in 'MM-DD HH:MM' form).

    Returns
    -------
    pandas.Timestamp
        The ETA in the year of row['time'], or in the following year when that
        would lie before row['time']. ``NaT`` when 'etaRaw' cannot be parsed.
    """
    observed_at = row['time']
    eta = pd.to_datetime(
        f"{observed_at.year}-{row['etaRaw']}",
        format='%Y-%m-%d %H:%M',
        errors='coerce',
    )

    # An ETA earlier than the observation is taken to belong to the next year.
    # A calendar offset is used instead of a fixed 365 days, because 365 days lands
    # one day early whenever the span crosses a leap day.
    if pd.notna(eta) and eta < observed_at:
        eta = eta + pd.DateOffset(years=1)
    return eta

def feature_engineering_heading(test, train):
    """Build the training feature table for the position model from raw AIS rows.

    Each output row holds the vessel and port ids, the current position, course
    and speed, an 'under_way' flag, the three previous positions of the same
    vessel with the seconds elapsed since each of them, the seconds left until
    the reported ETA, the month/day/hour/minute of the observation, and the port
    columns added by ``merge_with_ports``.

    Parameters
    ----------
    test : pandas.DataFrame
        Not used; kept so existing callers keep working.
    train : pandas.DataFrame
        AIS rows with 'vesselId', 'portId', 'time' (datetime), 'etaRaw',
        'latitude', 'longitude', 'cog', 'sog' and 'navstat'. Not modified.

    Returns
    -------
    pandas.DataFrame
        The rows that have a portId and survive ``merge_with_ports``.

    Notes
    -----
    The port table is read from the module-level ``ports`` variable.
    """
    # The per-vessel shift/diff below only yield "previous observation" values when
    # rows are in chronological order, so enforce it here. The sort is stable, so
    # input that is already ordered keeps exactly the same row order.
    train = train.sort_values('time', kind='stable')

    features = train[['vesselId', 'portId', 'time', 'etaRaw',
                      'latitude', 'longitude', 'cog', 'sog']].copy()

    # 1 if navstat is 0 or 8, 0 otherwise
    features['under_way'] = train['navstat'].isin([0, 8]).astype(int)

    # For each of the three previous observations of the same vessel: its position
    # and the seconds between it and the current row.
    by_vessel = train.groupby('vesselId')
    for lag in (1, 2, 3):
        features[f'latitude_{lag}'] = by_vessel['latitude'].shift(lag)
        features[f'longitude_{lag}'] = by_vessel['longitude'].shift(lag)
        features[f'time_{lag}'] = by_vessel['time'].diff(lag).dt.total_seconds()

    # Seconds from the observation until the reported ETA.
    eta = features.apply(convert_eta_raw, axis=1)
    features['time_difference_eta'] = (eta - features['time']).dt.total_seconds()

    # Calendar parts of the observation time (the year and seconds are not kept).
    timestamps = features['time']
    features['month'] = timestamps.dt.month
    features['day'] = timestamps.dt.day
    features['hour'] = timestamps.dt.hour
    features['minute'] = timestamps.dt.minute

    # Drop the raw time columns and the rows without a port, then store the port
    # id as an integer. Chaining returns a new frame instead of assigning into a
    # filtered one.
    features = (
        features
        .drop(columns=['time', 'etaRaw'])
        .dropna(subset=['portId'])
        .astype({'portId': int})
    )

    return merge_with_ports(features, ports)

# Build the training feature table and preview its first rows and its shape
features_heading = feature_engineering_heading(test, train)
print(features_heading.head(), features_heading.shape, sep='\n')


     vesselId  portId  latitude  longitude    cog   sog  under_way  \
436        87      32  51.46019    2.69617  329.1   1.1          0   
439        21     427  18.94058  -66.48705  318.6  12.5          1   
440        28     135  55.46484   14.91138   60.9  16.7          1   
442        82     238  40.63020   18.28014  316.8  11.0          1   
446        25     451  45.55134   13.73574  267.7   0.0          0   

     latitude_1  longitude_1  time_1  ...  longitude_3  time_3  \
436    51.45925      2.69635   357.0  ...      2.69441  2703.0   
439    18.91427    -66.46253   600.0  ...    -66.36373  3061.0   
440    55.44269     14.84121   588.0  ...     14.57044  3060.0   
442    40.59254     18.32469  1008.0  ...     18.37914  2969.0   
446    45.55135     13.73573   904.0  ...     13.73575  3418.0   

     time_difference_eta  month  day  hour  minute  portLongitude  \
436           31561153.0      1    1     1       0       4.299722   
439           26303166.0      1    1     1  

In [53]:
def find_last_features(features):
    """Return the last row of every vessel.

    Assumes the rows of each vessel are ordered oldest to newest, so the last
    row is the most recent observation.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain a 'vesselId' column. Not modified.

    Returns
    -------
    pandas.DataFrame
        One row per vesselId, sorted by vesselId, with a fresh 0..n-1 index and
        'vesselId' as the first column. Rows without a vesselId are ignored.
    """
    # GroupBy.last() picks the last non-null value of each column separately, which
    # can stitch together values from different observations. Taking the last row
    # per vessel keeps every value from the same observation.
    last_features = (
        features
        .dropna(subset=['vesselId'])
        .drop_duplicates(subset='vesselId', keep='last')
        .sort_values('vesselId')
        .reset_index(drop=True)
    )

    # Same column layout as groupby(...).last().reset_index(): key column first.
    ordered_columns = ['vesselId'] + [name for name in last_features.columns if name != 'vesselId']
    return last_features[ordered_columns]

# Most recent feature row of every vessel, with a preview and the shape
last_features_heading = find_last_features(features_heading)
print(last_features_heading.head(), last_features_heading.shape, sep='\n')

   vesselId  portId  latitude  longitude    cog   sog  under_way  latitude_1  \
0         0     328  34.57936  128.99926  221.5  15.5          1    34.59684   
1         1     513   1.24460  103.39997  305.1  15.7          1     1.22186   
2         2     124  18.13873  -69.74863  176.0   0.4          1    18.14185   
3         3     292  41.64055  143.29942   87.6  14.4          1    41.63254   
4         4     665  26.58710  121.27831   39.1  12.7          1    26.54636   

   longitude_1   time_1  ...  longitude_3   time_3  time_difference_eta  \
0    129.01917    335.0  ...    129.17443   2795.0              83883.0   
1    103.46804   1007.0  ...    103.64275   3431.0            2310479.0   
2    -69.74807   1250.0  ...    -69.74353   3710.0              39658.0   
3    141.92751  14489.0  ...    141.75177  16296.0            1204235.0   
4    121.23948    908.0  ...    121.12455   4718.0           31319781.0   

   month  day  hour  minute  portLongitude  portLatitude  distance_t

In [54]:
# Split the feature table for the heading model:
# X holds every column except the current position, y holds the current position.
X = features_heading.drop(['latitude', 'longitude'], axis=1)
y = features_heading.loc[:, ['latitude', 'longitude']]

In [55]:
# XGBoost regressor with default hyper-parameters
model_heading = xgb.XGBRegressor()

# Train on the feature table; y holds both the latitude and the longitude column
model_heading.fit(X=X, y=y)

In [56]:
### Prepare test data for predictions
def prepare_test_for_predictions(test, last_features, year=2024):
    """Build model inputs for the test rows from each vessel's last known feature row.

    The last known observation of a vessel becomes the "previous observation"
    (lag 1) of every test row of that vessel; the older lags move one step back
    and all elapsed times are extended by the gap between the last observation
    and the test timestamp.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain 'vesselId' and 'time' (datetime). Not modified.
    last_features : pandas.DataFrame
        One row per vessel as returned by ``find_last_features``, containing the
        position, lag, ETA, calendar and port columns. Not modified.
    year : int, default 2024
        Year used to rebuild the timestamp of the last observation, because the
        feature table only keeps month, day, hour and minute.

    Returns
    -------
    pandas.DataFrame
        One row per test row, without 'time', 'latitude' or 'longitude'. Vessels
        missing from ``last_features`` get NaN in every carried-over column.
    """
    # Work on a copy so the reconstructed 'time' column does not leak into the
    # caller's frame.
    last_seen = last_features.copy()

    # Timestamp of the last observation, rebuilt from its calendar parts
    # (seconds are not available, so it is truncated to the minute).
    last_seen['time'] = pd.to_datetime(
        last_seen[['month', 'day', 'hour', 'minute']].assign(year=year)
    )
    last_seen = last_seen.drop(
        columns=['longitude_3', 'latitude_3', 'month', 'day', 'hour', 'minute', 'time_3']
    )

    # One output row per test row. Both sides have a 'time' column, so the one
    # from the last observation is suffixed to 'time_last' by the merge.
    prepared_test = test[['vesselId', 'time']].merge(
        last_seen, on='vesselId', how='left', suffixes=('', '_last')
    )

    # Move the history one step back. The order matters: lag 3 must be filled
    # from lag 2 before lag 2 is overwritten, and so on.
    prepared_test['latitude_3'] = prepared_test['latitude_2']
    prepared_test['longitude_3'] = prepared_test['longitude_2']
    prepared_test['time_3'] = prepared_test['time_2']

    prepared_test['latitude_2'] = prepared_test['latitude_1']
    prepared_test['longitude_2'] = prepared_test['longitude_1']
    prepared_test['time_2'] = prepared_test['time_1']

    prepared_test['latitude_1'] = prepared_test['latitude']
    prepared_test['longitude_1'] = prepared_test['longitude']

    # Seconds between the last observation and the test timestamp. This is the
    # new time_1; the older lags are that much further away and the ETA that much closer.
    elapsed = (prepared_test['time'] - prepared_test['time_last']).dt.total_seconds()
    prepared_test['time_1'] = elapsed
    prepared_test['time_2'] += elapsed
    prepared_test['time_3'] += elapsed
    prepared_test['time_difference_eta'] -= elapsed

    # NOTE(review): 'distance_to_port' is carried over unchanged, i.e. it is still
    # measured from the old latitude_1/longitude_1 rather than from the new ones.
    # Confirm whether it should be recomputed here.

    # The current position is what the model predicts, so it is not an input.
    prepared_test = prepared_test.drop(columns=['latitude', 'longitude'])
    print(prepared_test.shape)

    # Calendar parts of the test timestamp, taken from the merged frame itself so
    # the result does not depend on the index of `test`.
    timestamps = prepared_test['time']
    prepared_test['month'] = timestamps.dt.month
    prepared_test['day'] = timestamps.dt.day
    prepared_test['hour'] = timestamps.dt.hour
    prepared_test['minute'] = timestamps.dt.minute

    return prepared_test.drop(columns=['time', 'time_last'])

# Model inputs for the test rows, with a preview and the shape
test_df = prepare_test_for_predictions(test, last_features_heading)
print(test_df.head(), test_df.shape, sep='\n')


(51739, 20)
   vesselId  portId    cog   sog  under_way  latitude_1  longitude_1  time_1  \
0       412     546  179.6   0.0          0    31.14647    -81.49789   916.0   
1       373     412   24.7   0.0          0    14.81694    120.29625   557.0   
2       181     231    8.0  18.7          1    38.27895     10.78280   662.0   
3         8     378  321.3   0.1          0   -43.53785    172.83522  1114.0   
4        65     141  291.0   0.3          0    48.53320     -6.12003  1287.0   

   latitude_2  longitude_2  ...  portLongitude  portLatitude  \
0    31.14648    -81.49789  ...     -81.496667     31.140556   
1    14.81694    120.29624  ...     120.279444     14.808333   
2    38.14875     10.75635  ...      11.780833     42.098889   
3   -43.53815    172.83516  ...     172.716111    -43.606111   
4    48.53133     -6.10750  ...      -4.474167     48.380556   

   distance_to_port  latitude_3  longitude_3   time_3  month  day  hour  \
0          0.360194    31.14648    -81.49789   

In [57]:
# Put the test columns in exactly the order the model was trained on.
# Taking the order from X keeps the two in step when features are added or removed,
# instead of maintaining a second hand-written copy of the column list.
test_df = test_df[list(X.columns)]

In [58]:
# Predict (latitude, longitude) for every prepared test row and show the result and its shape
predictions = model_heading.predict(test_df)
print(predictions, predictions.shape, sep='\n')

[[ 31.309216  -80.85144  ]
 [ 15.068497  119.760345 ]
 [ 38.442757   10.959701 ]
 ...
 [ 41.218758  119.95951  ]
 [ 57.122337   19.295681 ]
 [ 29.33161     3.5562572]]
(51739, 2)


In [59]:
# Assemble the submission table: a running ID followed by the predicted longitude
# and latitude (the model outputs latitude in column 0 and longitude in column 1).
predictions_df = pd.DataFrame({
    'ID': range(len(predictions)),
    'longitude_predicted': predictions[:, 1],
    'latitude_predicted': predictions[:, 0],
})

# Write the submission file without the index column
predictions_df.to_csv('predictions_8_portId.csv', index=False)