# Test 5
Test whether the model should use both `heading` and `cog` as features, or only one of them. A heading-only model is compared against a model that uses both.

In [13]:
import pandas as pd
import xgboost as xgb

In [14]:
# Load the raw CSV inputs. train, vessels and ports are pipe-separated, the test
# set is comma-separated. The 'time' column of train and test is parsed to datetime.
train = pd.read_csv('ais_train.csv', sep='|').assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)

test = pd.read_csv('ais_test.csv', sep=',').assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)

vessels = pd.read_csv('vessels.csv', sep='|')

ports = pd.read_csv('ports.csv', sep='|')

In [15]:
# Replace raw IDs with dense integer codes (0, 1, 2, ... in order of first appearance).
# Port and vessel codes are derived from the training data only, so an ID that never
# occurs in train becomes NaN when the mapping is applied to ports / vessels / test.
port_codes = train['portId'].unique()
port_id_mapping = dict(zip(port_codes, range(len(port_codes))))
train['portId'] = train['portId'].map(port_id_mapping)
ports['portId'] = ports['portId'].map(port_id_mapping)

vessel_codes = train['vesselId'].unique()
vessel_id_mapping = dict(zip(vessel_codes, range(len(vessel_codes))))
train['vesselId'] = train['vesselId'].map(vessel_id_mapping)
vessels['vesselId'] = vessels['vesselId'].map(vessel_id_mapping)
test['vesselId'] = test['vesselId'].map(vessel_id_mapping)

# Shipping lines are coded from the vessels table itself
shipping_line_codes = vessels['shippingLineId'].unique()
shipping_line_id_mapping = dict(zip(shipping_line_codes, range(len(shipping_line_codes))))
vessels['shippingLineId'] = vessels['shippingLineId'].map(shipping_line_id_mapping)

In [16]:
def feature_engineering_heading(test, train):
    """Build the training table for the heading-only model (no ``cog``).

    Each output row describes one AIS message of ``train`` together with the
    three previous messages of the same vessel.

    Parameters
    ----------
    test : pandas.DataFrame
        Not used; kept so the call signature stays unchanged.
    train : pandas.DataFrame
        Must contain ``vesselId``, ``time`` (datetime), ``latitude``,
        ``longitude``, ``heading``, ``sog`` and ``navstat``. Not modified.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``vesselId, latitude, longitude, heading, sog,
        under_way``, then ``latitude_k, longitude_k, time_k`` for k = 1, 2, 3
        (position k messages earlier and seconds elapsed since it), then
        ``month, day, hour, minute, second`` of the row's own timestamp.
        Rows containing any NaN (e.g. the first three messages of a vessel)
        are dropped. The original ``train`` index labels are preserved.
    """
    # The lag features below are positional within each vessel, so the rows must be
    # in chronological order. A stable sort is a no-op for already ordered input.
    train = train.sort_values('time', kind='stable')

    features = train[['vesselId', 'time', 'latitude', 'longitude', 'heading', 'sog']].copy()

    # NOTE(review): heading, sog and under_way come from the same row as the target
    # position, whereas prepare_test_for_predictions supplies the values of the last
    # observed message. Confirm this train/predict mismatch is intended.

    # Binary flag: 1 if navstat is 0 or 8, 0 otherwise
    features['under_way'] = train['navstat'].isin([0, 8]).astype(int)

    # For each vessel: position k messages earlier and seconds elapsed since then
    by_vessel = train.groupby('vesselId')
    for lag in (1, 2, 3):
        features[f'latitude_{lag}'] = by_vessel['latitude'].shift(lag)
        features[f'longitude_{lag}'] = by_vessel['longitude'].shift(lag)
        features[f'time_{lag}'] = by_vessel['time'].diff(lag).dt.total_seconds()

    # Dropping the rows with NaN values
    features = features.dropna()

    # Split the timestamp into calendar/clock parts (the year is intentionally not kept)
    for part in ('month', 'day', 'hour', 'minute', 'second'):
        features[part] = getattr(features['time'].dt, part)

    return features.drop(columns='time')

# Build the heading-only training features, then show a preview and the table size
features_heading = feature_engineering_heading(test=test, train=train)
print(features_heading.head(), features_heading.shape, sep='\n')


     vesselId  latitude  longitude  heading   sog  under_way  latitude_1  \
439        87  51.46019    2.69617      259   1.1          0    51.45925   
442        21  18.94058  -66.48705      324  12.5          1    18.91427   
443        28  55.46484   14.91138       64  16.7          1    55.44269   
445        82  40.63020   18.28014      316  11.0          1    40.59254   
449        25  45.55134   13.73574       63   0.0          0    45.55135   

     longitude_1  time_1  latitude_2  longitude_2  time_2  latitude_3  \
439      2.69635   357.0    51.45980      2.69436  2342.0    51.45926   
442    -66.46253   600.0    18.85826    -66.41107  1880.0    18.80632   
443     14.84121   588.0    55.39951     14.71121  1680.0    55.33025   
445     18.32469  1008.0    40.56030     18.34426  1708.0    40.50311   
449     13.73573   904.0    45.55136     13.73578  2520.0    45.55133   

     longitude_3  time_3  month  day  hour  minute  second  
439      2.69441  2703.0      1    1     1 

In [21]:
def feature_engineering_both(test, train):
    """Build the training table for the model that uses both ``heading`` and ``cog``.

    Each output row describes one AIS message of ``train`` together with the
    three previous messages of the same vessel.

    Parameters
    ----------
    test : pandas.DataFrame
        Not used; kept so the call signature stays unchanged.
    train : pandas.DataFrame
        Must contain ``vesselId``, ``time`` (datetime), ``latitude``,
        ``longitude``, ``heading``, ``cog``, ``sog`` and ``navstat``.
        Not modified.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``vesselId, latitude, longitude, heading, cog, sog,
        under_way``, then ``latitude_k, longitude_k, time_k`` for k = 1, 2, 3
        (position k messages earlier and seconds elapsed since it), then
        ``month, day, hour, minute, second`` of the row's own timestamp.
        Rows containing any NaN (e.g. the first three messages of a vessel)
        are dropped. The original ``train`` index labels are preserved.
    """
    # The lag features below are positional within each vessel, so the rows must be
    # in chronological order. A stable sort is a no-op for already ordered input.
    train = train.sort_values('time', kind='stable')

    features = train[
        ['vesselId', 'time', 'latitude', 'longitude', 'heading', 'cog', 'sog']
    ].copy()

    # NOTE(review): heading, cog, sog and under_way come from the same row as the
    # target position, whereas prepare_test_for_predictions supplies the values of the
    # last observed message. Confirm this train/predict mismatch is intended.

    # Binary flag: 1 if navstat is 0 or 8, 0 otherwise
    features['under_way'] = train['navstat'].isin([0, 8]).astype(int)

    # For each vessel: position k messages earlier and seconds elapsed since then
    by_vessel = train.groupby('vesselId')
    for lag in (1, 2, 3):
        features[f'latitude_{lag}'] = by_vessel['latitude'].shift(lag)
        features[f'longitude_{lag}'] = by_vessel['longitude'].shift(lag)
        features[f'time_{lag}'] = by_vessel['time'].diff(lag).dt.total_seconds()

    # Dropping the rows with NaN values
    features = features.dropna()

    # Split the timestamp into calendar/clock parts (the year is intentionally not kept)
    for part in ('month', 'day', 'hour', 'minute', 'second'):
        features[part] = getattr(features['time'].dt, part)

    return features.drop(columns='time')

# Build the heading + cog training features, then show a preview and the table size
features_both = feature_engineering_both(test=test, train=train)
print(features_both.head(), features_both.shape, sep='\n')


     vesselId  latitude  longitude  heading    cog   sog  under_way  \
439        87  51.46019    2.69617      259  329.1   1.1          0   
442        21  18.94058  -66.48705      324  318.6  12.5          1   
443        28  55.46484   14.91138       64   60.9  16.7          1   
445        82  40.63020   18.28014      316  316.8  11.0          1   
449        25  45.55134   13.73574       63  267.7   0.0          0   

     latitude_1  longitude_1  time_1  ...  longitude_2  time_2  latitude_3  \
439    51.45925      2.69635   357.0  ...      2.69436  2342.0    51.45926   
442    18.91427    -66.46253   600.0  ...    -66.41107  1880.0    18.80632   
443    55.44269     14.84121   588.0  ...     14.71121  1680.0    55.33025   
445    40.59254     18.32469  1008.0  ...     18.34426  1708.0    40.50311   
449    45.55135     13.73573   904.0  ...     13.73578  2520.0    45.55133   

     longitude_3  time_3  month  day  hour  minute  second  
439      2.69441  2703.0      1    1     1 

In [22]:
def find_last_features(features):
    """Collapse ``features`` to one row per vessel using each group's last entry.

    The caller is expected to pass rows ordered so that the last row of every
    ``vesselId`` is its most recent observation. The result has ``vesselId`` as
    an ordinary column and a fresh 0..n-1 index.
    """
    return features.groupby('vesselId', as_index=False).last()

# Most recent feature row per vessel, for each of the two feature sets
last_features_heading = find_last_features(features_heading)
last_features_both = find_last_features(features_both)

print(last_features_heading.head(), last_features_heading.shape, sep='\n')

print(last_features_both.head(), last_features_both.shape, sep='\n')


   vesselId  latitude  longitude  heading   sog  under_way  latitude_1  \
0         0  34.57936  128.99926      219  15.5          1    34.59684   
1         1   1.24460  103.39997      306  15.7          1     1.22186   
2         2  18.13873  -69.74863      284   0.4          1    18.14185   
3         3  41.64055  143.29942       88  14.4          1    41.63254   
4         4  26.58710  121.27831       42  12.7          1    26.54636   

   longitude_1   time_1  latitude_2  longitude_2   time_2  latitude_3  \
0    129.01917    335.0    34.65578    129.09349   1535.0    34.71258   
1    103.46804   1007.0     1.20238    103.55412   2220.0     1.18040   
2    -69.74807   1250.0    18.14458    -69.74653   2470.0    18.14640   
3    141.92751  14489.0    41.63043    141.86587  15126.0    41.62621   
4    121.23948    908.0    26.49491    121.19909   1970.0    26.34699   

   longitude_3   time_3  month  day  hour  minute  second  
0    129.17443   2795.0      5    1    12      41      5

In [23]:
# Heading-only model: the current position is the target, every other column is an input
X = features_heading.drop(columns=['latitude', 'longitude'])
y = features_heading.loc[:, ['latitude', 'longitude']]

In [24]:
# Initialize the xgboost model for the heading-only feature set
# (no arguments are passed, so the library defaults are used)
model_heading = xgb.XGBRegressor()

# Fit the model on all training rows. y holds two columns (latitude, longitude),
# so this single regressor predicts both coordinates.
model_heading.fit(X, y)

In [28]:
### Prepare test data for predictions
def prepare_test_for_predictions(test, last_features, year=2024):
    """Turn the test rows into model inputs using each vessel's last known state.

    Every test row receives the last observed feature row of its vessel. The
    lag columns are shifted one step back (last position -> ``*_1``, old
    ``*_1`` -> ``*_2``, old ``*_2`` -> ``*_3``) and the time deltas are
    extended by the gap between the test timestamp and the last observation.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain ``vesselId`` and a datetime ``time`` column. Not modified.
    last_features : pandas.DataFrame
        One row per vessel as produced by ``find_last_features``: contains
        ``vesselId``, ``latitude``, ``longitude``, the ``*_1``/``*_2``/``*_3``
        lag columns, ``month, day, hour, minute, second`` and any further
        feature columns (these are passed through unchanged). Not modified.
    year : int, default 2024
        Year used to rebuild the last-seen timestamp, because the feature
        table stores no year.

    Returns
    -------
    pandas.DataFrame
        One row per test row, in test order, with a fresh 0..n-1 index and
        without ``time``/``latitude``/``longitude``. Vessels missing from
        ``last_features`` get NaN in all merged columns.
    """
    # Work on a copy so the caller's frame does not silently gain extra columns
    last_seen = last_features.copy()
    last_seen['time'] = pd.to_datetime(
        last_seen[['month', 'day', 'hour', 'minute', 'second']].assign(year=year)
    )
    last_seen = last_seen.drop(
        columns=['longitude_3', 'latitude_3', 'time_3',
                 'month', 'day', 'hour', 'minute', 'second']
    )

    # For each test row, attach the last seen features of its vessel.
    # The last-seen timestamp arrives as 'time_last' because 'time' already exists.
    prepared_test = test[['vesselId', 'time']].merge(
        last_seen, on='vesselId', how='left', suffixes=('', '_last')
    )

    # Seconds between the requested prediction time and the last observation
    seconds_since_last = (prepared_test['time'] - prepared_test['time_last']).dt.total_seconds()

    # Shift the history one step back, oldest first so nothing is overwritten
    # before it has been copied. Older time deltas grow by the new gap.
    prepared_test['latitude_3'] = prepared_test['latitude_2']
    prepared_test['longitude_3'] = prepared_test['longitude_2']
    prepared_test['time_3'] = prepared_test['time_2'] + seconds_since_last

    prepared_test['latitude_2'] = prepared_test['latitude_1']
    prepared_test['longitude_2'] = prepared_test['longitude_1']
    prepared_test['time_2'] = prepared_test['time_1'] + seconds_since_last

    prepared_test['latitude_1'] = prepared_test['latitude']
    prepared_test['longitude_1'] = prepared_test['longitude']
    prepared_test['time_1'] = seconds_since_last

    # Drop the latitude and longitude columns (they are the prediction targets)
    prepared_test = prepared_test.drop(columns=['latitude', 'longitude'])
    print(prepared_test.shape)

    # Split the test timestamp into month, day, hour, minute and second. Reading from
    # prepared_test (not test) avoids index alignment, since the merge resets the index.
    for part in ('month', 'day', 'hour', 'minute', 'second'):
        prepared_test[part] = getattr(prepared_test['time'].dt, part)

    return prepared_test.drop(columns=['time', 'time_last'])

# Build the prediction inputs for the heading-only model and preview them
test_df = prepare_test_for_predictions(test=test, last_features=last_features_heading)
print(test_df.head(), test_df.shape, sep='\n')


(51739, 15)
   vesselId  heading   sog  under_way  latitude_1  longitude_1  time_1  \
0       412      344   0.0          0    31.14647    -81.49789   900.0   
1       373      214   0.0          0    14.81694    120.29625   541.0   
2       181        6  18.7          1    38.27895     10.78280   654.0   
3         8       70   0.1          0   -43.53785    172.83522  1080.0   
4        65      275   0.3          0    48.53320     -6.12003  1258.0   

   latitude_2  longitude_2  time_2  latitude_3  longitude_3   time_3  month  \
0    31.14648    -81.49789  2156.0    31.14648    -81.49789   2880.0      5   
1    14.81694    120.29624  2303.0    14.81688    120.29630   4107.0      5   
2    38.14875     10.75635  2160.0    36.81120     10.29855  31943.0      5   
3   -43.53815    172.83516  1980.0   -43.53800    172.83608   3420.0      5   
4    48.53133     -6.10750  3231.0    48.53133     -6.10695   3269.0      5   

   day  hour  minute  second  
0    8     0       3      16  
1    8

In [26]:
# Make predictions with the heading-only model. Each output row is
# [latitude, longitude], i.e. the column order of y used when fitting.
# NOTE(review): test_df must have the same columns, in the same order, as X — confirm.
predictions = model_heading.predict(test_df)
print(predictions)


[[  31.168125  -81.17614 ]
 [  14.219945  117.36327 ]
 [  38.332436   10.950079]
 ...
 [  46.394947 -112.92542 ]
 [  56.920376   24.229898]
 [  52.982185  -30.981703]]


In [27]:
# Submission table for the heading-only model: ID, longitude, latitude.
# NOTE(review): ID is simply the row position, which assumes the test rows are
# already in submission-ID order — confirm against the test file.
predictions_df = (
    pd.DataFrame(predictions, columns=['latitude_predicted', 'longitude_predicted'])
    .assign(ID=range(len(predictions)))
    .loc[:, ['ID', 'longitude_predicted', 'latitude_predicted']]
)

# Write the submission file without the DataFrame index
predictions_df.to_csv('predictions_5_heading.csv', index=False)

In [29]:
# Heading + cog model: the current position is the target, every other column is an input
X_both = features_both.drop(columns=['latitude', 'longitude'])
y_both = features_both.loc[:, ['latitude', 'longitude']]

In [33]:
# Initialize the xgboost model for the heading + cog feature set
# (no arguments are passed, so the library defaults are used)
model_both = xgb.XGBRegressor()

# Fit the model on all training rows. y_both holds two columns (latitude, longitude),
# so this single regressor predicts both coordinates.
model_both.fit(X_both, y_both)

In [34]:
# Build the prediction inputs for the heading + cog model and preview them
test_df_both = prepare_test_for_predictions(test=test, last_features=last_features_both)
print(test_df_both.head(), test_df_both.shape, sep='\n')

(51739, 16)
   vesselId  heading    cog   sog  under_way  latitude_1  longitude_1  time_1  \
0       412      344  179.6   0.0          0    31.14647    -81.49789   900.0   
1       373      214   24.7   0.0          0    14.81694    120.29625   541.0   
2       181        6    8.0  18.7          1    38.27895     10.78280   654.0   
3         8       70  321.3   0.1          0   -43.53785    172.83522  1080.0   
4        65      275  291.0   0.3          0    48.53320     -6.12003  1258.0   

   latitude_2  longitude_2  time_2  latitude_3  longitude_3   time_3  month  \
0    31.14648    -81.49789  2156.0    31.14648    -81.49789   2880.0      5   
1    14.81694    120.29624  2303.0    14.81688    120.29630   4107.0      5   
2    38.14875     10.75635  2160.0    36.81120     10.29855  31943.0      5   
3   -43.53815    172.83516  1980.0   -43.53800    172.83608   3420.0      5   
4    48.53133     -6.10750  3231.0    48.53133     -6.10695   3269.0      5   

   day  hour  minute  seco

In [35]:
# Make predictions with the heading + cog model. Each output row is
# [latitude, longitude], i.e. the column order of y_both used when fitting.
# Note: this rebinds `predictions`, replacing the heading-only results.
predictions = model_both.predict(test_df_both)
print(predictions)


[[ 31.16654   -81.17161  ]
 [ 14.1361885 117.51841  ]
 [ 38.31263    10.94608  ]
 ...
 [ 40.825203  -95.73736  ]
 [ 59.352516   20.346525 ]
 [ 58.105045    4.246873 ]]


In [36]:
# Submission table for the heading + cog model: ID, longitude, latitude.
# NOTE(review): ID is simply the row position, which assumes the test rows are
# already in submission-ID order — confirm against the test file.
predictions_df = (
    pd.DataFrame(predictions, columns=['latitude_predicted', 'longitude_predicted'])
    .assign(ID=range(len(predictions)))
    .loc[:, ['ID', 'longitude_predicted', 'latitude_predicted']]
)

# Write the submission file without the DataFrame index
predictions_df.to_csv('predictions_5_both.csv', index=False)