# Adding 3 shifts 

Adding 3 shifts in position (latitude and longitude) and time_since

### Collecting datasets

In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np 
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler



In [None]:
train = pd.read_csv('data/datasets/ais_train.csv', sep='|')
train['time'] = pd.to_datetime(train['time'])
train.head()

test = pd.read_csv('data/datasets/ais_test.csv', sep=',')
test['time'] = pd.to_datetime(test['time'])
test.head()

vessels = pd.read_csv('data/datasets/vessels.csv', sep='|')
vessels.head()

ports = pd.read_csv('data/datasets/ports.csv', sep='|')
ports.head


In [40]:
# Mappings for IDs
port_id_mapping = {port_id: idx for idx, port_id in enumerate(train['portId'].unique())}
train['portId'] = train['portId'].map(port_id_mapping)
ports['portId'] = ports['portId'].map(port_id_mapping)

vessel_id_mapping = {vessel_id: idx for idx, vessel_id in enumerate(train['vesselId'].unique())}
train['vesselId'] = train['vesselId'].map(vessel_id_mapping)
vessels['vesselId'] = vessels['vesselId'].map(vessel_id_mapping)
test['vesselId'] = test['vesselId'].map(vessel_id_mapping)

shipping_line_id_mapping = {shipping_line_id: idx for idx, shipping_line_id in enumerate(vessels['shippingLineId'].unique())}
vessels['shippingLineId'] = vessels['shippingLineId'].map(shipping_line_id_mapping)

In [41]:
train = train.merge(ports[['portId', 'latitude', 'longitude']], how='left', left_on='portId', right_on='portId', suffixes=('', '_port'))
train.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,latitude_port,longitude_port
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,0,0,-33.5875,-71.618889
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,1,1,8.967,-79.533
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,2,2,39.2325,-76.558889
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,3,3,-34.4625,150.899444
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,4,4,35.783,-5.817


### Feature engineering 

In [42]:
def haversine(lat1, lon1, lat2, lon2):
    # Radius of the Earth in nautical miles (1 nautical mile = 1.15078 miles)
    R = 3440.065
    
    # Convert latitude and longitude from degrees to radians
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)
    
    # Haversine formula
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    
    a = np.sin(dlat / 2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    
    distance = R * c  # Distance in nautical miles
    return distance

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,latitude_port,longitude_port,favorite_port
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,0,0,-33.5875,-71.618889,170
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,1,1,8.967,-79.533,77
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,2,2,39.2325,-76.558889,2
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,3,3,-34.4625,150.899444,8
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,4,4,35.783,-5.817,2


In [7]:
def feature_engineering(train):
    train = train.copy()
    features = pd.DataFrame()

    # Add the columns vesselId, time, latitude, longitude, cog and sog to the features from train
    features['vesselId'] = train['vesselId']
    features['time'] = train['time']
    features['latitude'] = train['latitude']
    features['longitude'] = train['longitude']
    features['cog'] = train['cog']
    features['sog'] = train['sog']
    

    # Create a new binary column for 'under_way' based on the navstat column in train. 
    # 1 if navstat is 0 or 8, 0 otherwise
    features['under_way'] = train['navstat'].isin([0, 8]).astype(int)

    # Adding the columns latitude_1 (last seen latitude) and longitude_1 (last senn longitude)
    # to the features from train. For each vessel
    # Also adding time_1 (time difference from current time to last seen time)
    features['latitude_1'] = train.groupby('vesselId')['latitude'].shift(1)
    features['longitude_1'] = train.groupby('vesselId')['longitude'].shift(1)
    features['time_1'] = train.groupby('vesselId')['time'].diff().dt.total_seconds()

    # Adding the columns latitude_2 (second last seen latitude) and longitude_2 (second last seen longitude)
    # to the features from train. For each vessel
    # Also adding time_2 (time difference from current time to second last seen time)
    features['latitude_2'] = train.groupby('vesselId')['latitude'].shift(2)
    features['longitude_2'] = train.groupby('vesselId')['longitude'].shift(2)
    features['time_2'] = train.groupby('vesselId')['time'].diff(2).dt.total_seconds()

    # Adding the columns latitude_3 (third last seen latitude) and longitude_3 (third last seen longitude)
    # to the features from train. For each vessel
    # Also adding time_3 (time difference from current time to third last seen time)
    features['latitude_3'] = train.groupby('vesselId')['latitude'].shift(3)
    features['longitude_3'] = train.groupby('vesselId')['longitude'].shift(3)
    features['time_3'] = train.groupby('vesselId')['time'].diff(3).dt.total_seconds()

    # Adding the columns latitude_4 (fourth last seen latitude) and longitude_4 (fourth last seen longitude)
    # to the features from train. For each vessel
    # Also adding time_4 (time difference from current time to fourth last seen time)
    features['latitude_4'] = train.groupby('vesselId')['latitude'].shift(4)
    features['longitude_4'] = train.groupby('vesselId')['longitude'].shift(4)
    features['time_4'] = train.groupby('vesselId')['time'].diff(4).dt.total_seconds()

    # Adding the columns latitude_5 (fifth last seen latitude) and longitude_5 (fifth last seen longitude)
    # to the features from train. For each vessel
    # Also adding time_5 (time difference from current time to fifth last seen time)
    features['latitude_5'] = train.groupby('vesselId')['latitude'].shift(5)
    features['longitude_5'] = train.groupby('vesselId')['longitude'].shift(5)
    features['time_5'] = train.groupby('vesselId')['time'].diff(5).dt.total_seconds()

    # Dropping the rows with NaN values
    features = features.dropna()

    # Split the time column into month, day, hour, minute and second columns
    features['month'] = train['time'].dt.month
    features['day'] = train['time'].dt.day
    features['hour'] = train['time'].dt.hour
    features['minute'] = train['time'].dt.minute
    features['second'] = train['time'].dt.second

    features.drop('time', axis=1, inplace=True)

    return features

features = feature_engineering(train)
print(features.head())
print(features.shape)


NameError: name 'train' is not defined

In [6]:
def find_last_features(features):
    # Assuming that the last row for every vesselId is the most recent
    last_features = features.groupby('vesselId').last().reset_index()
    return last_features

last_features = find_last_features(features)
print(last_features.head())
print(last_features.shape)

NameError: name 'features' is not defined

### Train the model

In [46]:
# # Define features and target
y = features[['latitude', 'longitude']]  
X = features.drop(columns=['latitude', 'longitude']) 



In [47]:
# Initialize the xgboost model
model = xgb.XGBRegressor()

# Fit the model
model.fit(X, y)

In [48]:
print(X.head())

     vesselId    cog   sog   dist_port  under_way  latitude_1  longitude_1  \
439        87  329.1   1.1   60.878205          0    51.45925      2.69635   
442        21  318.6  12.5   36.388281          1    18.91427    -66.46253   
443        28   60.9  16.7  425.803367          1    55.44269     14.84121   
445        82  316.8  11.0   14.643477          1    40.59254     18.32469   
449        25  267.7   0.0    0.336828          0    45.55135     13.73573   

     time_1  latitude_2  longitude_2  time_2  latitude_3  longitude_3  time_3  \
439   357.0    51.45980      2.69436  2342.0    51.45926      2.69441  2703.0   
442   600.0    18.85826    -66.41107  1880.0    18.80632    -66.36373  3061.0   
443   588.0    55.39951     14.71121  1680.0    55.33025     14.57044  3060.0   
445  1008.0    40.56030     18.34426  1708.0    40.50311     18.37914  2969.0   
449   904.0    45.55136     13.73578  2520.0    45.55133     13.73575  3418.0   

     month  day  hour  minute  second  
439 

In [49]:
### Prepare test data for predictions
def prepare_test_for_predictions(test, last_features):
    test = test.copy()
    prepared_test = pd.DataFrame()

    # Create a time column in last features
    last_features['year'] = 2024
    last_features['time'] = pd.to_datetime(
        last_features[['year', 'month', 'day', 'hour', 'minute', 'second']]
    )
    last_features =last_features.drop(columns=['longitude_3', 'latitude_3', 'year', 'month', 'day', 'hour', 'minute', 'second', 'time_3']).copy()


    # Add the columns vesselId and time
    prepared_test['vesselId'] = test['vesselId']
    prepared_test['time'] = test['time']

    # For each vessel, add the last seen features to the prepared test
    prepared_test = prepared_test.merge(last_features, on='vesselId', how='left', suffixes=('', '_last'))

    # Add the empty columns latitude_3, longitude_3 and time_3 to the prepared test
    prepared_test['latitude_3'] = None
    prepared_test['longitude_3'] = None
    prepared_test['time_3'] = None

    # For each row, latitude_3 should be the same as latitude_2, longitude_3 should be the same as longitude_2
    # and time_3 should be the same as time_2
    prepared_test['latitude_3'] = prepared_test['latitude_2']
    prepared_test['longitude_3'] = prepared_test['longitude_2']
    prepared_test['time_3'] = prepared_test['time_2']

    # Same for latitude_2, longitude_2 and time_2
    prepared_test['latitude_2'] = prepared_test['latitude_1']
    prepared_test['longitude_2'] = prepared_test['longitude_1']
    prepared_test['time_2'] = prepared_test['time_1']

    # Same for latitude_1, longitude_1 and time_1
    prepared_test['latitude_1'] = prepared_test['latitude']
    prepared_test['longitude_1'] = prepared_test['longitude']

    # Time_1 should be the difference between the current time and the last seen time
    prepared_test['time_1'] = (prepared_test['time'] - prepared_test['time_last']).dt.total_seconds()

    # For each time column (time_2, time_3), add the rows value in time_1
    prepared_test['time_2'] += prepared_test['time_1']
    prepared_test['time_3'] += prepared_test['time_1']

    # Drop the the latitude and longitude columns
    prepared_test.drop(columns=['latitude', 'longitude'], inplace=True)
    print(prepared_test.shape)

    # Split the time column into month, day, hour, minute and second columns
    prepared_test['month'] = test['time'].dt.month
    prepared_test['day'] = test['time'].dt.day
    prepared_test['hour'] = test['time'].dt.hour
    prepared_test['minute'] = test['time'].dt.minute
    prepared_test['second'] = test['time'].dt.second

    prepared_test.drop('time', axis=1, inplace=True)
    prepared_test.drop('time_last', axis=1, inplace=True)

    return prepared_test

test_df = prepare_test_for_predictions(test, last_features)
print(test_df.head())
print(test_df.shape)


(51739, 16)
   vesselId    cog   sog   dist_port  under_way  latitude_1  longitude_1  \
0       412  179.6   0.0    0.360598          0    31.14647    -81.49789   
1       373   24.7   0.0    1.103930          0    14.81694    120.29625   
2       181    8.0  18.7  233.869216          1    38.27895     10.78280   
3         8  321.3   0.1    6.606219          0   -43.53785    172.83522   
4        65  291.0   0.3   66.171108          0    48.53320     -6.12003   

   time_1  latitude_2  longitude_2  time_2  latitude_3  longitude_3   time_3  \
0   900.0    31.14648    -81.49789  2156.0    31.14648    -81.49789   2880.0   
1   541.0    14.81694    120.29624  2303.0    14.81688    120.29630   4107.0   
2   654.0    38.14875     10.75635  2160.0    36.81120     10.29855  31943.0   
3  1080.0   -43.53815    172.83516  1980.0   -43.53800    172.83608   3420.0   
4  1258.0    48.53133     -6.10750  3231.0    48.53133     -6.10695   3269.0   

   month  day  hour  minute  second  
0      5    

In [50]:
# Make predictions
predictions = model.predict(test_df)
print(predictions)


[[ 31.147839 -81.46959 ]
 [ 13.966731 118.50156 ]
 [ 38.28444   11.024568]
 ...
 [ 49.435986 -96.04315 ]
 [ 51.369675  24.064001]
 [ 60.75761   13.714484]]


In [51]:
# Create a DataFrame with the required format
predictions_df = pd.DataFrame(predictions, columns=['latitude_predicted', 'longitude_predicted'])
predictions_df['ID'] = range(len(predictions_df))
predictions_df = predictions_df[['ID', 'longitude_predicted', 'latitude_predicted']]

# Save to CSV
predictions_df.to_csv('data/datasets/predictions_1.csv', index=False)