### Adding homeport from vessel 

In [90]:
import pandas as pd
import xgboost as xgb
import numpy as np


In [91]:
train = pd.read_csv('data/datasets/ais_train.csv', sep='|')
train['time'] = pd.to_datetime(train['time'])
train.info()
train.head()

test = pd.read_csv('data/datasets/ais_test.csv', sep=',')
test['time'] = pd.to_datetime(test['time'])
test.head()

vessels = pd.read_csv('data/datasets/vessels.csv', sep='|')
vessels.head()

ports = pd.read_csv('data/datasets/ports.csv', sep='|')
ports.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522065 entries, 0 to 1522064
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   time       1522065 non-null  datetime64[ns]
 1   cog        1522065 non-null  float64       
 2   sog        1522065 non-null  float64       
 3   rot        1522065 non-null  int64         
 4   heading    1522065 non-null  int64         
 5   navstat    1522065 non-null  int64         
 6   etaRaw     1522065 non-null  object        
 7   latitude   1522065 non-null  float64       
 8   longitude  1522065 non-null  float64       
 9   vesselId   1522065 non-null  object        
 10  portId     1520450 non-null  object        
dtypes: datetime64[ns](1), float64(4), int64(3), object(3)
memory usage: 127.7+ MB


Unnamed: 0,portId,name,portLocation,longitude,latitude,UN_LOCODE,countryName,ISO
0,61d36ed80a1807568ff9a064,Port of Algiers,Algiers,3.067222,36.773611,DZALG,Algeria,DZ
1,61d36ed80a1807568ff9a065,Port of Annaba,Annaba,7.7725,36.900556,DZAAE,Algeria,DZ
2,61d36edf0a1807568ff9a070,Port of Oran,Oran,-0.639722,35.712222,DZORN,Algeria,DZ
3,61d36ee00a1807568ff9a072,Port of Skikda,Skikda,6.905833,36.8875,DZSKI,Algeria,DZ
4,61d36ee10a1807568ff9a074,Port of Pago-Pago,Pago-Pago,-170.690556,-14.274167,ASPPG,American Samoa,AS


In [92]:
port_id_mapping = {port_id: idx for idx, port_id in enumerate(train['portId'].unique())}
train['portId'] = train['portId'].map(port_id_mapping)
ports['portId'] = ports['portId'].map(port_id_mapping)

vessel_id_mapping = {vessel_id: idx for idx, vessel_id in enumerate(train['vesselId'].unique())}
train['vesselId'] = train['vesselId'].map(vessel_id_mapping)
vessels['vesselId'] = vessels['vesselId'].map(vessel_id_mapping)
test['vesselId'] = test['vesselId'].map(vessel_id_mapping)

shipping_line_id_mapping = {shipping_line_id: idx for idx, shipping_line_id in enumerate(vessels['shippingLineId'].unique())}
vessels['shippingLineId'] = vessels['shippingLineId'].map(shipping_line_id_mapping)

In [93]:
vessels.head()

Unnamed: 0,shippingLineId,vesselId,CEU,DWT,GT,NT,vesselType,breadth,depth,draft,enginePower,freshWater,fuel,homePort,length,maxHeight,maxSpeed,maxWidth,rampCapacity,yearBuilt
0,0,599.0,6500,21200.0,58684,17606.0,83.0,32.0,22.2,,0.0,,,OSLO,199.0,5.0,18.6,15.2,150.0,2000
1,1,65.0,4902,12325.0,46800,,83.0,31.0,,,14220.0,,,MONROVIA,182.0,,,,,2006
2,2,640.0,5000,13059.0,46800,,83.0,31.0,,,14220.0,,,SAINT JOHN'S,182.0,,,,,2010
3,3,255.0,4200,12588.0,39362,,83.0,28.0,,,11060.0,,,,167.0,,,,,2011
4,4,68.0,7450,21052.0,75528,24391.0,83.0,37.2,22.23,,13140.0,491.47,3236.78,Panama,199.98,,,,,2018


In [94]:
train = train.merge(ports[['portId', 'latitude', 'longitude']], how='left', left_on='portId', right_on='portId', suffixes=('', '_port'))
train.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,latitude_port,longitude_port
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,0,0,-33.5875,-71.618889
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,1,1,8.967,-79.533
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,2,2,39.2325,-76.558889
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,3,3,-34.4625,150.899444
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,4,4,35.783,-5.817


In [95]:


# Count the frequency of each port visit for each vessel
port_visits = train.groupby(['vesselId', 'portId']).size().reset_index(name='visit_count')

# Sort by vesselId and visit_count to see the most visited ports
sorted_port_visits = port_visits.sort_values(by=['vesselId', 'visit_count'], ascending=[True, False])

favorite_ports = sorted_port_visits.groupby('vesselId').first().reset_index()
favorite_ports = favorite_ports[['vesselId', 'portId']]
favorite_ports.columns = ['vesselId', 'favorite_port']

# Merge favorite_ports with train based on vesselId
train = train.merge(favorite_ports, on='vesselId', how='left')
# Display the result
train.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,latitude_port,longitude_port,favorite_port
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,0,0,-33.5875,-71.618889,170
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,1,1,8.967,-79.533,77
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,2,2,39.2325,-76.558889,2
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,3,3,-34.4625,150.899444,8
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,4,4,35.783,-5.817,2


In [96]:
train = train.merge(vessels[['vesselId', 'length', 'shippingLineId']], on='vesselId', how='left')
train['vessel_deep_sea'] = np.where(train['length'] > 200, 1, 0)
train.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,latitude_port,longitude_port,favorite_port,length,shippingLineId,vessel_deep_sea
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,0,0,-33.5875,-71.618889,170,199.0,9,0
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,1,1,8.967,-79.533,77,199.97,6,0
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,2,2,39.2325,-76.558889,2,199.0,14,0
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,3,3,-34.4625,150.899444,8,199.0,5,0
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,4,4,35.783,-5.817,2,199.95,6,0


In [97]:
# def haversine(lat1, lon1, lat2, lon2):
#     # Convert latitude and longitude from degrees to radians
#     lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

#     # Haversine formula
#     dlon = lon2 - lon1 
#     dlat = lat2 - lat1 
#     a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
#     c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a)) 
#     r = 6371  # Radius of Earth in kilometers
#     return r * c  # Distance in kilometers

### Feature engineering 

In [98]:
def feature_engineering(train):
    train = train.copy()
    features = pd.DataFrame()

    # Add the columns vesselId, time, latitude, longitude, cog and sog to the features from train
    features['vesselId'] = train['vesselId']
    features['time'] = train['time']
    features['latitude'] = train['latitude']
    features['longitude'] = train['longitude']
    # features['latitude_port'] = train['latitude_port']
    # features['longitude_port'] = train['longitude_port']    
    features['cog'] = train['cog']
    # features['sog'] = train['sog']

    # Create a new binary column for 'under_way' based on the navstat column in train. 
    # 1 if navstat is 0 or 8, 0 otherwise
    features['under_way'] = train['navstat'].isin([0, 8]).astype(int)

    # Adding the columns latitude_1 (last seen latitude) and longitude_1 (last senn longitude)
    # to the features from train. For each vessel
    # Also adding time_1 (time difference from current time to last seen time)
    features['latitude_1'] = train.groupby('vesselId')['latitude'].shift(1)
    features['longitude_1'] = train.groupby('vesselId')['longitude'].shift(1)
    features['time_1'] = train.groupby('vesselId')['time'].diff().dt.total_seconds()

    # Adding the columns latitude_2 (second last seen latitude) and longitude_2 (second last seen longitude)
    # to the features from train. For each vessel
    # Also adding time_2 (time difference from current time to second last seen time)
    features['latitude_2'] = train.groupby('vesselId')['latitude'].shift(2)
    features['longitude_2'] = train.groupby('vesselId')['longitude'].shift(2)
    features['time_2'] = train.groupby('vesselId')['time'].diff(2).dt.total_seconds()

    # Adding the columns latitude_3 (third last seen latitude) and longitude_3 (third last seen longitude)
    # to the features from train. For each vessel
    # Also adding time_3 (time difference from current time to third last seen time)
    features['latitude_3'] = train.groupby('vesselId')['latitude'].shift(3)
    features['longitude_3'] = train.groupby('vesselId')['longitude'].shift(3)
    features['time_3'] = train.groupby('vesselId')['time'].diff(3).dt.total_seconds()


    # Dropping the rows with NaN values
    features = features.dropna()

    # Split the time column into month, day, hour, minute and second columns
    features['month'] = train['time'].dt.month
    features['day'] = train['time'].dt.day
    features['hour'] = train['time'].dt.hour
    features['minute'] = train['time'].dt.minute
    features['second'] = train['time'].dt.second

    # features['favorite_port'] = train['favorite_port']  
    # features['vessel_deep_sea'] = train['vessel_deep_sea']
    # features['shippingLineId'] = train['shippingLineId']
    features['length'] = train['length']
    # features['portId'] = train['portId']
    
    # features['distance_to_port'] = haversine(features['latitude'], features['longitude'], features['latitude_port'], features['longitude_port'])
    # # Group by vesselId and time to ensure the features are calculated correctly
    # features = features.groupby(['vesselId', 'time']).first().reset_index()
    # features['rolling_distance_mean'] = features.groupby('vesselId')['distance_to_port'].transform(lambda x: x.rolling(window=6, min_periods=1).mean())
    # features['rolling_distance_sum'] = features.groupby('vesselId')['distance_to_port'].transform(lambda x: x.rolling(window=6, min_periods=1).sum())

    

    # # Create a flag if the rolling distance has been increasing over the last 6 values
    # features['rolling_distance_increasing'] = features.groupby('vesselId')['rolling_distance_mean'].transform(lambda x: x.rolling(window=6).apply(lambda y: all(y[i] < y[i+1] for i in range(len(y)-1)), raw=True)).fillna(0).astype(int)
    # # features.drop('time', axis=1, inplace=True)
    # # # Create rolling window features for distance to port


    features.drop('time', axis=1, inplace=True)

    return features

features = feature_engineering(train)
print(features.head())
print(features.shape)


     vesselId  latitude  longitude    cog  under_way  latitude_1  longitude_1  \
439        87  51.46019    2.69617  329.1          0    51.45925      2.69635   
442        21  18.94058  -66.48705  318.6          1    18.91427    -66.46253   
443        28  55.46484   14.91138   60.9          1    55.44269     14.84121   
445        82  40.63020   18.28014  316.8          1    40.59254     18.32469   
449        25  45.55134   13.73574  267.7          0    45.55135     13.73573   

     time_1  latitude_2  longitude_2  time_2  latitude_3  longitude_3  time_3  \
439   357.0    51.45980      2.69436  2342.0    51.45926      2.69441  2703.0   
442   600.0    18.85826    -66.41107  1880.0    18.80632    -66.36373  3061.0   
443   588.0    55.39951     14.71121  1680.0    55.33025     14.57044  3060.0   
445  1008.0    40.56030     18.34426  1708.0    40.50311     18.37914  2969.0   
449   904.0    45.55136     13.73578  2520.0    45.55133     13.73575  3418.0   

     month  day  hour  min

In [99]:
features.head()

Unnamed: 0,vesselId,latitude,longitude,cog,under_way,latitude_1,longitude_1,time_1,latitude_2,longitude_2,time_2,latitude_3,longitude_3,time_3,month,day,hour,minute,second,length
439,87,51.46019,2.69617,329.1,0,51.45925,2.69635,357.0,51.4598,2.69436,2342.0,51.45926,2.69441,2703.0,1,1,1,0,47,211.0
442,21,18.94058,-66.48705,318.6,1,18.91427,-66.46253,600.0,18.85826,-66.41107,1880.0,18.80632,-66.36373,3061.0,1,1,1,3,54,182.0
443,28,55.46484,14.91138,60.9,1,55.44269,14.84121,588.0,55.39951,14.71121,1680.0,55.33025,14.57044,3060.0,1,1,1,4,33,187.0
445,82,40.6302,18.28014,316.8,1,40.59254,18.32469,1008.0,40.5603,18.34426,1708.0,40.50311,18.37914,2969.0,1,1,1,5,3,169.0
449,25,45.55134,13.73574,267.7,0,45.55135,13.73573,904.0,45.55136,13.73578,2520.0,45.55133,13.73575,3418.0,1,1,1,10,22,169.0


In [100]:
def find_last_features(features):
    # Assuming that the last row for every vesselId is the most recent
    last_features = features.groupby('vesselId').last().reset_index()
    return last_features

last_features = find_last_features(features)
print(last_features.head())
print(last_features.shape)

   vesselId  latitude  longitude    cog  under_way  latitude_1  longitude_1  \
0         0  34.57936  128.99926  221.5          1    34.59684    129.01917   
1         1   1.24460  103.39997  305.1          1     1.22186    103.46804   
2         2  18.13873  -69.74863  176.0          1    18.14185    -69.74807   
3         3  41.64055  143.29942   87.6          1    41.63254    141.92751   
4         4  26.58710  121.27831   39.1          1    26.54636    121.23948   

    time_1  latitude_2  longitude_2   time_2  latitude_3  longitude_3  \
0    335.0    34.65578    129.09349   1535.0    34.71258    129.17443   
1   1007.0     1.20238    103.55412   2220.0     1.18040    103.64275   
2   1250.0    18.14458    -69.74653   2470.0    18.14640    -69.74353   
3  14489.0    41.63043    141.86587  15126.0    41.62621    141.75177   
4    908.0    26.49491    121.19909   1970.0    26.34699    121.12455   

    time_3  month  day  hour  minute  second  length  
0   2795.0      5    1    12   

### Train the model

In [101]:
# # Define features and target
y = features[['latitude', 'longitude']]  
X = features.drop(columns=['latitude', 'longitude']) 

X.head()

Unnamed: 0,vesselId,cog,under_way,latitude_1,longitude_1,time_1,latitude_2,longitude_2,time_2,latitude_3,longitude_3,time_3,month,day,hour,minute,second,length
439,87,329.1,0,51.45925,2.69635,357.0,51.4598,2.69436,2342.0,51.45926,2.69441,2703.0,1,1,1,0,47,211.0
442,21,318.6,1,18.91427,-66.46253,600.0,18.85826,-66.41107,1880.0,18.80632,-66.36373,3061.0,1,1,1,3,54,182.0
443,28,60.9,1,55.44269,14.84121,588.0,55.39951,14.71121,1680.0,55.33025,14.57044,3060.0,1,1,1,4,33,187.0
445,82,316.8,1,40.59254,18.32469,1008.0,40.5603,18.34426,1708.0,40.50311,18.37914,2969.0,1,1,1,5,3,169.0
449,25,267.7,0,45.55135,13.73573,904.0,45.55136,13.73578,2520.0,45.55133,13.73575,3418.0,1,1,1,10,22,169.0


In [102]:
# Initialize the xgboost model
model = xgb.XGBRegressor()

# Fit the model
model.fit(X, y)

In [103]:
X.head()

Unnamed: 0,vesselId,cog,under_way,latitude_1,longitude_1,time_1,latitude_2,longitude_2,time_2,latitude_3,longitude_3,time_3,month,day,hour,minute,second,length
439,87,329.1,0,51.45925,2.69635,357.0,51.4598,2.69436,2342.0,51.45926,2.69441,2703.0,1,1,1,0,47,211.0
442,21,318.6,1,18.91427,-66.46253,600.0,18.85826,-66.41107,1880.0,18.80632,-66.36373,3061.0,1,1,1,3,54,182.0
443,28,60.9,1,55.44269,14.84121,588.0,55.39951,14.71121,1680.0,55.33025,14.57044,3060.0,1,1,1,4,33,187.0
445,82,316.8,1,40.59254,18.32469,1008.0,40.5603,18.34426,1708.0,40.50311,18.37914,2969.0,1,1,1,5,3,169.0
449,25,267.7,0,45.55135,13.73573,904.0,45.55136,13.73578,2520.0,45.55133,13.73575,3418.0,1,1,1,10,22,169.0


In [104]:
### Prepare test data for predictions
def prepare_test_for_predictions(test, last_features):
    test = test.copy()
    prepared_test = pd.DataFrame()

    # Create a time column in last test
    last_features['year'] = 2024
    print(last_features.columns)
    last_features['time'] = pd.to_datetime(
        last_features[['year', 'month', 'day', 'hour', 'minute', 'second']]
    )
    last_features =last_features.drop(columns=['longitude_3', 'latitude_3', 'year', 'month', 'day', 'hour', 'minute', 'second', 'time_3']).copy()
    print(last_features.columns)

    # Add the columns vesselId and time
    prepared_test['vesselId'] = test['vesselId']
    prepared_test['time'] = test['time']

    # For each vessel, add the last seen to the prepared test
    prepared_test = prepared_test.merge(last_features, on='vesselId', how='left', suffixes=('', '_last'))
    print(prepared_test)
    print(test.columns)

    # Add the empty columns latitude_3, longitude_3 and time_3 to the prepared test
    prepared_test['latitude_3'] = None
    prepared_test['longitude_3'] = None
    prepared_test['time_3'] = None

    # For each row, latitude_3 should be the same as latitude_2, longitude_3 should be the same as longitude_2
    # and time_3 should be the same as time_2
    prepared_test['latitude_3'] = prepared_test['latitude_2']
    prepared_test['longitude_3'] = prepared_test['longitude_2']
    prepared_test['time_3'] = prepared_test['time_2']

    # Same for latitude_2, longitude_2 and time_2
    prepared_test['latitude_2'] = prepared_test['latitude_1']
    prepared_test['longitude_2'] = prepared_test['longitude_1']
    prepared_test['time_2'] = prepared_test['time_1']

    # Same for latitude_1, longitude_1 and time_1
    prepared_test['latitude_1'] = prepared_test['latitude']
    prepared_test['longitude_1'] = prepared_test['longitude']

    # Time_1 should be the difference between the current time and the last seen time
    prepared_test['time_1'] = (prepared_test['time'] - prepared_test['time_last']).dt.total_seconds()

    # For each time column (time_2, time_3), add the rows value in time_1
    prepared_test['time_2'] += prepared_test['time_1']
    prepared_test['time_3'] += prepared_test['time_1']

    # Drop the the latitude and longitude columns
    prepared_test.drop(columns=['latitude', 'longitude'], inplace=True)
    print(prepared_test.shape)

    # Split the time column into month, day, hour, minute and second columns
    prepared_test['month'] = test['time'].dt.month
    prepared_test['day'] = test['time'].dt.day
    prepared_test['hour'] = test['time'].dt.hour
    prepared_test['minute'] = test['time'].dt.minute
    prepared_test['second'] = test['time'].dt.second
    

    prepared_test.drop('time', axis=1, inplace=True)
    prepared_test.drop('time_last', axis=1, inplace=True)

    return prepared_test


# Prepare test data for predictions
test_df = prepare_test_for_predictions(test, last_features)

test_df


Index(['vesselId', 'latitude', 'longitude', 'cog', 'under_way', 'latitude_1',
       'longitude_1', 'time_1', 'latitude_2', 'longitude_2', 'time_2',
       'latitude_3', 'longitude_3', 'time_3', 'month', 'day', 'hour', 'minute',
       'second', 'length', 'year'],
      dtype='object')
Index(['vesselId', 'latitude', 'longitude', 'cog', 'under_way', 'latitude_1',
       'longitude_1', 'time_1', 'latitude_2', 'longitude_2', 'time_2',
       'length', 'time'],
      dtype='object')
       vesselId                time  latitude  longitude    cog  under_way  \
0           412 2024-05-08 00:03:16  31.14647  -81.49789  179.6          0   
1           373 2024-05-08 00:06:17  14.81694  120.29625   24.7          0   
2           181 2024-05-08 00:10:02  38.27895   10.78280    8.0          1   
3             8 2024-05-08 00:10:34 -43.53785  172.83522  321.3          0   
4            65 2024-05-08 00:12:27  48.53320   -6.12003  291.0          0   
...         ...                 ...       ...   

Unnamed: 0,vesselId,cog,under_way,latitude_1,longitude_1,time_1,latitude_2,longitude_2,time_2,length,latitude_3,longitude_3,time_3,month,day,hour,minute,second
0,412,179.6,0,31.14647,-81.49789,900.0,31.14648,-81.49789,2156.0,230.00,31.14648,-81.49789,2880.0,5,8,0,3,16
1,373,24.7,0,14.81694,120.29625,541.0,14.81694,120.29624,2303.0,124.00,14.81688,120.29630,4107.0,5,8,0,6,17
2,181,8.0,1,38.27895,10.78280,654.0,38.14875,10.75635,2160.0,186.00,36.81120,10.29855,31943.0,5,8,0,10,2
3,8,321.3,0,-43.53785,172.83522,1080.0,-43.53815,172.83516,1980.0,183.00,-43.53800,172.83608,3420.0,5,8,0,10,34
4,65,291.0,0,48.53320,-6.12003,1258.0,48.53133,-6.10750,3231.0,182.00,48.53133,-6.10695,3269.0,5,8,0,12,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51734,244,5.7,1,26.22892,-79.40341,1304350.0,26.15166,-79.41257,1305424.0,199.00,26.13810,-79.41431,1305611.0,5,12,23,59,58
51735,3,87.6,1,41.64055,143.29942,1420233.0,41.63254,141.92751,1434722.0,199.00,41.63043,141.86587,1435359.0,5,12,23,59,58
51736,30,76.9,1,35.33234,142.69115,1226550.0,35.30791,142.56395,1228002.0,199.97,35.29223,142.47785,1228975.0,5,12,23,59,58
51737,36,232.0,1,59.26571,21.98971,448658.0,59.41452,22.36688,451130.0,218.00,59.45172,22.46184,451730.0,5,12,23,59,58


In [105]:
expected_feature_names = model.get_booster().feature_names

# Ensure the DataFrame has all the required columns and in the correct order
test_df = test_df[expected_feature_names]

# Print the final DataFrame structure to ensure it's correct
print("Final test_df structure:")
# test_df = test_df.drop(columns=['vesselId'], errors='ignore')
print(test_df.head())

predictions = model.predict(test_df)

# Display predictions
print(predictions)

Final test_df structure:
   vesselId    cog  under_way  latitude_1  longitude_1  time_1  latitude_2  \
0       412  179.6          0    31.14647    -81.49789   900.0    31.14648   
1       373   24.7          0    14.81694    120.29625   541.0    14.81694   
2       181    8.0          1    38.27895     10.78280   654.0    38.14875   
3         8  321.3          0   -43.53785    172.83522  1080.0   -43.53815   
4        65  291.0          0    48.53320     -6.12003  1258.0    48.53133   

   longitude_2  time_2  latitude_3  longitude_3   time_3  month  day  hour  \
0    -81.49789  2156.0    31.14648    -81.49789   2880.0      5    8     0   
1    120.29624  2303.0    14.81688    120.29630   4107.0      5    8     0   
2     10.75635  2160.0    36.81120     10.29855  31943.0      5    8     0   
3    172.83516  1980.0   -43.53800    172.83608   3420.0      5    8     0   
4     -6.10750  3231.0    48.53133     -6.10695   3269.0      5    8     0   

   minute  second  length  
0       3

In [106]:
# Create a DataFrame with the required format
predictions_df = pd.DataFrame(predictions, columns=['latitude_predicted', 'longitude_predicted'])
predictions_df['ID'] = range(len(predictions_df))
predictions_df = predictions_df[['ID', 'longitude_predicted', 'latitude_predicted']]

# Save to CSV
# predictions_df.to_csv('data/submissions/predictions_5.csv', index=False)
predictions_df.to_csv('data/submissions/predictions_6.csv', index=False)