What is planned to be done in this notebook: 

- Maybe redefine longitude and latitude to remove shifts from 0 to 160 when moving between the edge case longitudes and latitudes.

-  add lag of 3 

-  Create features for “Under way” and “Not under way”
    - Under way: navstat 0 and 8 
    - Not under way: navstat 1 and 5


- Add cartesian coordinates instead of lat and lon 
  - need to remember that they have been transformed later on 

- Add coordinates based on portid, possibly detect if port is in the right direction 

In [205]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split


In [206]:
train = pd.read_csv('data/datasets/ais_train.csv', sep='|')
train.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3


Redefine longitude and latitude to remove shifts from 0 to 160 when moving between the edge case longitudes and latitudes.
   - range will now be between 0-360

In [207]:
def redefine_coordinates(df):
    df['longitude'] = df['longitude'].apply(lambda x: x if x >= 0 else x + 360)
    df['latitude'] = df['latitude'].apply(lambda x: x if x >= 0 else x + 180)
    return df

train = redefine_coordinates(train)
train.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,145.2563,302.1487,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,280.52061,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,283.52433,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,145.58811,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,354.08364,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3


In [208]:
def missing_heading(df):
    # Step 1: Replace 511 with NaN in 'heading' column to mark missing values
    df['heading'] = df['heading'].replace(511, np.nan)

    
    return df

# Assuming 'train' DataFrame has been loaded
train = missing_heading(train)
train.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
0,2024-01-01 00:00:25,284.0,0.7,0,88.0,0,01-09 23:00,145.2563,302.1487,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f
1,2024-01-01 00:00:36,109.6,0.0,-6,347.0,1,12-29 20:00,8.8944,280.52061,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689
2,2024-01-01 00:01:45,111.0,11.0,0,112.0,0,01-02 09:00,39.19065,283.52433,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19
3,2024-01-01 00:03:11,96.4,0.0,0,142.0,1,12-31 20:00,145.58811,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126
4,2024-01-01 00:03:51,214.0,19.7,0,215.0,0,01-25 12:00,35.88379,354.08364,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3


In [209]:
def change_time_attr(df):

    df['time'] = pd.to_datetime(df['time'])

    df['month'] = train['time'].dt.month
    df['day'] = train['time'].dt.day
    df['hour'] = train['time'].dt.hour
    df['minute'] = train['time'].dt.minute
    df['second'] = train['time'].dt.second

    return df

change_time_attr(train)
print(train.head())

                 time    cog   sog  rot  heading  navstat       etaRaw  \
0 2024-01-01 00:00:25  284.0   0.7    0     88.0        0  01-09 23:00   
1 2024-01-01 00:00:36  109.6   0.0   -6    347.0        1  12-29 20:00   
2 2024-01-01 00:01:45  111.0  11.0    0    112.0        0  01-02 09:00   
3 2024-01-01 00:03:11   96.4   0.0    0    142.0        1  12-31 20:00   
4 2024-01-01 00:03:51  214.0  19.7    0    215.0        0  01-25 12:00   

    latitude  longitude                  vesselId                    portId  \
0  145.25630  302.14870  61e9f3a8b937134a3c4bfdf7  61d371c43aeaecc07011a37f   
1    8.89440  280.52061  61e9f3d4b937134a3c4bff1f  634c4de270937fc01c3a7689   
2   39.19065  283.52433  61e9f436b937134a3c4c0131  61d3847bb7b7526e1adf3d19   
3  145.58811  151.02067  61e9f3b4b937134a3c4bfe77  61d36f770a1807568ff9a126   
4   35.88379  354.08364  61e9f41bb937134a3c4c0087  634c4de270937fc01c3a74f3   

   month  day  hour  minute  second  
0      1    1     0       0      25  
1   

Add lag of 3 - change time feature 
   - each row will now contain information about where the vessel has been the last hour 

In [210]:
def add_lat_lon_lag(df, lag_steps=3):
    # Ensure the DataFrame is sorted by 'vesselId' and 'time'
    df = df.sort_values(by=['vesselId', 'time'])

    # Create lagged features for 'latitude' and 'longitude' within each vesselId group
    for vessel_id, group in df.groupby('vesselId'):
        for lag in range(1, lag_steps + 1):
            df.loc[group.index, f'latitude_lag_{lag}'] = group['latitude'].shift(lag)
            df.loc[group.index, f'longitude_lag_{lag}'] = group['longitude'].shift(lag)

    return df

train = add_lat_lon_lag(train)
train.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,...,day,hour,minute,second,latitude_lag_1,longitude_lag_1,latitude_lag_2,longitude_lag_2,latitude_lag_3,longitude_lag_3
131115,2024-01-12 14:07:47,308.1,17.1,-6,316.0,0,01-08 06:00,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,...,12,14,7,47,,,,,,
131279,2024-01-12 14:31:00,307.6,17.3,5,313.0,0,01-14 23:30,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,...,12,14,31,0,7.50361,77.5834,,,,
131514,2024-01-12 14:57:23,306.8,16.9,5,312.0,0,01-14 23:30,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,...,12,14,57,23,7.57302,77.49505,7.50361,77.5834,,
131696,2024-01-12 15:18:48,307.9,16.9,6,313.0,0,01-14 23:30,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,...,12,15,18,48,7.65043,77.39404,7.57302,77.49505,7.50361,77.5834
131885,2024-01-12 15:39:47,307.0,16.3,7,313.0,0,01-14 23:30,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,...,12,15,39,47,7.71275,77.31394,7.65043,77.39404,7.57302,77.49505


In [211]:
def under_way(df): 

    df['under_way'] = train['navstat'].isin([0, 8]).astype(int)

    return df 

under_way(train)
print(train)

                       time    cog   sog  rot  heading  navstat       etaRaw  \
131115  2024-01-12 14:07:47  308.1  17.1   -6    316.0        0  01-08 06:00   
131279  2024-01-12 14:31:00  307.6  17.3    5    313.0        0  01-14 23:30   
131514  2024-01-12 14:57:23  306.8  16.9    5    312.0        0  01-14 23:30   
131696  2024-01-12 15:18:48  307.9  16.9    6    313.0        0  01-14 23:30   
131885  2024-01-12 15:39:47  307.0  16.3    7    313.0        0  01-14 23:30   
...                     ...    ...   ...  ...      ...      ...          ...   
1521244 2024-05-07 22:36:16  324.1  13.5   -2    325.0        0  05-08 03:00   
1521409 2024-05-07 22:57:05  324.2  13.3   -3    326.0        0  05-08 03:00   
1521625 2024-05-07 23:17:54  356.5  12.2   -1    354.0        0  05-08 03:00   
1521821 2024-05-07 23:38:13   52.6  17.3    3     50.0        0  05-08 03:00   
1522014 2024-05-07 23:59:01   53.6  17.7   -1     51.0        0  05-08 03:00   

         latitude  longitude           

In [212]:
ports = pd.read_csv('data/datasets/ports.csv', sep='|')
ports.head()

Unnamed: 0,portId,name,portLocation,longitude,latitude,UN_LOCODE,countryName,ISO
0,61d36ed80a1807568ff9a064,Port of Algiers,Algiers,3.067222,36.773611,DZALG,Algeria,DZ
1,61d36ed80a1807568ff9a065,Port of Annaba,Annaba,7.7725,36.900556,DZAAE,Algeria,DZ
2,61d36edf0a1807568ff9a070,Port of Oran,Oran,-0.639722,35.712222,DZORN,Algeria,DZ
3,61d36ee00a1807568ff9a072,Port of Skikda,Skikda,6.905833,36.8875,DZSKI,Algeria,DZ
4,61d36ee10a1807568ff9a074,Port of Pago-Pago,Pago-Pago,-170.690556,-14.274167,ASPPG,American Samoa,AS


In [213]:

train = train.merge(ports[['portId', 'latitude', 'longitude']], how='left', left_on='portId', right_on='portId', suffixes=('', '_port'))
print(train.head())

                 time    cog   sog  rot  heading  navstat       etaRaw  \
0 2024-01-12 14:07:47  308.1  17.1   -6    316.0        0  01-08 06:00   
1 2024-01-12 14:31:00  307.6  17.3    5    313.0        0  01-14 23:30   
2 2024-01-12 14:57:23  306.8  16.9    5    312.0        0  01-14 23:30   
3 2024-01-12 15:18:48  307.9  16.9    6    313.0        0  01-14 23:30   
4 2024-01-12 15:39:47  307.0  16.3    7    313.0        0  01-14 23:30   

   latitude  longitude                  vesselId  ... second  latitude_lag_1  \
0   7.50361   77.58340  61e9f38eb937134a3c4bfd8b  ...     47             NaN   
1   7.57302   77.49505  61e9f38eb937134a3c4bfd8b  ...      0         7.50361   
2   7.65043   77.39404  61e9f38eb937134a3c4bfd8b  ...     23         7.57302   
3   7.71275   77.31394  61e9f38eb937134a3c4bfd8b  ...     48         7.65043   
4   7.77191   77.23585  61e9f38eb937134a3c4bfd8b  ...     47         7.71275   

   longitude_lag_1  latitude_lag_2  longitude_lag_2  latitude_lag_3  \
0  

In [214]:
def calculate_bearing(lat1, lon1, lat2, lon2):
    # Convert lat/lon to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    dlon = lon2 - lon1
    x = np.sin(dlon) * np.cos(lat2)
    y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
    
    bearing = np.degrees(np.arctan2(x, y))
    return (bearing + 360) % 360  # Normalize bearing to 0-360 degrees

def add_bearing_to_port(df):
    # Calculate the bearing from the ship's current location to the port's location
    df['bearing_to_port'] = df.apply(lambda row: calculate_bearing(row['latitude'], row['longitude'], row['latitude_port'], row['longitude_port']), axis=1)
    
    # Optionally, compare with the ship's current heading if available
    df['correct_direction'] = df['heading'] - df['bearing_to_port']  # Assuming 'heading' column exists

    return df

train = add_bearing_to_port(train)
train.head()


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,...,longitude_lag_1,latitude_lag_2,longitude_lag_2,latitude_lag_3,longitude_lag_3,under_way,latitude_port,longitude_port,bearing_to_port,correct_direction
0,2024-01-12 14:07:47,308.1,17.1,-6,316.0,0,01-08 06:00,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,...,,,,,,1,13.263333,80.341111,24.982842,291.017158
1,2024-01-12 14:31:00,307.6,17.3,5,313.0,0,01-14 23:30,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,...,77.5834,,,,,1,18.941944,72.885278,338.951334,-25.951334
2,2024-01-12 14:57:23,306.8,16.9,5,312.0,0,01-14 23:30,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,...,77.49505,7.50361,77.5834,,,1,18.941944,72.885278,339.243715,-27.243715
3,2024-01-12 15:18:48,307.9,16.9,6,313.0,0,01-14 23:30,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,...,77.39404,7.57302,77.49505,7.50361,77.5834,1,18.941944,72.885278,339.477726,-26.477726
4,2024-01-12 15:39:47,307.0,16.3,7,313.0,0,01-14 23:30,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,...,77.31394,7.65043,77.39404,7.57302,77.49505,1,18.941944,72.885278,339.711702,-26.711702


In [215]:
def lat_lon_to_cartesian(df):
    # Assuming latitudes and longitudes are in degrees
    df['latitude_rad'] = np.radians(df['latitude'])
    df['longitude_rad'] = np.radians(df['longitude'])

    # Earth's radius in kilometers (approx.)
    R = 6371.0

    # Convert to Cartesian coordinates
    df['x'] = R * np.cos(df['latitude_rad']) * np.cos(df['longitude_rad'])
    df['y'] = R * np.cos(df['latitude_rad']) * np.sin(df['longitude_rad'])
    df['z'] = R * np.sin(df['latitude_rad'])

    df = df.drop(columns=['latitude', 'longitude'])

    return df

train = lat_lon_to_cartesian(train)
train.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,vesselId,portId,month,...,under_way,latitude_port,longitude_port,bearing_to_port,correct_direction,latitude_rad,longitude_rad,x,y,z
0,2024-01-12 14:07:47,308.1,17.1,-6,316.0,0,01-08 06:00,61e9f38eb937134a3c4bfd8b,61d376b393c6feb83e5eb50c,1,...,1,13.263333,80.341111,24.982842,291.017158,0.130963,1.354086,1358.150694,6168.701354,831.980349
1,2024-01-12 14:31:00,307.6,17.3,5,313.0,0,01-14 23:30,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,1,...,1,18.941944,72.885278,338.951334,-25.951334,0.132174,1.352544,1367.441968,6165.611251,839.631684
2,2024-01-12 14:57:23,306.8,16.9,5,312.0,0,01-14 23:30,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,1,...,1,18.941944,72.885278,339.243715,-27.243715,0.133525,1.350781,1378.060712,6162.078258,848.163436
3,2024-01-12 15:18:48,307.9,16.9,6,313.0,0,01-14 23:30,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,1,...,1,18.941944,72.885278,339.477726,-26.477726,0.134613,1.349383,1386.470584,6159.242035,855.030918
4,2024-01-12 15:39:47,307.0,16.3,7,313.0,0,01-14 23:30,61e9f38eb937134a3c4bfd8b,61d376d893c6feb83e5eb546,1,...,1,18.941944,72.885278,339.711702,-26.711702,0.135645,1.34802,1394.668095,6156.48234,861.549242


In [216]:
def time_since_seen(df):
    # Ensure time is a datetime object
    df['time'] = pd.to_datetime(df['time'], errors='coerce')

    # Calculate the time difference in seconds
    df['time_since_last_seen'] = df.groupby('vesselId')['time'].diff().dt.total_seconds()

    # Fill NaN values with 0 or a suitable value (e.g., the last known timestamp, etc.)
    df['time_since_last_seen'].fillna(0, inplace=True)  # or you could choose to fill with some other value

    return df


def factorize_ids(df):
    unique_vessels = df['vesselId'].unique()
    vesselID_mapping = {vessel: idx for idx, vessel in enumerate(unique_vessels)}
    df['vesselId'] = df['vesselId'].map(vesselID_mapping)

    # Create mappings for portId
    unique_ports = df['portId'].unique()
    portID_mapping = {port: idx for idx, port in enumerate(unique_ports)}
    df['portId'] = df['portId'].map(portID_mapping)

    # Reset the index
    df.reset_index(drop=True, inplace=True)
   
    return df, vesselID_mapping, portID_mapping



In [217]:
def add_features(df):

    df = missing_heading(df)
    df = change_time_attr(df)
    df = add_lat_lon_lag(df)
    df = under_way(df)
    df = df.merge(ports[['portId', 'latitude', 'longitude']], how='left', on='portId', suffixes=('', '_port'))
    df = add_bearing_to_port(df)
    df = time_since_seen(df)
    df, vesselID_mapping, portID_mapping = factorize_ids(df)  # Add factorization here

    return df, vesselID_mapping, portID_mapping

    return df

train2 = pd.read_csv('data/datasets/ais_train.csv', sep='|')
train2, vesselID_mapping, portID_mapping = add_features(train2)
print(train2.head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time_since_last_seen'].fillna(0, inplace=True)  # or you could choose to fill with some other value


                 time    cog   sog  rot  heading  navstat       etaRaw  \
0 2024-01-12 14:07:47  308.1  17.1   -6    316.0        0  01-08 06:00   
1 2024-01-12 14:31:00  307.6  17.3    5    313.0        0  01-14 23:30   
2 2024-01-12 14:57:23  306.8  16.9    5    312.0        0  01-14 23:30   
3 2024-01-12 15:18:48  307.9  16.9    6    313.0        0  01-14 23:30   
4 2024-01-12 15:39:47  307.0  16.3    7    313.0        0  01-14 23:30   

   latitude  longitude  vesselId  ...  latitude_lag_2  longitude_lag_2  \
0   7.50361   77.58340         0  ...             NaN              NaN   
1   7.57302   77.49505         0  ...             NaN              NaN   
2   7.65043   77.39404         0  ...         7.50361         77.58340   
3   7.71275   77.31394         0  ...         7.57302         77.49505   
4   7.77191   77.23585         0  ...         7.65043         77.39404   

   latitude_lag_3  longitude_lag_3  under_way  latitude_port  longitude_port  \
0             NaN             

In [218]:
X = train2.drop(columns=['latitude', 'longitude', 'time', 'etaRaw', 'heading'])  # Keep 'vesselId' in the features
y_lat = train2['latitude']  # Target for latitude
y_lon = train2['longitude']  # Target for longitude

# Splitting the dataset
X_train, X_test, y_train_lat, y_test_lat = train_test_split(X, y_lat, test_size=0.2, random_state=42)
X_train, X_test, y_train_lon, y_test_lon = train_test_split(X, y_lon, test_size=0.2, random_state=42)

# Drop rows with NaN values in the lagged features
train2 = train2.dropna(subset=[f'latitude_lag_{lag}' for lag in range(1, 3 + 1)] +
                     [f'longitude_lag_{lag}' for lag in range(1, 3 + 1)])

print(X_train.shape)
print(y_train_lon.shape)


# Training the model for latitude
lat_model = xgb.XGBRegressor(random_state=42)
lat_model.fit(X_train, y_train_lat)

# Training the model for longitude
lon_model = xgb.XGBRegressor(random_state=42)
lon_model.fit(X_train, y_train_lon)


(1217652, 23)
(1217652,)


In [219]:
def last_observed(df):
    last_obs = df.groupby('vesselId').last().reset_index()
    return last_obs

print(train2)

                       time    cog   sog  rot  heading  navstat       etaRaw  \
3       2024-01-12 15:18:48  307.9  16.9    6    313.0        0  01-14 23:30   
4       2024-01-12 15:39:47  307.0  16.3    7    313.0        0  01-14 23:30   
5       2024-01-12 15:54:48  307.6  16.1    5    313.0        0  01-14 23:30   
6       2024-01-12 16:14:59  309.5  16.1   -6    313.0        0  01-14 23:30   
7       2024-01-12 16:35:24  308.7  16.0    2    311.0        0  01-14 23:30   
...                     ...    ...   ...  ...      ...      ...          ...   
1522060 2024-05-07 22:36:16  324.1  13.5   -2    325.0        0  05-08 03:00   
1522061 2024-05-07 22:57:05  324.2  13.3   -3    326.0        0  05-08 03:00   
1522062 2024-05-07 23:17:54  356.5  12.2   -1    354.0        0  05-08 03:00   
1522063 2024-05-07 23:38:13   52.6  17.3    3     50.0        0  05-08 03:00   
1522064 2024-05-07 23:59:01   53.6  17.7   -1     51.0        0  05-08 03:00   

         latitude  longitude  vesselId 

In [220]:
print(vesselID_mapping)


{'61e9f38eb937134a3c4bfd8b': 0, '61e9f38eb937134a3c4bfd8d': 1, '61e9f38eb937134a3c4bfd8f': 2, '61e9f38eb937134a3c4bfd91': 3, '61e9f390b937134a3c4bfd93': 4, '61e9f391b937134a3c4bfd95': 5, '61e9f391b937134a3c4bfd97': 6, '61e9f392b937134a3c4bfd99': 7, '61e9f392b937134a3c4bfd9b': 8, '61e9f393b937134a3c4bfd9d': 9, '61e9f393b937134a3c4bfd9f': 10, '61e9f393b937134a3c4bfda1': 11, '61e9f394b937134a3c4bfda3': 12, '61e9f394b937134a3c4bfda5': 13, '61e9f396b937134a3c4bfda9': 14, '61e9f396b937134a3c4bfdab': 15, '61e9f396b937134a3c4bfdad': 16, '61e9f397b937134a3c4bfdaf': 17, '61e9f398b937134a3c4bfdb1': 18, '61e9f398b937134a3c4bfdb3': 19, '61e9f399b937134a3c4bfdb5': 20, '61e9f399b937134a3c4bfdb7': 21, '61e9f39ab937134a3c4bfdb9': 22, '61e9f39ab937134a3c4bfdbb': 23, '61e9f39bb937134a3c4bfdbd': 24, '61e9f39cb937134a3c4bfdbf': 25, '61e9f39cb937134a3c4bfdc1': 26, '61e9f39cb937134a3c4bfdc3': 27, '61e9f39db937134a3c4bfdc5': 28, '61e9f39db937134a3c4bfdc7': 29, '61e9f39eb937134a3c4bfdc9': 30, '61e9f39eb937134a

In [221]:


# Function to prepare features and make predictions
def preporcess_test(new_data, previous_data=None, vesselID_mapping=None, portID_mapping=None):
    # Step 1: Prepare the DataFrame with default values
    new_data['cog'] = 0.0
    new_data['sog'] = 0.0
    new_data['rot'] = 0.0
    new_data['heading'] = 0.0
    new_data['navstat'] = 0  # Assuming navstat is an integer
    new_data['latitude_lag_1'] = 0.0
    new_data['longitude_lag_1'] = 0.0
    new_data['latitude_lag_2'] = 0.0
    new_data['longitude_lag_2'] = 0.0
    new_data['latitude_lag_3'] = 0.0
    new_data['longitude_lag_3'] = 0.0
    new_data['under_way'] = 0
    new_data['latitude_port'] = 0.0
    new_data['longitude_port'] = 0.0
    new_data['bearing_to_port'] = 0.0
    new_data['correct_direction'] = 0 
    new_data['portId'] = 0 
    new_data['latitude_port'] = 0  # Assuming port coordinates can be None
    new_data['longitude_port'] = 0
    new_data['time_since_last_seen'] = 0# Default for portId (assuming it's an integer)

    # Step 2: If historical data is available, use it to fill in values
    if previous_data is not None:
        for index, row in new_data.iterrows():
            vessel_id = row['vesselId']
            # Find the last entry for this vesselId in the previous data
            vessel_prev_data = previous_data[previous_data['vesselId'] == vessel_id]
            if not vessel_prev_data.empty:
                last_row = vessel_prev_data.iloc[-1]
                new_data.at[index, 'cog'] = last_row.get('cog', 0.0)  # Last known cog or default
                new_data.at[index, 'sog'] = last_row.get('sog', 0.0)  # Last known sog or default
                new_data.at[index, 'rot'] = last_row.get('rot', 0.0)  # Last known rot or default
                new_data.at[index, 'heading'] = last_row.get('heading', 0.0)  # Last known heading or default
                new_data.at[index, 'navstat'] = last_row.get('navstat', 0)  # Last known navstat or default
                new_data.at[index, 'latitude'] = last_row.get('latitude', 0.0)  # Last known latitude or default
                new_data.at[index, 'longitude'] = last_row.get('longitude', 0.0)  # Last known longitude or default
                new_data.at[index, 'portId'] = last_row.get('portId', 0)  # Last known portId or default

        
                new_data.at[index, 'latitude_lag_1'] = last_row.get('latitude_lag_1', 0.0)  # Last known latitude_lag_1 or default
                new_data.at[index, 'longitude_lag_1'] = last_row.get('longitude_lag_1', 0.0)  # Last known longitude_lag_1 or default
                new_data.at[index, 'latitude_lag_2'] = last_row.get('latitude_lag_2', 0.0)  # Last known latitude_lag_2 or default
                new_data.at[index, 'longitude_lag_2'] = last_row.get('longitude_lag_2', 0.0)  # Last known longitude_lag_2 or default
                new_data.at[index, 'latitude_lag_3'] = last_row.get('latitude_lag_3', 0.0)  # Last known latitude_lag_3 or default
                new_data.at[index, 'longitude_lag_3'] = last_row.get('longitude_lag_3', 0.0)  # Last known longitude_lag_3 or default
                new_data.at[index, 'under_way'] = last_row.get('under_way', 0)  # Last known under_way or default
                new_data.at[index, 'latitude_port'] = last_row.get('latitude_port', 0.0)  # Last known latitude_port or default
                new_data.at[index, 'longitude_port'] = last_row.get('longitude_port', 0.0)  # Last known longitude_port or default
                new_data.at[index, 'bearing_to_port'] = last_row.get('bearing_to_port', 0.0)  # Last known bearing_to_port or default
                new_data.at[index, 'correct_direction'] = last_row.get('correct_direction', 0)
                new_data.at[index, 'time_since_last_seen'] = last_row.get('time_since_last_seen', 0)  # Last known correct_direction or default
                

 # Assuming this function is defined

    if vesselID_mapping is not None:
    # Ensure vesselID_mapping is a dictionary
        if isinstance(vesselID_mapping, pd.Index):
            vesselID_mapping = vesselID_mapping.to_series().to_dict()
        
        # Apply mapping
        new_data['vesselId'] = new_data['vesselId'].map(vesselID_mapping)


    new_data = change_time_attr(new_data)
    new_data = time_since_seen(new_data)
    # Step 4: Prepare input features for predictions
    new_data = new_data.drop(columns=['ID', 'scaling_factor', 'latitude', 'longitude', 'time', 'etaRaw', 'heading'], errors='ignore')  # Drop irrelevant columns, ignore errors if columns are missing

    return new_data  # Debugging: print out columns to verify


# Load your new data here
new_data = pd.read_csv('data/datasets/ais_test.csv')  # Replace with your actual test data filename
previous_data = train2  # Load previous training data for defaults
test = preporcess_test(new_data, previous_data, vesselID_mapping, portID_mapping)
print(test.head())





   vesselId  cog  sog  rot  navstat  latitude_lag_1  longitude_lag_1  \
0        84  0.0  0.0  0.0        0             0.0              0.0   
1       623  0.0  0.0  0.0        0             0.0              0.0   
2       596  0.0  0.0  0.0        0             0.0              0.0   
3       542  0.0  0.0  0.0        0             0.0              0.0   
4         1  0.0  0.0  0.0        0             0.0              0.0   

   latitude_lag_2  longitude_lag_2  latitude_lag_3  ...  longitude_port  \
0             0.0              0.0             0.0  ...               0   
1             0.0              0.0             0.0  ...               0   
2             0.0              0.0             0.0  ...               0   
3             0.0              0.0             0.0  ...               0   
4             0.0              0.0             0.0  ...               0   

   bearing_to_port  correct_direction  portId  time_since_last_seen  month  \
0              0.0                  0 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time_since_last_seen'].fillna(0, inplace=True)  # or you could choose to fill with some other value


In [222]:
def predict_lat_lon(df):
    predicted_lat = lat_model.predict(df)  # Assuming lat_model is your trained model
    predicted_lon = lon_model.predict(df)  # Assuming lon_model is your trained model

    
    predictions = pd.DataFrame({
        'ID': range(len(predicted_lat)), 
        'longitude_predicted': predicted_lon,  # Creating an ID column starting from 0
        'latitude_predicted': predicted_lat,
    })

    return predictions

expected_columns = ['cog', 'sog', 'rot', 'navstat', 'vesselId', 'portId', 'month', 'day', 'hour', 'minute', 'second', 'latitude_lag_1', 'longitude_lag_1', 'latitude_lag_2', 'longitude_lag_2', 'latitude_lag_3', 'longitude_lag_3', 'under_way', 'latitude_port', 'longitude_port', 'bearing_to_port', 'correct_direction', 'time_since_last_seen']

# Reorder new_data to match expected feature names
test_pred = test[expected_columns]
predictions = predict_lat_lon(test_pred)
print(predictions.head())

predictions.to_csv('predictions_1.csv', index=False)
print(len(predictions))

   ID  longitude_predicted  latitude_predicted
0   0            -0.906657           -4.367911
1   1            -0.874205           -4.330830
2   2            -0.874205           -4.364646
3   3            -0.896785           -4.364646
4   4            -0.898703           -3.111183
51739
