In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV
import matplotlib.pyplot as plt





In [3]:
train = pd.read_csv('data/datasets/ais_train.csv', sep='|')
train['time'] = pd.to_datetime(train['time'])
train.info()
missing_values = train.isnull().sum()
print("Number of missing values in each column:\n", missing_values)


train.head()

test = pd.read_csv('data/datasets/ais_test.csv', sep=',')
test['time'] = pd.to_datetime(test['time'])
test.head()

vessels = pd.read_csv('data/datasets/vessels.csv', sep='|')
vessels.head()

ports = pd.read_csv('data/datasets/ports.csv', sep='|')
ports.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522065 entries, 0 to 1522064
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   time       1522065 non-null  datetime64[ns]
 1   cog        1522065 non-null  float64       
 2   sog        1522065 non-null  float64       
 3   rot        1522065 non-null  int64         
 4   heading    1522065 non-null  int64         
 5   navstat    1522065 non-null  int64         
 6   etaRaw     1522065 non-null  object        
 7   latitude   1522065 non-null  float64       
 8   longitude  1522065 non-null  float64       
 9   vesselId   1522065 non-null  object        
 10  portId     1520450 non-null  object        
dtypes: datetime64[ns](1), float64(4), int64(3), object(3)
memory usage: 127.7+ MB
Number of missing values in each column:
 time            0
cog             0
sog             0
rot             0
heading         0
navstat         0
etaRaw          0

Unnamed: 0,portId,name,portLocation,longitude,latitude,UN_LOCODE,countryName,ISO
0,61d36ed80a1807568ff9a064,Port of Algiers,Algiers,3.067222,36.773611,DZALG,Algeria,DZ
1,61d36ed80a1807568ff9a065,Port of Annaba,Annaba,7.7725,36.900556,DZAAE,Algeria,DZ
2,61d36edf0a1807568ff9a070,Port of Oran,Oran,-0.639722,35.712222,DZORN,Algeria,DZ
3,61d36ee00a1807568ff9a072,Port of Skikda,Skikda,6.905833,36.8875,DZSKI,Algeria,DZ
4,61d36ee10a1807568ff9a074,Port of Pago-Pago,Pago-Pago,-170.690556,-14.274167,ASPPG,American Samoa,AS


In [4]:
# Filter the original DataFrame to find missing portId entries
missing_port_entries = train[train['portId'].isnull()]

# Group by 'vesselId' and calculate the total entries and missing portId counts
missing_port_analysis = missing_port_entries.groupby('vesselId').agg(
    total_missing=('portId', 'size'),  # Total missing entries per vessel
    missing_dates=('time', lambda x: list(x))  # List of timestamps when portId is missing
).reset_index()

# Calculate the total entries for each vessel
total_entries = train.groupby('vesselId').size().reset_index(name='total_entries')

# Merge the total entries back into the missing port analysis
missing_port_analysis = missing_port_analysis.merge(total_entries, on='vesselId', how='left')

# Calculate the percentage of missing portId for each vessel
missing_port_analysis['missing_percentage'] = (missing_port_analysis['total_missing'] / missing_port_analysis['total_entries']) * 100

# Sort the DataFrame by missing percentage in descending order
missing_port_analysis_sorted = missing_port_analysis.sort_values(by='missing_percentage', ascending=False)

# Display the sorted analysis with missing timestamps
print("Percentage of missing portId entries for each vessel (sorted):")
print(missing_port_analysis_sorted[['vesselId', 'total_missing', 'total_entries', 'missing_percentage', 'missing_dates']])

# Sort the DataFrame by vesselId
sorted_analysis = missing_port_analysis_sorted.sort_values(by='vesselId')
# Save the sorted DataFrame to a CSV file

sorted_analysis

Percentage of missing portId entries for each vessel (sorted):
                     vesselId  total_missing  total_entries  \
14  clh6aqawa0006gh0zje911dl3            691           5702   
9    61e9f443b937134a3c4c0197            224           2375   
5    61e9f403b937134a3c4c0011             81           1316   
12   61e9f465b937134a3c4c0269            347           6145   
2    61e9f3c9b937134a3c4bfef1             63           3709   
7    61e9f432b937134a3c4c0119             25           1614   
1    61e9f3c3b937134a3c4bfeb3             32           2101   
6    61e9f413b937134a3c4c0057            110           7503   
8    61e9f438b937134a3c4c0145             20           1666   
3    61e9f3c9b937134a3c4bfef5              5           1110   
0    61e9f393b937134a3c4bfd9d              2            683   
15  clh6aqawa0007gh0z9h6zi9bo             11           6665   
4    61e9f3e6b937134a3c4bff6b              1           1235   
11   61e9f464b937134a3c4c0263              1           

Unnamed: 0,vesselId,total_missing,missing_dates,total_entries,missing_percentage
0,61e9f393b937134a3c4bfd9d,2,"[2024-04-30 06:56:35, 2024-04-30 07:17:55]",683,0.292826
1,61e9f3c3b937134a3c4bfeb3,32,"[2024-02-03 15:54:42, 2024-02-03 16:15:40, 202...",2101,1.523084
2,61e9f3c9b937134a3c4bfef1,63,"[2024-02-29 11:58:05, 2024-02-29 12:16:30, 202...",3709,1.698571
3,61e9f3c9b937134a3c4bfef5,5,"[2024-02-20 22:38:59, 2024-02-20 22:59:01, 202...",1110,0.45045
4,61e9f3e6b937134a3c4bff6b,1,[2024-02-16 17:38:18],1235,0.080972
5,61e9f403b937134a3c4c0011,81,"[2024-03-12 23:12:10, 2024-03-13 00:14:46, 202...",1316,6.155015
6,61e9f413b937134a3c4c0057,110,"[2024-02-05 13:17:50, 2024-02-05 13:39:00, 202...",7503,1.46608
7,61e9f432b937134a3c4c0119,25,"[2024-02-19 18:36:27, 2024-02-19 18:56:33, 202...",1614,1.548947
8,61e9f438b937134a3c4c0145,20,"[2024-03-01 16:16:00, 2024-03-01 16:37:00, 202...",1666,1.20048
9,61e9f443b937134a3c4c0197,224,"[2024-01-23 02:55:06, 2024-01-23 03:15:42, 202...",2375,9.431579


In [5]:
port_id_mapping = {port_id: idx for idx, port_id in enumerate(train['portId'].unique())}
train['portId'] = train['portId'].map(port_id_mapping)
ports['portId'] = ports['portId'].map(port_id_mapping)

vessel_id_mapping = {vessel_id: idx for idx, vessel_id in enumerate(train['vesselId'].unique())}
train['vesselId'] = train['vesselId'].map(vessel_id_mapping)
vessels['vesselId'] = vessels['vesselId'].map(vessel_id_mapping)
test['vesselId'] = test['vesselId'].map(vessel_id_mapping)

shipping_line_id_mapping = {shipping_line_id: idx for idx, shipping_line_id in enumerate(vessels['shippingLineId'].unique())}
vessels['shippingLineId'] = vessels['shippingLineId'].map(shipping_line_id_mapping)

In [6]:
vessels.head()

Unnamed: 0,shippingLineId,vesselId,CEU,DWT,GT,NT,vesselType,breadth,depth,draft,enginePower,freshWater,fuel,homePort,length,maxHeight,maxSpeed,maxWidth,rampCapacity,yearBuilt
0,0,599.0,6500,21200.0,58684,17606.0,83.0,32.0,22.2,,0.0,,,OSLO,199.0,5.0,18.6,15.2,150.0,2000
1,1,65.0,4902,12325.0,46800,,83.0,31.0,,,14220.0,,,MONROVIA,182.0,,,,,2006
2,2,640.0,5000,13059.0,46800,,83.0,31.0,,,14220.0,,,SAINT JOHN'S,182.0,,,,,2010
3,3,255.0,4200,12588.0,39362,,83.0,28.0,,,11060.0,,,,167.0,,,,,2011
4,4,68.0,7450,21052.0,75528,24391.0,83.0,37.2,22.23,,13140.0,491.47,3236.78,Panama,199.98,,,,,2018


In [7]:


train = train.merge(ports[['portId', 'latitude', 'longitude']], how='left', left_on='portId', right_on='portId', suffixes=('', '_port'))
train.head()


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,latitude_port,longitude_port
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,0,0,-33.5875,-71.618889
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,1,1,8.967,-79.533
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,2,2,39.2325,-76.558889
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,3,3,-34.4625,150.899444
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,4,4,35.783,-5.817


In [8]:
train = train.merge(vessels[['vesselId', 'length', 'shippingLineId', 'maxSpeed', 'breadth', 'DWT', 'GT']], on='vesselId', how='left')
train['vessel_deep_sea'] = np.where(train['length'] > 200, 1, 0)

num_maxSpeed_nan = train['breadth'].isna().sum()
print(f"Number of NaN values in maxSpeed: {num_maxSpeed_nan}")
train.head()

Number of NaN values in maxSpeed: 14949


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,latitude_port,longitude_port,length,shippingLineId,maxSpeed,breadth,DWT,GT,vessel_deep_sea
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,0,0,-33.5875,-71.618889,199.0,9,,32.0,21214.0,57718,0
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,1,1,8.967,-79.533,199.97,6,22.5,32.26,18878.0,59583,0
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,2,2,39.2325,-76.558889,199.0,14,,32.0,18383.0,59217,0
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,3,3,-34.4625,150.899444,199.0,5,,32.0,15199.0,55598,0
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,4,4,35.783,-5.817,199.95,6,22.4,32.2,18833.0,58939,0


In [9]:
# Calculate the mean breadth for shipping line 5
mean_breadth_shipping_line_5 = train[train['shippingLineId'] == 5]['breadth'].mean()

# Impute missing breadth with the calculated mean
train.loc[train['shippingLineId'] == 5, 'breadth'] = train.loc[train['shippingLineId'] == 5, 'breadth'].fillna(mean_breadth_shipping_line_5)

# Verify if the missing values were filled
missing_count_after_imputation = train['breadth'].isnull().sum()
print(f"Missing Breadth values after imputation: {missing_count_after_imputation}")


Missing Breadth values after imputation: 0


In [10]:

def create_five_day_windows(df):
    # Ensure that 'time' column is in datetime format
    df['time'] = pd.to_datetime(df['time'])
    
    # Sort data to ensure time sequence within each vessel
    df = df.sort_values(by=['vesselId', 'time']).reset_index(drop=True)
    
    window_size_days = 5
    windows = []

    for vessel_id, group in df.groupby('vesselId'):
        group = group.sort_values(by='time')
        unique_dates = group['time'].dt.date.unique()

        for start_idx in range(len(unique_dates) - window_size_days + 1):
            start_date = unique_dates[start_idx]
            end_date = unique_dates[start_idx + window_size_days - 1]
            window = group[(group['time'].dt.date >= start_date) & (group['time'].dt.date <= end_date)]

            # Skip empty windows
            if len(window) == 0:
                continue

            # Reference row: the first row in the window
            reference_row = window.iloc[0]

            # Append each row in this window with reference features from the first row
            for _, row in window.iterrows():
                windows.append({
                    'vesselId': row['vesselId'],
                    'time': row['time'],
                    'latitude': row['latitude'],
                    'longitude': row['longitude'],
                    'cog_sin': row['cog_sin'],
                    'cog_cos': row['cog_cos'],
                    'sog': row['sog'],
                    'rot': row['rot'],
                    'under_way': row['under_way'],
                    'length' : row['length'],
                    'breadth': row['breadth'],
                    # 'DWT': row['DWT'],
                    'GT': row['GT'],
                    # 'vessel_deep_sea': row['vessel_deep_sea'],
                    # Features based on the first row in the window
                    'latitude_first': reference_row['latitude'],
                    'longitude_first': reference_row['longitude'],
                    'cog_sin_first': reference_row['cog_sin'],
                    'cog_cos_first': reference_row['cog_cos'],
                    'sog_first': reference_row['sog'],
                    'rot_first': reference_row['rot'],
                    'under_way_first': reference_row['under_way'],
                    'time_since_start': (row['time'] - reference_row['time']).total_seconds()
                })

    return pd.DataFrame(windows)

In [11]:
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a)) 
    r = 6371  # Radius of Earth in kilometers
    return r * c  # Distance in kilometers

### Feature engineering 

In [12]:
def feature_engineering(train):
    train = train.copy()
    train = train.sort_values(by=['vesselId', 'time']).reset_index(drop=True)

    # # Handle NaNs
    # train[['latitude', 'longitude', 'cog', 'time']] = (
    #     train.groupby('vesselId')[['latitude', 'longitude', 'cog', 'time']].apply(lambda x: x.ffill().bfill())
    # ).reset_index(drop=True)

    features = pd.DataFrame()
    features['vesselId'] = train['vesselId']
    features['time'] = train['time'] 
    features['latitude'] = train['latitude']
    features['longitude'] = train['longitude']
    features['cog'] = train['cog']
    features['sog'] = train['sog']
    features['rot'] = train['rot']
    features['under_way'] = train['navstat'].isin([0, 8]).astype(int)
    features['cog_rad'] = np.radians(features['cog'])
    features['cog_sin'] = np.sin(features['cog_rad'])
    features['cog_cos'] = np.cos(features['cog_rad'])
    features['length'] = train['length']
    features['breadth'] = train['breadth']
    # features['DWT'] = train['DWT']
    features['GT'] = train['GT']
    # features['vessel_deep_sea'] = train['vessel_deep_sea']

    # Additional angle transformations
   

    # Calculate area covered within the time window
    features = create_five_day_windows(features)

    features['latitude_port'] = train['latitude_port']
    features['longitude_port'] = train['longitude_port']

    # Further feature engineering
    features['distance_to_port'] = haversine(train['latitude'], train['longitude'], train['latitude_port'], train['longitude_port'])
 
    # Additional time-based features
    features['month'] = features['time'].dt.month
    features['day'] = features['time'].dt.day
    features['hour'] = features['time'].dt.hour
    features['minute'] = features['time'].dt.minute
    # features['day_of_week'] = features['time'].dt.dayofweek

    return features

train_features = feature_engineering(train)

In [13]:
nan_values = train_features.isnull().sum()
print(nan_values)


# Drop the columns 'time' and 'rot'
train_features = train_features.drop(columns=['time', 'rot_first'])

vesselId                  0
time                      0
latitude                  0
longitude                 0
cog_sin                   0
cog_cos                   0
sog                       0
rot                       0
under_way                 0
length                    0
breadth                   0
GT                        0
latitude_first            0
longitude_first           0
cog_sin_first             0
cog_cos_first             0
sog_first                 0
rot_first                 0
under_way_first           0
time_since_start          0
latitude_port       5674055
longitude_port      5674055
distance_to_port    5674055
month                     0
day                       0
hour                      0
minute                    0
dtype: int64


In [14]:
train_features.head()

Unnamed: 0,vesselId,latitude,longitude,cog_sin,cog_cos,sog,rot,under_way,length,breadth,...,sog_first,under_way_first,time_since_start,latitude_port,longitude_port,distance_to_port,month,day,hour,minute
0,0,-34.7437,-57.8513,-0.970296,0.241922,0.7,0,1,199.0,32.0,...,0.7,1,0.0,-33.5875,-71.618889,1272.179414,1,1,0,0
1,0,-35.16787,-56.7721,0.998806,-0.04885,14.2,0,1,199.0,32.0,...,0.7,1,22123.0,-33.5875,-71.618889,1372.504428,1,1,6,9
2,0,-35.16863,-56.63185,0.999962,-0.008727,14.3,0,1,199.0,32.0,...,0.7,1,23874.0,-33.5875,-71.618889,1385.239318,1,1,6,38
3,0,-35.16805,-56.5319,0.999507,0.031411,14.3,0,1,199.0,32.0,...,0.7,1,25110.0,-33.5875,-71.618889,1394.309141,1,1,6,58
4,0,-35.16715,-56.45306,0.99956,0.029666,12.3,0,1,199.0,32.0,...,0.7,1,26131.0,-33.5875,-71.618889,1401.461476,1,1,7,15


In [15]:
def find_last_features(features):
    # Assuming that the last row for every vesselId is the most recent
    last_features = features.groupby('vesselId').last().reset_index()
    return last_features

last_features = find_last_features(train_features)
# last_features.drop(columns=['DWT']) 
last_features.head()


Unnamed: 0,vesselId,latitude,longitude,cog_sin,cog_cos,sog,rot,under_way,length,breadth,...,sog_first,under_way_first,time_since_start,latitude_port,longitude_port,distance_to_port,month,day,hour,minute
0,0,34.57936,128.99926,-0.66262,-0.748956,15.5,0,1,199.0,32.0,...,0.4,0,1340708.0,-34.19,18.436944,56.735366,5,1,12,41
1,1,1.2446,103.39997,-0.81815,0.575005,15.7,11,1,199.97,32.26,...,0.1,0,2195733.0,12.624444,100.919722,50.185776,4,30,10,12
2,2,18.13873,-69.74863,0.069756,-0.997564,0.4,0,1,199.0,32.0,...,0.0,0,603974.0,53.344444,-6.209444,0.154213,5,7,23,59
3,3,41.64055,143.29942,0.999123,0.041876,14.4,0,1,199.0,32.0,...,9.7,1,455585.0,17.633,-101.55,143.465378,4,26,13,29
4,4,26.5871,121.27831,0.630676,0.776046,12.7,0,1,199.95,32.2,...,0.0,0,562766.0,53.524722,9.963333,2.303701,5,7,12,28


### Train the model

In [16]:
# # Define features and target
y = train_features[['latitude', 'longitude']]  
X = train_features.drop(columns=['latitude', 'longitude']) 
X.head()

Unnamed: 0,vesselId,cog_sin,cog_cos,sog,rot,under_way,length,breadth,GT,latitude_first,...,sog_first,under_way_first,time_since_start,latitude_port,longitude_port,distance_to_port,month,day,hour,minute
0,0,-0.970296,0.241922,0.7,0,1,199.0,32.0,57718,-34.7437,...,0.7,1,0.0,-33.5875,-71.618889,1272.179414,1,1,0,0
1,0,0.998806,-0.04885,14.2,0,1,199.0,32.0,57718,-34.7437,...,0.7,1,22123.0,-33.5875,-71.618889,1372.504428,1,1,6,9
2,0,0.999962,-0.008727,14.3,0,1,199.0,32.0,57718,-34.7437,...,0.7,1,23874.0,-33.5875,-71.618889,1385.239318,1,1,6,38
3,0,0.999507,0.031411,14.3,0,1,199.0,32.0,57718,-34.7437,...,0.7,1,25110.0,-33.5875,-71.618889,1394.309141,1,1,6,58
4,0,0.99956,0.029666,12.3,0,1,199.0,32.0,57718,-34.7437,...,0.7,1,26131.0,-33.5875,-71.618889,1401.461476,1,1,7,15


In [17]:
best_model = RandomForestRegressor(max_depth=25, random_state=42)

# # Define the parameter grid for Random Forest
# param_grid = {
#     'n_estimators': [100, 300],            # Number of trees in the forest
#     'max_depth': [5, 7, 10, None],             # Maximum depth of the tree
#     'min_samples_split': [2, 5],           # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2],            # Minimum number of samples required to be at a leaf node
#     'max_features': ['auto', 'sqrt'],      # The number of features to consider when looking for the best split
#     'bootstrap': [True, False]             # Whether bootstrap samples are used when building trees
# }

# # Perform Randomized Search
# random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
#                                    n_iter=20, scoring='neg_mean_squared_error', cv=5, verbose=1, random_state=42)
# random_search.fit(X, y)

# best_model = random_search.best_estimator_

# # Output the best parameters and score
# print(f"Best parameters: {random_search.best_params_}")
# print(f"Best score (negative mean squared error): {-random_search.best_score_}")

# best_model = xgb.XGBRegressor()

# Fit the model
best_model.fit(X, y)


In [18]:

# Step 1: Get feature importances
feature_importances = best_model.feature_importances_

# Create a DataFrame to display feature importances
features_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort by importance
features_df = features_df.sort_values(by='Importance', ascending=False)

# Step 2: Print feature importances
print("Feature Importances:")
print(features_df)

Feature Importances:
             Feature  Importance
10   longitude_first    0.738036
15  time_since_start    0.115233
9     latitude_first    0.079267
8                 GT    0.011913
11     cog_sin_first    0.010563
0           vesselId    0.007009
13         sog_first    0.006201
6             length    0.005869
7            breadth    0.004801
12     cog_cos_first    0.004087
1            cog_sin    0.003663
20               day    0.003640
3                sog    0.003534
19             month    0.002246
2            cog_cos    0.001107
16     latitude_port    0.001106
17    longitude_port    0.000665
5          under_way    0.000327
18  distance_to_port    0.000274
14   under_way_first    0.000219
21              hour    0.000122
4                rot    0.000090
22            minute    0.000028


In [26]:
X.head()

Unnamed: 0,vesselId,cog_sin,cog_cos,sog,rot,under_way,length,breadth,GT,latitude_first,...,sog_first,under_way_first,time_since_start,latitude_port,longitude_port,distance_to_port,month,day,hour,minute
0,0,-0.970296,0.241922,0.7,0,1,199.0,32.0,57718,-34.7437,...,0.7,1,0.0,-33.5875,-71.618889,1272.179414,1,1,0,0
1,0,0.998806,-0.04885,14.2,0,1,199.0,32.0,57718,-34.7437,...,0.7,1,22123.0,-33.5875,-71.618889,1372.504428,1,1,6,9
2,0,0.999962,-0.008727,14.3,0,1,199.0,32.0,57718,-34.7437,...,0.7,1,23874.0,-33.5875,-71.618889,1385.239318,1,1,6,38
3,0,0.999507,0.031411,14.3,0,1,199.0,32.0,57718,-34.7437,...,0.7,1,25110.0,-33.5875,-71.618889,1394.309141,1,1,6,58
4,0,0.99956,0.029666,12.3,0,1,199.0,32.0,57718,-34.7437,...,0.7,1,26131.0,-33.5875,-71.618889,1401.461476,1,1,7,15


In [28]:
expected_features = X.columns 

### Prepare test data for predictions
def prepare_test_for_predictions(test, last_features):
    test = test.copy()
    prepared_test = pd.DataFrame()

    # Create a time column in last features
    last_features['year'] = 2024
    last_features['time'] = pd.to_datetime(
        last_features[['year', 'month', 'day', 'hour', 'minute']]
    )

    # Add the columns vesselId and time
    prepared_test['vesselId'] = test['vesselId']
    prepared_test['time'] = test['time']

    # For each vessel, add the last seen features to the prepared test
    prepared_test = prepared_test.merge(last_features, on='vesselId', how='left', suffixes=('', '_last'))

    print(prepared_test.head())

    # Move the last_features to the reference row in the windows
    prepared_test['latitude_first'] = prepared_test['latitude']
    prepared_test['longitude_first'] = prepared_test['longitude']
    prepared_test['cog_sin_first'] = prepared_test['cog_sin']
    prepared_test['cog_cos_first'] = prepared_test['cog_cos']  # Fixed typo here
    prepared_test['sog_first'] = prepared_test['sog']
    prepared_test['under_way_first'] = prepared_test['under_way']
    prepared_test['time_since_start'] = (prepared_test['time'] - prepared_test['time_last']).dt.total_seconds()

    # Split the time column into month, day, hour, minute
    prepared_test['month'] = test['time'].dt.month
    prepared_test['day'] = test['time'].dt.day
    prepared_test['hour'] = test['time'].dt.hour
    prepared_test['minute'] = test['time'].dt.minute

    # prepared_test['length'] = prepared_test['length']  # Ensure this column exists in 'prepared_test'
    # prepared_test['breadth'] = prepared_test['breadth']  # Ensure this column exists in 'prepared_test'
    # # prepared_test['DWT'] = prepared_test['DWT']  # Ensure this column exists in 'prepared_test'
    # prepared_test['GT'] = prepared_test['GT']  # Ensure this column exists in 'prepared_test'
    # prepared_test['vessel_deep_sea'] = prepared_test['vessel_deep_sea']  # Ensure this column exists in 'prepared_test'
    # prepared_test['rot'] = prepared_test['rot']  # Ensure this column exists in 'prepared_test'
    # # prepared_test['day_of_week'] = prepared_test['time'].dt.dayofweek  # Added missing day_of_week calculation

    # Drop the columns that are no longer needed
    prepared_test.drop(['time', 'time_last', 'year'], axis=1, inplace=True)

    # Reorder the columns
    for column in expected_features:
        if column not in prepared_test.columns:
            prepared_test[column] = 0  # Assign a default value (0) for missing columns

    return prepared_test

# Prepare the test DataFrame
test_df = prepare_test_for_predictions(test, last_features)
print(test_df.head())
print(test_df.shape)


   vesselId                time  latitude  longitude   cog_sin   cog_cos  \
0       412 2024-05-08 00:03:16  31.14647  -81.49789  0.006981 -0.999976   
1       373 2024-05-08 00:06:17  14.81694  120.29625  0.417867  0.908508   
2       181 2024-05-08 00:10:02  38.27895   10.78280  0.139173  0.990268   
3         8 2024-05-08 00:10:34 -43.53785  172.83522 -0.625243  0.780430   
4        65 2024-05-08 00:12:27  48.53320   -6.12003 -0.933580  0.358368   

    sog  rot  under_way  length  ...  time_since_start  latitude_port  \
0   0.0    0          0   230.0  ...          516824.0            NaN   
1   0.0    0          0   124.0  ...          776687.0            NaN   
2  18.7    0          1   186.0  ...          426241.0            NaN   
3   0.1    0          0   183.0  ...          430552.0      53.952222   
4   0.3    0          0   182.0  ...          430370.0      33.608889   

   longitude_port  distance_to_port  month  day  hour  minute  year  \
0             NaN               N

In [24]:
predictions = best_model.predict(test_df)
print(predictions)
print(predictions.shape)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- latitude
- longitude


In [None]:
predictions_df = pd.DataFrame(predictions, columns=['latitude_predicted', 'longitude_predicted'])
predictions_df['ID'] = range(len(predictions_df))
predictions_df = predictions_df[['ID', 'longitude_predicted', 'latitude_predicted']]

# Save to CSV
predictions_df.to_csv('predictions_ing_wfix.csv', index=False)

In [None]:

# Assuming 'test' and 'predictions_df' are already defined and contain the necessary data
merged_df = pd.merge(test, predictions_df, on='ID', how='left')

# Print the columns to verify
print("Columns in merged_df before dropping:", merged_df.columns.tolist())

# Drop the specified columns, checking if they exist first
columns_to_drop = ['ID', 'scaling_factor']
for col in columns_to_drop:
    if col in merged_df.columns:
        merged_df.drop(col, axis=1, inplace=True)

# Alternatively, you can drop them directly with error handling
# merged_df.drop(columns=[col for col in columns_to_drop if col in merged_df.columns], inplace=True)

# Print the columns after the drop
print("Columns in merged_df after dropping:", merged_df.columns.tolist())

# Display the first few rows of the DataFrame
print(merged_df.head())

In [None]:
predictions_df.to_csv('data/submissions/predictions_2_5d_vm.csv', index=False)

In [None]:
merged_df.to_csv('data/submissions/plotting_boats.csv')