# A bit of feature engineering
## Adding time since last signal to the features

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Training AIS signals are '|'-separated; the timestamp column is parsed on load.
train = pd.read_csv('ais_train.csv', sep='|').assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)
print(train.head())

# The test file is ','-separated, unlike the other inputs.
test = pd.read_csv('ais_test.csv', sep=',').assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)
print(test.head())

# Vessel and port lookup tables are loaded without any post-processing.
vessels = pd.read_csv('vessels.csv', sep='|')
print(vessels.head())

ports = pd.read_csv('ports.csv', sep='|')
print(ports.head())

                 time    cog   sog  rot  heading  navstat       etaRaw  \
0 2024-01-01 00:00:25  284.0   0.7    0       88        0  01-09 23:00   
1 2024-01-01 00:00:36  109.6   0.0   -6      347        1  12-29 20:00   
2 2024-01-01 00:01:45  111.0  11.0    0      112        0  01-02 09:00   
3 2024-01-01 00:03:11   96.4   0.0    0      142        1  12-31 20:00   
4 2024-01-01 00:03:51  214.0  19.7    0      215        0  01-25 12:00   

   latitude  longitude                  vesselId                    portId  
0 -34.74370  -57.85130  61e9f3a8b937134a3c4bfdf7  61d371c43aeaecc07011a37f  
1   8.89440  -79.47939  61e9f3d4b937134a3c4bff1f  634c4de270937fc01c3a7689  
2  39.19065  -76.47567  61e9f436b937134a3c4c0131  61d3847bb7b7526e1adf3d19  
3 -34.41189  151.02067  61e9f3b4b937134a3c4bfe77  61d36f770a1807568ff9a126  
4  35.88379   -5.91636  61e9f41bb937134a3c4c0087  634c4de270937fc01c3a74f3  
   ID                  vesselId                time  scaling_factor
0   0  61e9f3aeb937134a3c

In [3]:
# Replace the string identifiers with dense integer codes.
# Codes follow first-appearance order in the frame that defines each mapping;
# identifiers that are absent from a mapping become NaN after .map().

# Ports: the codes are defined by the ports that occur in train.
known_ports = train['portId'].unique()
port_id_mapping = dict(zip(known_ports, range(len(known_ports))))
for frame in (train, ports):
    frame['portId'] = frame['portId'].map(port_id_mapping)

# Vessels: the codes are defined by the vessels that occur in train.
known_vessels = train['vesselId'].unique()
vessel_id_mapping = dict(zip(known_vessels, range(len(known_vessels))))
for frame in (train, vessels, test):
    frame['vesselId'] = frame['vesselId'].map(vessel_id_mapping)

# Shipping lines: the codes are defined by the vessels table itself.
known_shipping_lines = vessels['shippingLineId'].unique()
shipping_line_id_mapping = dict(zip(known_shipping_lines, range(len(known_shipping_lines))))
vessels['shippingLineId'] = vessels['shippingLineId'].map(shipping_line_id_mapping)



In [5]:
def feature_engineering(test, train):
    """Build the per-signal feature table used to train the position model.

    Every output row describes one AIS signal together with the previous
    signal of the same vessel: seconds elapsed since that previous signal and
    the position it reported. The first signal of each vessel has no
    predecessor and is therefore dropped.

    Parameters
    ----------
    test : pandas.DataFrame
        Not used. Kept only so that existing ``feature_engineering(test, train)``
        calls keep working.
    train : pandas.DataFrame
        Needs the columns ``vesselId``, ``time`` (datetime64), ``latitude``,
        ``longitude``, ``navstat``, ``cog`` and ``sog``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``vesselId``, ``latitude``, ``longitude``,
        ``time_since_last_seen``, ``last_longitude``, ``last_latitude``,
        ``under_way``, ``cog``, ``sog``, ``month``, ``day``, ``hour``,
        ``minute``, ``second``. Rows keep their index labels from ``train``
        and are in chronological order.
    """
    # diff()/shift() below compare each row with the previous row of the same
    # vessel, so rows must be chronological. A stable sort leaves input that is
    # already ordered untouched and repairs input that is not.
    ordered = train.sort_values('time', kind='mergesort')
    per_vessel = ordered.groupby('vesselId')

    features = ordered[['vesselId', 'time', 'latitude', 'longitude']].copy()

    # Seconds elapsed since the previous signal of the same vessel.
    features['time_since_last_seen'] = per_vessel['time'].diff().dt.total_seconds()

    # Position reported by the previous signal of the same vessel.
    features['last_longitude'] = per_vessel['longitude'].shift()
    features['last_latitude'] = per_vessel['latitude'].shift()

    # The first signal of every vessel has NaN in the three columns above.
    features = features.dropna()

    # Binary flag: 1 when navstat is 0 or 8, 0 otherwise.
    # Assignments below align on the index, so only surviving rows are filled.
    features['under_way'] = ordered['navstat'].isin([0, 8]).astype(int)
    features['cog'] = ordered['cog']
    features['sog'] = ordered['sog']

    # Split the timestamp into numeric components; the raw datetime is dropped
    # afterwards so that only numeric columns remain.
    stamp = features['time'].dt
    features['month'] = stamp.month
    features['day'] = stamp.day
    features['hour'] = stamp.hour
    features['minute'] = stamp.minute
    features['second'] = stamp.second

    return features.drop(columns='time')

def find_last_features(features):
    """Collapse the feature table to one row per vessel, indexed by ``vesselId``.

    The rows of each vessel are expected to be in chronological order, so the
    final entry of a group is taken to be the most recent one.

    Note: pandas' ``GroupBy.last`` picks the last non-null value of every
    column independently. If the final row of a vessel contains a NaN, that
    cell is filled from an earlier row of the same vessel.
    """
    per_vessel = features.groupby('vesselId')
    return per_vessel.last()

# Build the training feature table, keep the latest row of every vessel for
# inference, and preview the first five feature rows of each vessel.
features = feature_engineering(test=test, train=train)
last_samples = find_last_features(features=features)
print(features.groupby(by='vesselId').head(n=5))


        vesselId  latitude  longitude  time_since_last_seen  last_longitude  \
143           17 -26.77586  153.23453                 361.0       153.23435   
145          134  38.47387   15.91592                 159.0        15.91595   
146           44  33.90815  130.92404                 334.0       130.90204   
148           11  43.44237   -3.82309                 720.0        -3.82316   
149           85  40.68658   29.31613                 360.0        29.31601   
...          ...       ...        ...                   ...             ...   
805661       335  36.98805  126.14394                1260.0       126.06407   
805814       335  37.01741  126.22313                1242.0       126.14394   
805922       335  37.04801  126.28195                1170.0       126.22313   
918933       581  36.93893  126.05745             5923375.0        18.24441   
919099       581  36.97329  126.13773                 900.0       126.05745   

        last_latitude  under_way    cog   sog  mont

In [7]:
# The targets are the observed position; every remaining column is a model input.
y = features.loc[:, ['latitude', 'longitude']]
X = features.drop(['latitude', 'longitude'], axis=1)

In [8]:
# Initialize the Random Forest model.
# The seed is fixed so that the bootstrap samples and feature draws -- and
# therefore the fitted forest and its predictions -- are identical on every
# Restart & Run All.
SEED = 42
rf_model = RandomForestRegressor(random_state=SEED)

# Fit a single multi-output forest: y holds both latitude and longitude.
rf_model.fit(X, y)

In [13]:
# Preview the targets first, then the model inputs.
for frame in (y, X):
    print(frame.head())

     latitude  longitude
143 -26.77586  153.23453
145  38.47387   15.91592
146  33.90815  130.92404
148  43.44237   -3.82309
149  40.68658   29.31613
     vesselId  time_since_last_seen  last_longitude  last_latitude  under_way  \
143        17                 361.0       153.23435      -26.77612          0   
145       134                 159.0        15.91595       38.47387          1   
146        44                 334.0       130.90204       33.90403          1   
148        11                 720.0        -3.82316       43.44238          0   
149        85                 360.0        29.31601       40.68679          0   

       cog   sog  month  day  hour  minute  second  
143   73.9   0.1      1    1     0      17      17  
145  150.7   0.1      1    1     0      18      57  
146   56.4  12.9      1    1     0      19      39  
148  114.5   0.0      1    1     0      20      35  
149  210.4   0.0      1    1     0      21      37  


In [44]:
# Preview the per-vessel rows that seed the test features (first five vessels).
print(last_samples.head(n=5))

def prepare_test_for_predictions(test, last_samples, year=2024):
    """Turn the test rows into a feature matrix laid out like the training matrix.

    For every test row the "previous signal" is the vessel's row in
    ``last_samples``: its position becomes ``last_latitude`` /
    ``last_longitude``, its ``under_way`` / ``cog`` / ``sog`` are carried
    over, and ``time_since_last_seen`` is the number of seconds between that
    signal and the test timestamp.

    Parameters
    ----------
    test : pandas.DataFrame
        Needs the columns ``vesselId`` and ``time`` (datetime64). Any other
        column (e.g. ``ID``, ``scaling_factor``) is ignored. Not modified.
    last_samples : pandas.DataFrame
        One row per vessel, keyed by ``vesselId`` either as the index or as a
        column, with the columns ``latitude``, ``longitude``, ``under_way``,
        ``cog``, ``sog``, ``month``, ``day``, ``hour``, ``minute`` and
        ``second``. Not modified.
    year : int, default 2024
        Year used to rebuild the timestamp of ``last_samples``, which only
        carries month/day/hour/minute/second components.

    Returns
    -------
    pandas.DataFrame
        One row per test row, in the order of ``test``, with the columns
        ``vesselId``, ``time_since_last_seen``, ``last_longitude``,
        ``last_latitude``, ``under_way``, ``cog``, ``sog``, ``month``, ``day``,
        ``hour``, ``minute``, ``second``. Vessels that have no row in
        ``last_samples`` get NaN in the columns derived from it.
    """
    # Explicit output layout: selecting by name at the end replaces positional
    # column shuffling and keeps stray test columns away from the model.
    feature_order = [
        'vesselId', 'time_since_last_seen', 'last_longitude', 'last_latitude',
        'under_way', 'cog', 'sog', 'month', 'day', 'hour', 'minute', 'second',
    ]

    # Accept vesselId either as the index (groupby output) or as a column.
    if 'vesselId' in last_samples.columns:
        last_samples = last_samples.set_index('vesselId')

    # Rebuild the timestamp of each vessel's last signal from its components.
    stamp_parts = last_samples[['month', 'day', 'hour', 'minute', 'second']].assign(year=year)
    last_seen = (
        last_samples[['longitude', 'latitude', 'under_way', 'cog', 'sog']]
        .rename(columns={'longitude': 'last_longitude', 'latitude': 'last_latitude'})
        .assign(time_last=pd.to_datetime(stamp_parts))
        .reset_index()
    )

    # A left merge keeps every test row, in its original order.
    merged = test.merge(last_seen, on='vesselId', how='left')

    # Seconds between the vessel's last known signal and the requested time.
    merged['time_since_last_seen'] = (merged['time'] - merged['time_last']).dt.total_seconds()

    # Same timestamp decomposition as used for the training features.
    stamp = merged['time'].dt
    merged = merged.assign(
        month=stamp.month,
        day=stamp.day,
        hour=stamp.hour,
        minute=stamp.minute,
        second=stamp.second,
    )

    return merged[feature_order]


          latitude  longitude  time_since_last_seen  last_longitude  \
vesselId                                                              
0         34.57936  128.99926                 335.0       129.01917   
1          1.24460  103.39997                1007.0       103.46804   
2         18.13873  -69.74863                1250.0       -69.74807   
3         41.64055  143.29942               14489.0       141.92751   
4         26.58710  121.27831                 908.0       121.23948   

          last_latitude  under_way    cog   sog  month  day  hour  minute  \
vesselId                                                                    
0              34.59684          1  221.5  15.5      5    1    12      41   
1               1.22186          1  305.1  15.7      4   30    10      12   
2              18.14185          1  176.0   0.4      5    7    23      59   
3              41.63254          1   87.6  14.4      4   26    13      29   
4              26.54636          1   39.

In [45]:
# Predict positions for the test set.
# The training matrix is printed first so that its column layout can be
# compared by eye with the test matrix printed right after it.
print(X.head(n=5))
test_features = prepare_test_for_predictions(test=test, last_samples=last_samples)
print(test_features.head(n=5))
predictions = rf_model.predict(X=test_features)
print(predictions)

     vesselId  time_since_last_seen  last_longitude  last_latitude  under_way  \
143        17                 361.0       153.23435      -26.77612          0   
145       134                 159.0        15.91595       38.47387          1   
146        44                 334.0       130.90204       33.90403          1   
148        11                 720.0        -3.82316       43.44238          0   
149        85                 360.0        29.31601       40.68679          0   

       cog   sog  month  day  hour  minute  second  
143   73.9   0.1      1    1     0      17      17  
145  150.7   0.1      1    1     0      18      57  
146   56.4  12.9      1    1     0      19      39  
148  114.5   0.0      1    1     0      20      35  
149  210.4   0.0      1    1     0      21      37  
   vesselId  time_since_last_seen  last_longitude  last_latitude  under_way  \
0       412                 900.0       -81.49789       31.14647          0   
1       373                 541.0    

In [47]:
# Create a DataFrame with the required format.
# The model was fitted on y = [latitude, longitude], so the two prediction
# columns come back in that order.
predictions_df = pd.DataFrame(predictions, columns=['latitude_predicted', 'longitude_predicted'])

# Take the IDs from the test set itself rather than regenerating 0..n-1.
# prepare_test_for_predictions left-merges onto test, so prediction i belongs
# to test row i. A length mismatch raises here instead of mislabelling rows.
predictions_df['ID'] = test['ID'].to_numpy()
predictions_df = predictions_df[['ID', 'longitude_predicted', 'latitude_predicted']]

# Save to CSV
predictions_df.to_csv('predictions_3.csv', index=False)