# A bit of feature engineering

In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [4]:
# The AIS training messages are pipe-separated while the test file is
# comma-separated. Both carry a `time` column that is parsed into datetimes
# as part of loading.
train = pd.read_csv('ais_train.csv', sep='|').assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)
print(train.head())

test = pd.read_csv('ais_test.csv', sep=',').assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)
print(test.head())

# Vessel and port metadata are loaded as-is.
vessels = pd.read_csv('vessels.csv', sep='|')
print(vessels.head())

ports = pd.read_csv('ports.csv', sep='|')
print(ports.head())

                 time    cog   sog  rot  heading  navstat       etaRaw  \
0 2024-01-01 00:00:25  284.0   0.7    0       88        0  01-09 23:00   
1 2024-01-01 00:00:36  109.6   0.0   -6      347        1  12-29 20:00   
2 2024-01-01 00:01:45  111.0  11.0    0      112        0  01-02 09:00   
3 2024-01-01 00:03:11   96.4   0.0    0      142        1  12-31 20:00   
4 2024-01-01 00:03:51  214.0  19.7    0      215        0  01-25 12:00   

   latitude  longitude                  vesselId                    portId  
0 -34.74370  -57.85130  61e9f3a8b937134a3c4bfdf7  61d371c43aeaecc07011a37f  
1   8.89440  -79.47939  61e9f3d4b937134a3c4bff1f  634c4de270937fc01c3a7689  
2  39.19065  -76.47567  61e9f436b937134a3c4c0131  61d3847bb7b7526e1adf3d19  
3 -34.41189  151.02067  61e9f3b4b937134a3c4bfe77  61d36f770a1807568ff9a126  
4  35.88379   -5.91636  61e9f41bb937134a3c4c0087  634c4de270937fc01c3a74f3  
   ID                  vesselId                time  scaling_factor
0   0  61e9f3aeb937134a3c

In [5]:
# Replace the string identifiers with dense integer codes, numbered in order
# of first appearance. Port and vessel codes are derived from `train`, so an
# identifier that never occurs there maps to NaN in the other tables.
port_ids = train['portId'].unique()
port_id_mapping = dict(zip(port_ids, range(len(port_ids))))
for frame in (train, ports):
    frame['portId'] = frame['portId'].map(port_id_mapping)

vessel_ids = train['vesselId'].unique()
vessel_id_mapping = dict(zip(vessel_ids, range(len(vessel_ids))))
for frame in (train, vessels, test):
    frame['vesselId'] = frame['vesselId'].map(vessel_id_mapping)

# Shipping lines only exist in the vessel table, so they are coded from it.
shipping_line_ids = vessels['shippingLineId'].unique()
shipping_line_id_mapping = dict(zip(shipping_line_ids, range(len(shipping_line_ids))))
vessels['shippingLineId'] = vessels['shippingLineId'].map(shipping_line_id_mapping)



In [6]:
def rolling_window_outlier_detection(df, window_size=10, threshold=0.2):
    """Flag values that deviate from their trailing rolling window.

    Each value is compared with the mean of the window that ends at (and
    includes) that value, scaled by the window's standard deviation.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Numeric data to scan.
    window_size : int
        Number of consecutive observations per window.
    threshold : float
        Absolute z-score above which a value is flagged.

    Returns
    -------
    Boolean object shaped like ``df``; True marks a flagged value. Positions
    whose z-score is NaN (incomplete window, or 0/0 on a constant window)
    compare as False.
    """
    windows = df.rolling(window=window_size)
    deviation = df - windows.mean()
    scaled_deviation = deviation / windows.std()
    return scaled_deviation.abs() > threshold

# Scan course-over-ground per vessel so that a rolling window never spans
# two different ships.
# NOTE(review): in the recorded run 1,136,907 of 1,522,065 rows are flagged
# with the default threshold - confirm the threshold before acting on this.
cog_by_vessel = train.groupby('vesselId')['cog']
outliers = cog_by_vessel.apply(rolling_window_outlier_detection)
print(outliers)

num_outliers = outliers.sum()
print(f"Number of outliers: {num_outliers}")


vesselId         
0         0          False
          2672       False
          2947       False
          3093       False
          3140       False
                     ...  
687       1476061     True
          1476272    False
          1476362    False
          1476576     True
          1476639     True
Name: cog, Length: 1522065, dtype: bool
Number of outliers: 1136907


In [9]:
def feature_engineering(test, train):
    """Build the per-sample training table for the position model.

    Every returned row describes one sample from ``train``: the vessel, its
    position (``latitude``/``longitude``), the position of the same vessel's
    previous sample in time (``last_longitude``/``last_latitude``), an
    ``under_way`` flag, ``cog``, ``sog`` and the timestamp split into
    ``month``, ``day``, ``hour``, ``minute`` and ``second``.

    Parameters
    ----------
    test : pandas.DataFrame
        Unused; accepted only so existing calls keep working.
    train : pandas.DataFrame
        Must provide ``vesselId``, ``time`` (datetime64), ``latitude``,
        ``longitude``, ``navstat``, ``cog`` and ``sog``. Not modified.

    Returns
    -------
    pandas.DataFrame
        Rows in chronological order, carrying the index labels of ``train``.
        Rows with a missing vesselId, time or position, and rows without a
        previous sample (each vessel's first one), are dropped. The ``time``
        column itself is not part of the result.
    """
    # "Previous sample" is only meaningful in chronological order. The stable
    # sort returns a new frame (the caller's data is untouched) and leaves
    # input that is already ordered by time exactly as it was.
    train = train.sort_values('time', kind='stable')
    per_vessel = train.groupby('vesselId')

    features = train[['vesselId', 'time', 'latitude', 'longitude']].copy()
    features['last_longitude'] = per_vessel['longitude'].shift()
    features['last_latitude'] = per_vessel['latitude'].shift()

    # The shift leaves NaN on the first sample of every vessel; those rows
    # have no previous position to learn from.
    features = features.dropna()

    # NOTE(review): under_way, cog and sog are taken from the same sample as
    # the target position, while prepare_test_for_predictions supplies the
    # values of the last known sample - confirm this mismatch is intended.
    timestamps = features['time']
    features = features.assign(
        # navstat 0 and 8 are treated as "under way"; everything else is 0.
        under_way=train['navstat'].isin([0, 8]).astype(int),
        cog=train['cog'],
        sog=train['sog'],
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        hour=timestamps.dt.hour,
        minute=timestamps.dt.minute,
        second=timestamps.dt.second,
    )

    return features.drop(columns='time')

def find_last_features(features):
    """Return the final row of every vessel, indexed by ``vesselId``.

    Rows are assumed to be ordered so that a vessel's last row is its most
    recent sample. The literal last row is returned, including any NaN it
    contains: ``GroupBy.last()`` would instead pick the last non-null value
    of each column separately and could blend several samples into one row.

    Parameters
    ----------
    features : pandas.DataFrame
        Table with a ``vesselId`` column.

    Returns
    -------
    pandas.DataFrame
        One row per vessel, indexed by ``vesselId`` in ascending order, with
        the remaining columns in their original order.
    """
    # Rows without a vessel id cannot be attributed to any vessel.
    attributed = features[features['vesselId'].notna()]
    final_rows = attributed.drop_duplicates(subset='vesselId', keep='last')
    return final_rows.set_index('vesselId').sort_index()

# Build the training table, then keep each vessel's final row as its last
# known state for prediction time.
features = feature_engineering(test=test, train=train)
last_samples = find_last_features(features=features)
print(last_samples)


          latitude  longitude  last_longitude  last_latitude  under_way  \
vesselId                                                                  
0         34.57936  128.99926       129.01917       34.59684          1   
1          1.24460  103.39997       103.46804        1.22186          1   
2         18.13873  -69.74863       -69.74807       18.14185          1   
3         41.64055  143.29942       141.92751       41.63254          1   
4         26.58710  121.27831       121.23948       26.54636          1   
...            ...        ...             ...            ...        ...   
683       26.93694 -123.62850      -123.35737       26.87353          1   
684       36.64848  125.89047       125.90214       36.68664          1   
685      -33.86094   17.43360        17.94284      -34.35082          1   
686       30.33982  -88.57598       -88.57597       30.33982          0   
687       37.22231  144.98452       144.94732       37.20409          1   

            cog   sog  m

In [10]:
# The model predicts the current position; every other column is an input.
target_columns = ['latitude', 'longitude']
y = features[target_columns]
X = features.drop(columns=target_columns)

In [11]:
# Initialize the Random Forest model.
# A fixed seed makes the forest's randomness repeatable, so a fresh
# Restart & Run All reproduces the same model and the same predictions.
rf_model = RandomForestRegressor(random_state=42)

# Fit the model on both targets (latitude, longitude) at once
rf_model.fit(X, y)

In [24]:
# Peek at the first rows of the targets, then of the model inputs.
for preview in (y, X):
    print(preview.head())

     latitude  longitude
143 -26.77586  153.23453
145  38.47387   15.91592
146  33.90815  130.92404
148  43.44237   -3.82309
149  40.68658   29.31613
     vesselId  last_longitude  last_latitude  under_way    cog   sog  month  \
143        17       153.23435      -26.77612          0   73.9   0.1      1   
145       134        15.91595       38.47387          1  150.7   0.1      1   
146        44       130.90204       33.90403          1   56.4  12.9      1   
148        11        -3.82316       43.44238          0  114.5   0.0      1   
149        85        29.31601       40.68679          0  210.4   0.0      1   

     day  hour  minute  second  
143    1     0      17      17  
145    1     0      18      57  
146    1     0      19      39  
148    1     0      20      35  
149    1     0      21      37  


In [31]:
def prepare_test_for_predictions(test, last_samples):
    """Turn the test rows into the feature layout the model was fitted on.

    Each test row is joined (left join on ``vesselId``) with that vessel's
    last known sample. The sample's ``latitude``/``longitude`` become
    ``last_latitude``/``last_longitude``, the test timestamp is split into
    ``month``, ``day``, ``hour``, ``minute`` and ``second``, and the
    bookkeeping columns ``time``, ``scaling_factor`` and ``ID`` are removed.

    Parameters
    ----------
    test : pandas.DataFrame
        Needs ``ID``, ``vesselId``, ``time`` (datetime64) and
        ``scaling_factor``. Not modified.
    last_samples : pandas.DataFrame
        One row per vessel, keyed by ``vesselId``, in the layout produced by
        ``find_last_features``. Not modified.

    Returns
    -------
    pandas.DataFrame
        One row per test row, in test order. Vessels without an entry in
        ``last_samples`` get NaN in the joined columns.
    """
    # Only the position and motion state carry over; the sample's own
    # "previous position" and timestamp parts are superseded below.
    carried_over = last_samples.drop(
        columns=['last_longitude', 'last_latitude', 'month', 'day', 'hour', 'minute', 'second']
    )

    merged = test.merge(carried_over, on='vesselId', how='left', suffixes=('', '_last'))
    merged = merged.rename(columns={'longitude': 'last_longitude', 'latitude': 'last_latitude'})

    # Swap the two position columns so that longitude comes before latitude,
    # matching the column order used for training.
    ordered = list(merged.columns)
    lon_pos = ordered.index('last_longitude')
    lat_pos = ordered.index('last_latitude')
    ordered[lon_pos], ordered[lat_pos] = ordered[lat_pos], ordered[lon_pos]
    merged = merged[ordered]

    stamps = merged['time']
    merged = merged.assign(
        month=stamps.dt.month,
        day=stamps.dt.day,
        hour=stamps.dt.hour,
        minute=stamps.dt.minute,
        second=stamps.dt.second,
    )

    return merged.drop(columns=['time', 'scaling_factor', 'ID'])

In [32]:
# Shape the test rows like the training inputs, show a sample of them, then
# let the fitted forest predict a (latitude, longitude) pair per row.
test_features = prepare_test_for_predictions(test=test, last_samples=last_samples)
print(test_features.head())

predictions = rf_model.predict(test_features)
print(predictions)

   vesselId  last_longitude  last_latitude  under_way    cog   sog  month  \
0       412       -81.49789       31.14647          0  179.6   0.0      5   
1       373       120.29625       14.81694          0   24.7   0.0      5   
2       181        10.78280       38.27895          1    8.0  18.7      5   
3         8       172.83522      -43.53785          0  321.3   0.1      5   
4        65        -6.12003       48.53320          0  291.0   0.3      5   

   day  hour  minute  second  
0    8     0       3      16  
1    8     0       6      17  
2    8     0      10       2  
3    8     0      10      34  
4    8     0      12      27  
[[ 31.14631651 -81.49787394]
 [ 14.9729479  120.5270161 ]
 [ 38.3487782   10.8158805 ]
 ...
 [ 36.7600096   96.3787963 ]
 [ 58.97784     21.5363864 ]
 [ 56.1507451   10.2337676 ]]


In [34]:
# Create a DataFrame with the required format. The model outputs columns in
# the order it was fitted on: latitude first, then longitude.
predictions_df = pd.DataFrame(predictions, columns=['latitude_predicted', 'longitude_predicted'])

# Take the IDs from the test file instead of assuming they run 0..n-1 in row
# order. Predictions are in test-row order, and a length mismatch raises here
# rather than silently mislabelling rows.
predictions_df['ID'] = test['ID'].to_numpy()
predictions_df = predictions_df[['ID', 'longitude_predicted', 'latitude_predicted']]

# Save to CSV
predictions_df.to_csv('predictions_2.csv', index=False)