# Test 12
- LSTM
- Kaggle score: 138.27419
- Corresponding CSV file: `predictions_12.csv`

In [2]:
import pandas as pd
import numpy as np

## Data preprocessing

### Retrieving data 

In [None]:
# Load the raw input files. The AIS training data, vessel metadata and port
# metadata are pipe-separated; the test file is comma-separated.
train = pd.read_csv('ais_train.csv', sep='|')
train['time'] = pd.to_datetime(train['time'])
train.info()
missing_values = train.isnull().sum()
print("Number of missing values in each column:\n", missing_values)

# A bare `.head()` is only rendered when it is the last expression of a cell, so
# the intermediate previews are handed to display() (a built-in inside Jupyter).
display(train.head())

test = pd.read_csv('ais_test.csv', sep=',')
test['time'] = pd.to_datetime(test['time'])
display(test.head())

vessels = pd.read_csv('vessels.csv', sep='|')
display(vessels.head())

ports = pd.read_csv('ports.csv', sep='|')
ports.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522065 entries, 0 to 1522064
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   time       1522065 non-null  datetime64[ns]
 1   cog        1522065 non-null  float64       
 2   sog        1522065 non-null  float64       
 3   rot        1522065 non-null  int64         
 4   heading    1522065 non-null  int64         
 5   navstat    1522065 non-null  int64         
 6   etaRaw     1522065 non-null  object        
 7   latitude   1522065 non-null  float64       
 8   longitude  1522065 non-null  float64       
 9   vesselId   1522065 non-null  object        
 10  portId     1520450 non-null  object        
dtypes: datetime64[ns](1), float64(4), int64(3), object(3)
memory usage: 127.7+ MB
Number of missing values in each column:
 time            0
cog             0
sog             0
rot             0
heading         0
navstat         0
etaRaw          0

Unnamed: 0,portId,name,portLocation,longitude,latitude,UN_LOCODE,countryName,ISO
0,61d36ed80a1807568ff9a064,Port of Algiers,Algiers,3.067222,36.773611,DZALG,Algeria,DZ
1,61d36ed80a1807568ff9a065,Port of Annaba,Annaba,7.7725,36.900556,DZAAE,Algeria,DZ
2,61d36edf0a1807568ff9a070,Port of Oran,Oran,-0.639722,35.712222,DZORN,Algeria,DZ
3,61d36ee00a1807568ff9a072,Port of Skikda,Skikda,6.905833,36.8875,DZSKI,Algeria,DZ
4,61d36ee10a1807568ff9a074,Port of Pago-Pago,Pago-Pago,-170.690556,-14.274167,ASPPG,American Samoa,AS


### Creating ID-mappings

In [4]:
def _index_by_first_appearance(values):
    """Map each distinct value of a Series to its position in order of first appearance."""
    return {value: position for position, value in enumerate(values.unique())}


# Ports: the integer IDs are derived from the ports seen in the training data and
# then applied to both the AIS rows and the port metadata.
port_id_mapping = _index_by_first_appearance(train['portId'])
train['portId'] = train['portId'].map(port_id_mapping)
ports['portId'] = ports['portId'].map(port_id_mapping)

# Vessels: same scheme, keyed on the vessels seen in the training data and applied
# to the training rows, the vessel metadata and the test rows.
vessel_id_mapping = _index_by_first_appearance(train['vesselId'])
train['vesselId'] = train['vesselId'].map(vessel_id_mapping)
vessels['vesselId'] = vessels['vesselId'].map(vessel_id_mapping)
test['vesselId'] = test['vesselId'].map(vessel_id_mapping)

# Lookup table for inspecting which original vessel ID became which integer
vessel_id_mapping_df = pd.DataFrame({
    'Original Vessel ID': list(vessel_id_mapping.keys()),
    'Mapped Vessel ID': list(vessel_id_mapping.values()),
})

# Shipping lines: IDs are derived from the vessel metadata itself
shipping_line_id_mapping = _index_by_first_appearance(vessels['shippingLineId'])
vessels['shippingLineId'] = vessels['shippingLineId'].map(shipping_line_id_mapping)



In [5]:
# Attach the coordinates of each row's port. The port columns receive a `_port`
# suffix so they do not collide with the vessel's own latitude/longitude.
port_coordinates = ports[['portId', 'latitude', 'longitude']]
train = train.merge(port_coordinates, on='portId', how='left', suffixes=('', '_port'))
train.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,latitude_port,longitude_port
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,0,0,-33.5875,-71.618889
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,1,1,8.967,-79.533
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,2,2,39.2325,-76.558889
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,3,3,-34.4625,150.899444
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,4,4,35.783,-5.817


## Feature engineering

In [None]:
# Function to create the five-day-window features

def create_five_day_windows(df):
    """Expand each vessel's track into overlapping windows of five observed dates.

    For every vessel, a window is opened at each distinct observation date and
    spans that date plus the next four distinct dates on which the vessel has
    data. Every row inside a window is emitted together with the values of the
    window's first row (``*_first`` columns) and the number of seconds elapsed
    since that first row (``time_since_start``). Because the windows overlap, a
    single input row can appear in several windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``vesselId``, ``time``, ``latitude``,
        ``longitude``, ``cog``, ``sog``, ``rot`` and ``under_way``. The frame
        passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (window, observation) pair with the columns listed above,
        their ``*_first`` counterparts (all except ``vesselId``/``time``) and
        ``time_since_start``. If no vessel has at least five distinct dates, an
        empty frame with the same columns is returned.
    """
    window_size_days = 5
    row_columns = ['vesselId', 'time', 'latitude', 'longitude',
                   'cog', 'sog', 'rot', 'under_way']
    reference_columns = ['latitude', 'longitude', 'cog', 'sog', 'rot', 'under_way']
    output_columns = (
        row_columns
        + [f'{column}_first' for column in reference_columns]
        + ['time_since_start']
    )

    # assign() returns a new frame, so the caller's 'time' column is left untouched
    df = df.assign(time=pd.to_datetime(df['time']))

    # Sorting by vessel and time also leaves every group below in time order
    df = df.sort_values(by=['vesselId', 'time']).reset_index(drop=True)

    windows = []
    for _, group in df.groupby('vesselId'):
        # Calendar date of every row, computed once per vessel rather than per window
        dates = group['time'].dt.date
        unique_dates = dates.unique()

        for start_idx in range(len(unique_dates) - window_size_days + 1):
            start_date = unique_dates[start_idx]
            end_date = unique_dates[start_idx + window_size_days - 1]
            in_window = (dates >= start_date) & (dates <= end_date)
            window = group.loc[in_window, row_columns]

            # Skip empty windows
            if window.empty:
                continue

            # Reference values come from the first (earliest) row of the window
            first_values = {
                f'{column}_first': window[column].iloc[0]
                for column in reference_columns
            }
            elapsed = (window['time'] - window['time'].iloc[0]).dt.total_seconds()
            windows.append(window.assign(**first_values, time_since_start=elapsed))

    if not windows:
        # Keep the schema stable so callers can still access the columns
        empty = pd.DataFrame(columns=output_columns)
        empty['time'] = pd.to_datetime(empty['time'])
        return empty

    return pd.concat(windows, ignore_index=True)[output_columns]



In [None]:
# create features to train the model.
def feature_engineering(train):
    """Build the model feature table from the AIS training rows.

    Steps: sort by vessel and time, fill gaps in position/course/time within
    each vessel, derive the binary ``under_way`` flag from ``navstat``, expand
    the rows into five-day windows via ``create_five_day_windows`` and finally
    split the timestamp into calendar components.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs the columns ``vesselId``, ``time``, ``latitude``, ``longitude``,
        ``cog``, ``sog``, ``rot`` and ``navstat``. The frame passed in is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The windowed rows with additional ``month``, ``day``, ``hour`` and
        ``minute`` columns.
    """
    # sort_values returns a new frame, so the caller's data is left untouched
    train = train.sort_values(by=['vesselId', 'time']).reset_index(drop=True)

    # Fill NaNs forward, then backward, within each vessel. GroupBy.ffill/bfill keep
    # the original row index, so the result aligns by label instead of relying on
    # the row order coming out of groupby.apply.
    fill_columns = ['latitude', 'longitude', 'cog', 'time']
    train[fill_columns] = train.groupby('vesselId')[fill_columns].ffill()
    train[fill_columns] = train.groupby('vesselId')[fill_columns].bfill()

    features = train[['vesselId', 'time', 'latitude', 'longitude', 'cog', 'sog', 'rot']].copy()
    # 1 when navstat is 0 or 8, else 0
    # NOTE(review): presumably the AIS "under way" status codes - confirm against the data docs
    features['under_way'] = train['navstat'].isin([0, 8]).astype(int)

    # Create five-day sliding windows for each vessel
    features = create_five_day_windows(features)

    # Calendar components of each row's timestamp
    features['month'] = features['time'].dt.month
    features['day'] = features['time'].dt.day
    features['hour'] = features['time'].dt.hour
    features['minute'] = features['time'].dt.minute

    return features

# Build the windowed training feature table from the AIS training frame
train_features = feature_engineering(train)

In [None]:
# Making sure the features are as they should be

# Missing values per column before cleaning
print(train_features.isnull().sum())

# Drop the columns 'time', 'rot' and 'rot_first', then discard every row that
# still contains a missing value.
train_features = (
    train_features
    .drop(columns=['time', 'rot', 'rot_first'])
    .dropna()
    .reset_index(drop=True)
)

# Missing values per column after cleaning
nan_values = train_features.isnull().sum()
print(nan_values)

print(train_features.head())


vesselId            0
time                0
latitude            0
longitude           0
cog                 0
sog                 0
rot                 0
under_way           0
latitude_first      0
longitude_first     0
cog_first           0
sog_first           0
rot_first           0
under_way_first     0
time_since_start    0
month               0
day                 0
hour                0
minute              0
dtype: int64
vesselId            0
latitude            0
longitude           0
cog                 0
sog                 0
under_way           0
latitude_first      0
longitude_first     0
cog_first           0
sog_first           0
under_way_first     0
time_since_start    0
month               0
day                 0
hour                0
minute              0
dtype: int64
   vesselId  latitude  longitude    cog   sog  under_way  latitude_first  \
0         0 -34.74370  -57.85130  284.0   0.7          1        -34.7437   
1         0 -35.16787  -56.77210   92.8  14.2       

## Train the model

In [10]:
def find_last_features(features):
    """Return the last row of ``features`` for every vessel.

    Rows are assumed to be ordered so that the last row of each vessel is its
    most recent observation.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table containing a ``vesselId`` column.

    Returns
    -------
    pandas.DataFrame
        One row per vessel, sorted by ``vesselId``, with ``vesselId`` as the
        first column and a fresh 0..n-1 index.
    """
    # GroupBy.last() picks the last *non-null value of each column*, which can
    # combine values from different rows; tail(1) keeps the actual last row.
    last_rows = features.groupby('vesselId').tail(1)

    other_columns = [column for column in last_rows.columns if column != 'vesselId']
    last_features = (
        last_rows
        .sort_values('vesselId')
        .loc[:, ['vesselId'] + other_columns]
        .reset_index(drop=True)
    )
    return last_features

# Last row per vessel of the training features; previewed below
last_features = find_last_features(train_features)
last_features.head()

Unnamed: 0,vesselId,latitude,longitude,cog,sog,under_way,latitude_first,longitude_first,cog_first,sog_first,under_way_first,time_since_start,month,day,hour,minute
0,0,34.57936,128.99926,221.5,15.5,1,47.54253,-122.52499,53.8,0.4,0,1340708.0,5,1,12,41
1,1,1.2446,103.39997,305.1,15.7,1,24.98448,55.06391,81.6,0.1,0,2195733.0,4,30,10,12
2,2,18.13873,-69.74863,176.0,0.4,1,40.69757,-74.1509,0.0,0.0,0,603974.0,5,7,23,59
3,3,41.64055,143.29942,87.6,14.4,1,36.84741,125.8366,60.8,9.7,1,455585.0,4,26,13,29
4,4,26.5871,121.27831,39.1,12.7,1,1.26628,103.85164,341.0,0.0,0,562766.0,5,7,12,28


In [11]:
# The model predicts the vessel position (latitude, longitude); every other
# column of the feature table is used as input.
target_columns = ['latitude', 'longitude']
y = train_features[target_columns]
X = train_features.drop(columns=target_columns)

In [15]:
from keras.models import Sequential
from keras.layers import Input, LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

# Normalize the features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Reshape input to be 3D [samples, timesteps, features]; every row is fed to the
# network as a sequence with a single timestep.
X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

# Define the LSTM model. The input shape is declared with an explicit Input layer:
# passing `input_shape` to the first layer of a Sequential model makes Keras 3
# emit a UserWarning.
model = Sequential()
model.add(Input(shape=(X_scaled.shape[1], X_scaled.shape[2])))
model.add(LSTM(50, activation='relu'))
model.add(Dense(2))  # Output layer with 2 neurons for latitude and longitude

model.compile(optimizer='adam', loss='mse')

# Fit the model
model.fit(X_scaled, y, epochs=50, batch_size=32, verbose=1)

  super().__init__(**kwargs)


Epoch 1/50
[1m224829/224829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 446us/step - loss: 511.6123
Epoch 2/50
[1m224829/224829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 439us/step - loss: 264.0109
Epoch 3/50
[1m224829/224829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 426us/step - loss: 244.5577
Epoch 4/50
[1m224829/224829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 435us/step - loss: 230.0550
Epoch 5/50
[1m224829/224829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 435us/step - loss: 218.1269
Epoch 6/50
[1m224829/224829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 428us/step - loss: 210.3071
Epoch 7/50
[1m224829/224829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 450us/step - loss: 204.7584
Epoch 8/50
[1m224829/224829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 433us/step - loss: 198.1794
Epoch 9/50
[1m224829/224829[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 428us/step - loss:

<keras.src.callbacks.history.History at 0x454857610>

## Preparing test data for prediction

In [16]:
# Column order the model was trained on; the prepared test frame has to match it:
#   ['vesselId', 'cog', 'sog', 'under_way', 'latitude_first',
#    'longitude_first', 'cog_first', 'sog_first', 'under_way_first',
#    'time_since_start', 'month', 'day', 'hour', 'minute']
print(X.columns)

### Prepare test data for predictions
def prepare_test_for_predictions(test, last_features, year=2024):
    """Build the model input rows for the test set from each vessel's last known row.

    Every test row is joined with the last known feature row of its vessel. That
    last known state becomes the window reference (``*_first`` columns),
    ``time_since_start`` is the number of seconds between the last known
    timestamp and the requested test timestamp, and the calendar columns are
    taken from the test timestamp.

    Parameters
    ----------
    test : pandas.DataFrame
        Test rows with ``vesselId`` and a datetime ``time`` column.
    last_features : pandas.DataFrame
        One row per vessel with ``latitude``, ``longitude``, ``cog``, ``sog``,
        ``under_way``, ``month``, ``day``, ``hour`` and ``minute``. The frame
        passed in is not modified.
    year : int, default 2024
        Year used to rebuild the last known timestamp, since ``last_features``
        only carries month/day/hour/minute (seconds are not available).

    Returns
    -------
    pandas.DataFrame
        One row per test row with the columns in the order the model was
        trained on.
    """
    # drop() returns a new frame, so the caller's last_features is left untouched;
    # helper columns from an earlier run are removed so the merge below stays clean.
    last_features = last_features.drop(columns=['year', 'time', 'time_last'], errors='ignore')

    # Rebuild the timestamp of the last known row from its calendar components
    timestamp_parts = last_features[['month', 'day', 'hour', 'minute']].assign(year=year)
    last_features = last_features.assign(time_last=pd.to_datetime(timestamp_parts))

    # For each test row, attach the last seen features of its vessel
    prepared_test = test[['vesselId', 'time']].merge(last_features, on='vesselId', how='left')

    print(prepared_test.head())

    # The last known state acts as the reference row of the window
    prepared_test['latitude_first'] = prepared_test['latitude']
    prepared_test['longitude_first'] = prepared_test['longitude']
    prepared_test['cog_first'] = prepared_test['cog']
    prepared_test['sog_first'] = prepared_test['sog']
    prepared_test['under_way_first'] = prepared_test['under_way']
    prepared_test['time_since_start'] = (
        prepared_test['time'] - prepared_test['time_last']
    ).dt.total_seconds()

    # Calendar components of the test timestamp. They are read from the merged
    # frame (not from `test`) so the values stay row-aligned even when `test`
    # does not have a default 0..n-1 index.
    prepared_test['month'] = prepared_test['time'].dt.month
    prepared_test['day'] = prepared_test['time'].dt.day
    prepared_test['hour'] = prepared_test['time'].dt.hour
    prepared_test['minute'] = prepared_test['time'].dt.minute

    # Keep only the model inputs, in training column order
    feature_order = ['vesselId', 'cog', 'sog', 'under_way',
                     'latitude_first', 'longitude_first', 'cog_first', 'sog_first',
                     'under_way_first', 'time_since_start', 'month', 'day',
                     'hour', 'minute']
    return prepared_test[feature_order]

# Build the model-ready test frame, then print a preview and its shape
test_df = prepare_test_for_predictions(test, last_features)
print(test_df.head())
print(test_df.shape)


Index(['vesselId', 'cog', 'sog', 'under_way', 'latitude_first',
       'longitude_first', 'cog_first', 'sog_first', 'under_way_first',
       'time_since_start', 'month', 'day', 'hour', 'minute'],
      dtype='object')
   vesselId                time  latitude  longitude    cog   sog  under_way  \
0       412 2024-05-08 00:03:16  31.14647  -81.49789  179.6   0.0          0   
1       373 2024-05-08 00:06:17  14.81694  120.29625   24.7   0.0          0   
2       181 2024-05-08 00:10:02  38.27895   10.78280    8.0  18.7          1   
3         8 2024-05-08 00:10:34 -43.53785  172.83522  321.3   0.1          0   
4        65 2024-05-08 00:12:27  48.53320   -6.12003  291.0   0.3          0   

   latitude_first  longitude_first  cog_first  sog_first  under_way_first  \
0        30.93466        -81.08673      159.0        0.0                0   
1        35.44624        139.71653       82.7        0.0                0   
2        38.00859         11.68428       40.0       18.7             

### Saving the trained model and corresponding scaler to files

In [26]:
import joblib

# Output files for the fitted network and the scaler that was fitted on its inputs
MODEL_PATH = 'LSTM12.keras'
SCALER_PATH = 'scaler12.pkl'

model.save(MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

['scaler12.pkl']

## Make predictions

In [27]:
# Make predictions
# Scale the test features with the scaler fitted on the training data, then insert
# the single-timestep axis so the array is [samples, 1, features].
test_df_scaled = np.expand_dims(scaler.transform(test_df), axis=1)
predictions = model.predict(test_df_scaled)

print(predictions)
print(predictions.shape)

[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 257us/step
[[  32.00753    -83.24888  ]
 [  12.202564   127.922905 ]
 [  43.05332      8.9681015]
 ...
 [  44.643593  -138.83044  ]
 [  51.104652    21.17188  ]
 [  54.434044    -2.3198915]]
(51739, 2)


In [28]:
# Create a DataFrame with the required format
# The model outputs (latitude, longitude) per row; the submission file lists
# ID, longitude, latitude, with ID being the row position.
predictions_df = pd.DataFrame({
    'ID': range(len(predictions)),
    'longitude_predicted': predictions[:, 1],
    'latitude_predicted': predictions[:, 0],
})

# Save to CSV
predictions_df.to_csv('predictions_12.csv', index=False)