# Feature engineering 

In [5]:
import pandas as pd
from geopy.distance import geodesic

In [6]:
ais_train = pd.read_csv('ais_train.csv', sep='|')
ports = pd.read_csv('ports.csv', sep='|')
vessels = pd.read_csv('vessels.csv', sep='|')

In [7]:
def preprocess(ais_train, ports, vessels):
    ais_data = ais_train.copy()
    ports_data = ports.copy()
    vessels_data = vessels.copy()

    # Remove the rows in ais_data that does not contain a portId
    ais_data = ais_data[ais_data['portId'].notnull()]

    # Removing unwanted columns
    ais_data.drop(columns=['heading'], inplace=True)
    ports_data.drop(columns=['name', 'portLocation', 'UN_LOCODE', 'countryName', 'ISO'], inplace=True)
    vessels_data.drop(columns=['DWT', 'NT', 'vesselType', 'depth', 'draft', 'enginePower', 'freshWater', 'fuel', 'maxHeight', 'maxSpeed', 'maxWidth', 'rampCapacity', 'yearBuilt'], inplace=True)

    # Merging ais_data with ports_data on portId
    data = pd.merge(ais_data, ports_data, on='portId', how='left')

    # Merging data with vessels_data on vesselId
    data = pd.merge(data, vessels_data, on='vesselId', how='left')

    # Ensure 'etaRaw' and 'time' columns are strings
    data['etaRaw'] = data['etaRaw'].astype(str)
    data['time'] = data['time'].astype(str)

    # Extract calendar features for 'etaRaw'
    data[['etaMonth', 'etaDay', 'etaHour', 'etaMinute']] = data['etaRaw'].str.extract(r'(\d{2})-(\d{2}) (\d{2}):(\d{2})')

    # Extract calendar features for 'time'
    data[['timeYear', 'timeMonth', 'timeDay', 'timeHour', 'timeMinute', 'timeSecond']] = data['time'].str.extract(r'(\d{2})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})')

    # Convert objects to integers
    data[['etaMonth', 'etaDay', 'etaHour', 'etaMinute', 'timeYear', 'timeMonth', 'timeDay', 'timeHour', 'timeMinute', 'timeSecond']] = data[['etaMonth', 'etaDay', 'etaHour', 'etaMinute', 'timeYear', 'timeMonth', 'timeDay', 'timeHour', 'timeMinute', 'timeSecond']].astype(int)

    # Drop time and etaRaw columns
    data.drop(columns=['time', 'etaRaw'], inplace=True)

    # Removing sog outliers 
    data = data[data['sog'] <= 40]
    
    return data

data = preprocess(ais_train, ports, vessels)
print(data.head())


     cog   sog  rot  navstat  latitude_x  longitude_x  \
0  284.0   0.7    0        0   -34.74370    -57.85130   
1  109.6   0.0   -6        1     8.89440    -79.47939   
2  111.0  11.0    0        0    39.19065    -76.47567   
3   96.4   0.0    0        1   -34.41189    151.02067   
4  214.0  19.7    0        0    35.88379     -5.91636   

                   vesselId                    portId  longitude_y  \
0  61e9f3a8b937134a3c4bfdf7  61d371c43aeaecc07011a37f   -71.618889   
1  61e9f3d4b937134a3c4bff1f  634c4de270937fc01c3a7689   -79.533000   
2  61e9f436b937134a3c4c0131  61d3847bb7b7526e1adf3d19   -76.558889   
3  61e9f3b4b937134a3c4bfe77  61d36f770a1807568ff9a126   150.899444   
4  61e9f41bb937134a3c4c0087  634c4de270937fc01c3a74f3    -5.817000   

   latitude_y  ... etaMonth  etaDay  etaHour  etaMinute timeYear  timeMonth  \
0    -33.5875  ...        1       9       23          0       24          1   
1      8.9670  ...       12      29       20          0       24          1   

In [8]:
def feature_engineering(df):
    df = df.copy()

    # Add NAVSTAT info
    df['is_moored'] = df['navstat'].apply(lambda x: 1 if x == 5 else 0)
    df['is_anchor'] = df['navstat'].apply(lambda x: 1 if x == 1 else 0)
    df['is_moving'] = df['navstat'].apply(lambda x: 1 if x == 0 else 0)

    df.drop(columns=['navstat'], inplace=True)
    
    # Group by vesselId and apply shift
    df['latitude_shifted'] = df.groupby('vesselId')['latitude_x'].shift()
    df['longitude_shifted'] = df.groupby('vesselId')['longitude_x'].shift()

    # Drop rows with NaN values in latitude and longitude columns
    df.dropna(subset=['latitude_x', 'longitude_x', 'latitude_y', 'longitude_y', 'latitude_shifted', 'longitude_shifted'], inplace=True)

    # Calculate the distance between the current and previous location
    df['distance'] = df.apply(lambda x: geodesic((x['latitude_x'], x['longitude_x']), (x['latitude_shifted'], x['longitude_shifted'])).meters, axis=1)

    # Drop the shifted columns
    df.drop(columns=['latitude_shifted', 'longitude_shifted'], inplace=True)

    # Add distance to port
    df['distance_to_port'] = df.apply(lambda x: geodesic((x['latitude_x'], x['longitude_x']), (x['latitude_y'], x['longitude_y'])).meters, axis=1)

    return df

features = feature_engineering(data)
print(features.head())


       cog   sog  rot  latitude_x  longitude_x                   vesselId  \
142   73.9   0.1   12   -26.77586    153.23453   61e9f3acb937134a3c4bfe23   
144  150.7   0.1    0    38.47387     15.91592   61e9f3cab937134a3c4bff01   
145   56.4  12.9  -16    33.90815    130.92404   61e9f3e1b937134a3c4bff59   
147  114.5   0.0    0    43.44237     -3.82309  clh6aqawa0002gh0zypfa5dut   
148  210.4   0.0   -5    40.68658     29.31613   61e9f3c7b937134a3c4bfedf   

                       portId  longitude_y  latitude_y  \
142  61d36f640a1807568ff9a103   153.169444  -27.382500   
144  61d3781393c6feb83e5eb73d    15.904444   38.456667   
145  61d37a591366c3998241d986   126.383056   34.776111   
147  61d37fb929b60f6113c89ea0    -3.807778   43.442222   
148  61d38259b7b7526e1adf3a41    29.841944   40.751111   

               shippingLineId  ...  timeMonth  timeDay  timeHour timeMinute  \
142  61a8e672f9cba188601e84ab  ...          1        1         0         17   
144  61ec6303a8cafc0e93f0e8f3 

In [9]:
ais_test = pd.read_csv('ais_test.csv', sep='|')

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Prepare the training data
X = features.drop(columns=['latitude_x', 'longitude_x'])
y_lat = features['latitude_x']
y_lon = features['longitude_x']

# Split the data into training and validation sets
X_train, X_val, y_lat_train, y_lat_val, y_lon_train, y_lon_val = train_test_split(X, y_lat, y_lon, test_size=0.2, random_state=42)

# Train the RandomForestRegressor model for latitude
model_lat = RandomForestRegressor(n_estimators=100, random_state=42)
model_lat.fit(X_train, y_lat_train)

# Train the RandomForestRegressor model for longitude
model_lon = RandomForestRegressor(n_estimators=100, random_state=42)
model_lon.fit(X_train, y_lon_train)

# Make predictions on the validation set
y_lat_pred = model_lat.predict(X_val)
y_lon_pred = model_lon.predict(X_val)

# Calculate the mean squared error for validation
mse_lat = mean_squared_error(y_lat_val, y_lat_pred)
mse_lon = mean_squared_error(y_lon_val, y_lon_pred)
print(f'Mean Squared Error for Latitude: {mse_lat}')
print(f'Mean Squared Error for Longitude: {mse_lon}')

# Prepare the test data
X_test = ais_test.drop(columns=['ID', 'vesselId', 'time', 'scaling_factor'])

# Make predictions on the test set
ais_test['predicted_latitude'] = model_lat.predict(X_test)
ais_test['predicted_longitude'] = model_lon.predict(X_test)

print(ais_test[['ID', 'predicted_latitude', 'predicted_longitude']])

ValueError: could not convert string to float: '61e9f462b937134a3c4c0251'