Importer biblioteker

In [286]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error

Importer data

In [287]:
# Load the raw AIS tables. The training file is pipe-delimited, while the test
# file is read with pandas' default comma delimiter.
train_data = pd.read_csv('data/ais_train.csv', delimiter='|')
test_data = pd.read_csv('data/ais_test.csv', sep=',')

Preprosessering

In [288]:
# --- Timestamps -------------------------------------------------------------
# Parse the `time` column of both tables into pandas datetimes.
for frame in (train_data, test_data):
    frame['time'] = pd.to_datetime(frame['time'])

# --- Categorical columns ----------------------------------------------------
# Each categorical column gets an integer-coded `<name>_encoded` companion.
# NOTE(review): the fitted encoders are not kept, so the exact same mapping
# cannot be re-applied to another table later without refitting.
for raw_col in ('navstat', 'vesselId', 'portId'):
    train_data[f'{raw_col}_encoded'] = LabelEncoder().fit_transform(train_data[raw_col])


def _seconds_since_first(timestamps):
    """Return seconds elapsed since the earliest timestamp in the series."""
    return (timestamps - timestamps.min()).dt.total_seconds()


# Replace the absolute timestamp with seconds since each vessel's first
# training observation.
train_data['time'] = train_data.groupby('vesselId')['time'].transform(_seconds_since_first)

# The raw categorical columns are no longer needed once encoded.
train_data = train_data.drop(columns=['navstat', 'vesselId', 'portId'])

# --- Feature matrix and targets ---------------------------------------------
features = ['time', 'sog', 'cog', 'heading', 'rot', 'navstat_encoded', 'vesselId_encoded', 'portId_encoded']
x_train = train_data[features]

# One call splits the features and both targets with the same seed, so the
# latitude and longitude models see the same 50/50 row partition.
# NOTE(review): this is a random row-wise split of time-ordered data, so the
# validation rows are interleaved in time with the training rows.
(x_lat_train, x_lat_val,
 y_lat_train, y_lat_val,
 y_lon_train, y_lon_val) = train_test_split(
    x_train,
    train_data['latitude'],
    train_data['longitude'],
    test_size=0.5,
    random_state=42,
)
x_lon_train, x_lon_val = x_lat_train, x_lat_val

print('Training data:')
print(x_lat_train.head())


Training data:
              time  sog    cog  heading  rot  navstat_encoded  \
954763   6058801.0  0.0   28.8        2    0                5   
1101750  8133460.0  0.0  146.0      232    0                5   
1332982  9704559.0  0.0   15.9      194    0                5   
116730    111208.0  0.0  222.1       46    0                5   
702798   5098994.0  0.0   62.8      102    0                5   

         vesselId_encoded  portId_encoded  
954763                286              95  
1101750               415             732  
1332982               579              95  
116730                492             344  
702798                173             755  


Tren modellene.

Her kan vi enkelt bytte ut regressoren og teste ulike modeller:

In [289]:
# One regressor per coordinate: latitude and longitude are predicted
# independently from the same feature columns.
#
# `random_state` is fixed because HistGradientBoostingRegressor uses randomness
# for the binning subsample and for the internal early-stopping validation
# split. Without a seed, a fresh kernel run gives slightly different models
# and scores. The seed matches the one used for the train/validation split.
model_lat = HistGradientBoostingRegressor(random_state=42)
model_lon = HistGradientBoostingRegressor(random_state=42)

model_lat.fit(x_lat_train, y_lat_train)
model_lon.fit(x_lon_train, y_lon_train)

In [290]:
# Score each coordinate model on its held-out validation half. The MSE is
# expressed in the squared units of the corresponding target column.
y_lat_pred = model_lat.predict(x_lat_val)
mse_lat = mean_squared_error(y_lat_val, y_lat_pred)

y_lon_pred = model_lon.predict(x_lon_val)
mse_lon = mean_squared_error(y_lon_val, y_lon_pred)

for label, score in (('MSE latitude:', mse_lat), ('MSE longitude:', mse_lon)):
    print(label, score)

MSE latitude: 42.80817099152942
MSE longitude: 384.68164029041407


In [291]:
# Build test-time features on the SAME scale the models were trained on.
#
# The preprocessing cell dropped the raw `vesselId` column from `train_data`
# and overwrote `time` with relative seconds, so neither the vessel label
# mapping nor each vessel's time origin can be recovered from that frame.
# Only the two key columns are therefore re-read from the training file.
train_keys = pd.read_csv('data/ais_train.csv', sep='|', usecols=['vesselId', 'time'])
train_keys['time'] = pd.to_datetime(train_keys['time'])

# Refitting on the identical training column reproduces the integer codes used
# for training. Fitting a fresh encoder on the test IDs (as before) assigns
# unrelated codes, so the model would read every test vessel as another vessel.
vessel_encoder = LabelEncoder().fit(train_keys['vesselId'])
vessel_codes = pd.Series(np.arange(len(vessel_encoder.classes_)), index=vessel_encoder.classes_)
# Vessels absent from the training file map to NaN instead of raising.
test_data["vesselId_encoded"] = test_data["vesselId"].map(vessel_codes)

# Training `time` is seconds since each vessel's first TRAINING observation, so
# test timestamps are measured from that same origin, not from the vessel's
# first test timestamp. Vessels without training history get NaN.
# NOTE: this overwrites `test_data['time']`, so the cell cannot be re-run
# without reloading the data first.
train_start = train_keys.groupby('vesselId')['time'].min()
test_data["time"] = (test_data["time"] - test_data["vesselId"].map(train_start)).dt.total_seconds()

# The left merge only brings in the remaining feature columns. De-duplicating
# the lookup keys guarantees the merge cannot multiply test rows, which keeps
# `merged_data` aligned one-to-one (and in order) with `test_data`.
# NOTE(review): test rows without an exact (vessel, time) match in the training
# frame get NaN for the merged-in features; presumably that is most of them.
train_lookup = train_data.drop_duplicates(subset=['vesselId_encoded', 'time'])
merged_data = pd.merge(
    test_data,
    train_lookup,
    on=['vesselId_encoded', 'time'],
    how='left',
    validate='many_to_one',
)
assert len(merged_data) == len(test_data), "merge changed the number of test rows"


Prediker og skriv data til CSV-fil

In [292]:
# Predict both coordinates for the prepared test rows.
x_test = merged_data[features]

y_lat_pred = model_lat.predict(x_test)
y_lon_pred = model_lon.predict(x_test)

# Take the IDs from `merged_data`, the frame the predictions were computed
# from, so every prediction is paired with the ID of its own row. Pairing with
# `test_data['ID']` breaks as soon as the upstream left merge yields more rows
# than `test_data` (a test row matching several training rows).
# If that happens, keep the first prediction per ID so the output still has
# exactly one row per test ID.
# NOTE(review): assumes the training frame has no `ID` column of its own;
# otherwise the merge would suffix the name. Confirm against the data schema.
predictions = pd.DataFrame({
    'ID': merged_data['ID'],
    'longitude_predicted': y_lon_pred,
    'latitude_predicted': y_lat_pred,
}).drop_duplicates(subset='ID', keep='first')

predictions.to_csv('predictions.csv', index=False)