# AIS Trajectory

### Importere biblioteker

In [10]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb

# split sklearn
from sklearn.model_selection import train_test_split





### Importer data

In [8]:
train_df = pd.read_csv('data/ais_train.csv', sep='|')
test_df = pd.read_csv('data/ais_test.csv')

ports_df = pd.read_csv('data/ports.csv', sep='|')  
schedule_df = pd.read_csv('data/schedules_to_may_2024.csv', sep='|')
vessels_df = pd.read_csv('data/vessels.csv', sep='|')

#vessel0 = train_df['vesselId'][0]
original_train_df = train_df.copy(deep = True)
original_test_df = test_df.copy(deep = True)

display(train_df.head())

Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:5a4ecb1154f98dadf97f0ecddfec4d1662c...
1,size 180888959


### Pre-prosessering

In [11]:
train_df = pd.read_csv('data/ais_train.csv', sep='|')
test_df = pd.read_csv('data/ais_test.csv')

# Drop columns that are not in the test data
train_df = train_df.drop(columns=['cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'portId'])

# Label encode the vesselId
le = LabelEncoder()
train_df['vesselId'] = le.fit_transform(train_df['vesselId'])
test_df['vesselId'] = le.transform(test_df['vesselId'])

# Convert timestamps to datetime
train_df['time'] = pd.to_datetime(train_df['time'])
test_df['time'] = pd.to_datetime(test_df['time'])

# Convert time to features
train_df['hour'] = train_df['time'].dt.hour
train_df['weekday'] = train_df['time'].dt.weekday
train_df['month'] = train_df['time'].dt.month
train_df['day'] = train_df['time'].dt.day

test_df['hour'] = test_df['time'].dt.hour
test_df['weekday'] = test_df['time'].dt.weekday
test_df['month'] = test_df['time'].dt.month
test_df['day'] = test_df['time'].dt.day

# Drop the time column
train_df = train_df.drop(columns=['time'])
test_df = test_df.drop(columns=['time'])

display(train_df.head())
display(test_df.head())


KeyError: "['cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'portId'] not found in axis"

### Feature engineering

### Fit model

In [67]:
features = ['vesselId', 'hour', 'weekday', 'month', 'day']
targets = ['latitude', 'longitude']

# split the data into train and validation 70% train, 30% validation with scikit-learn

x_train, x_val, y_train, y_val = train_test_split(train_df[features], train_df[targets])


model = xgb.XGBRegressor(
    n_estimators=2000,
    n_jobs=-1,
    early_stopping_rounds=50
)

model.fit(
    x_train, y_train,
    verbose=1,
    eval_set=[(x_train, y_train), (x_val, y_val)],
    )



[0]	validation_0-rmse:50.25383	validation_1-rmse:50.26208
[1]	validation_0-rmse:48.87242	validation_1-rmse:48.89542
[2]	validation_0-rmse:48.15447	validation_1-rmse:48.18309
[3]	validation_0-rmse:47.64102	validation_1-rmse:47.66622
[4]	validation_0-rmse:47.34933	validation_1-rmse:47.37862
[5]	validation_0-rmse:46.96484	validation_1-rmse:47.00014
[6]	validation_0-rmse:46.65462	validation_1-rmse:46.69728
[7]	validation_0-rmse:46.28106	validation_1-rmse:46.32667
[8]	validation_0-rmse:46.20265	validation_1-rmse:46.24759
[9]	validation_0-rmse:45.99487	validation_1-rmse:46.03990
[10]	validation_0-rmse:45.72030	validation_1-rmse:45.76383
[11]	validation_0-rmse:45.52999	validation_1-rmse:45.57871
[12]	validation_0-rmse:45.06222	validation_1-rmse:45.11776
[13]	validation_0-rmse:44.80684	validation_1-rmse:44.86357
[14]	validation_0-rmse:44.62181	validation_1-rmse:44.68003
[15]	validation_0-rmse:44.52862	validation_1-rmse:44.58765
[16]	validation_0-rmse:44.45347	validation_1-rmse:44.51198
[17]	va

### Predict

In [68]:
# Give metrics on the model
print(model.score(x_train, y_train))
print(model.score(x_val, y_val))


0.7419241666793823
0.7201257944107056


In [69]:
# Predict positions
preds = model.predict(test_df[features])

In [70]:
print(preds)
test_df['latitude_predicted'] = preds[:,0]
test_df['longitude_predicted'] = preds[:,1]


display(test_df.head())

[[  41.22424    17.658068]
 [  24.379225 -116.85002 ]
 [  38.851208   17.646578]
 ...
 [  35.527046   18.27786 ]
 [  53.916546   13.415274]
 [  24.61571   -16.42321 ]]


Unnamed: 0,ID,vesselId,scaling_factor,hour,weekday,month,day,latitude_predicted,longitude_predicted
0,0,84,0.3,0,2,5,8,41.224239,17.658068
1,1,623,0.3,0,2,5,8,24.379225,-116.850021
2,2,596,0.3,0,2,5,8,38.851208,17.646578
3,3,542,0.3,0,2,5,8,32.607624,76.815704
4,4,1,0.3,0,2,5,8,35.132519,14.372966


### Eksporter til csv

In [71]:
submission_df = test_df[['ID', 'longitude_predicted','latitude_predicted']]

submission_df.to_csv('predictions_xgb.csv', index=False)
print('Få den jævla "predictions.csv" filen inn på Kaggle og se om det funker')

Få den jævla "predictions.csv" filen inn på Kaggle og se om det funker
