# AIS Trajectory

### Importer biblioteker

In [13]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import tensorflow_decision_forests as tfdf


ModuleNotFoundError: No module named 'tensorflow_decision_forests'

### Importer data

In [2]:
# AIS observations: the training file is pipe-separated, the test file is
# read with read_csv's default separator.
train_df = pd.read_csv('data/ais_train.csv', sep='|')
test_df = pd.read_csv('data/ais_test.csv')

# Auxiliary tables (all pipe-separated). The port coordinates are renamed at
# load time so they do not collide with the vessel 'longitude'/'latitude'
# columns of the AIS data when the tables are merged.
ports_df = pd.read_csv('data/ports.csv', sep='|').rename(
    columns={'longitude': 'portLongitude', 'latitude': 'portLatitude'}
)
schedule_df = pd.read_csv('data/schedules_to_may_2024.csv', sep='|')
vessels_df = pd.read_csv('data/vessels.csv', sep='|')

# Attach the port attributes to every training row; the left join keeps rows
# whose portId has no match in ports_df.
port_columns = [
    "portId", "name", "portLocation", "portLongitude", "portLatitude",
    "UN_LOCODE", "countryName", "ISO",
]
train_df = pd.merge(train_df, ports_df[port_columns], on='portId', how='left')

display(train_df.head())


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,name,portLocation,portLongitude,portLatitude,UN_LOCODE,countryName,ISO
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,Puerto San Antonio,San Antonio,-71.618889,-33.5875,CLSAI,Chile,CL
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,"Panamá, Ciudad de","Panamá, Ciudad de",-79.533,8.967,PAPTY,Panama,PA
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,Port of Baltimore,Baltimore,-76.558889,39.2325,USBAL,United States,US
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,Port of Port Kembla,Port Kembla,150.899444,-34.4625,AUPKL,Australia,AU
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,Tangier,Tangier,-5.817,35.783,MATNG,Morocco,MA


### Preprosessering

In [3]:
# Apply the same type normalisation to both frames: parse 'time' into
# datetimes and force 'vesselId' to str.
for frame in (train_df, test_df):
    frame['time'] = pd.to_datetime(frame['time'])
    frame['vesselId'] = frame['vesselId'].astype(str)

# Numeric time feature: seconds elapsed since the earliest training timestamp.
# The training minimum is used as the origin for BOTH frames so the feature is
# on the same scale at fit and predict time.
time_origin = train_df['time'].min()
train_df['time_numeric'] = (train_df['time'] - time_origin).dt.total_seconds()
test_df['time_numeric'] = (test_df['time'] - time_origin).dt.total_seconds()


In [4]:
# Earliest observed position of every vessel in the training data.
#
# The rows are sorted by time explicitly instead of relying on file order, and
# drop_duplicates keeps the whole earliest row per vessel (groupby().first()
# would take the first non-null value of each column independently, which can
# mix values from different rows).
initial_positions = (
    train_df
    .sort_values('time', kind='stable')
    .drop_duplicates(subset='vesselId', keep='first')
    .loc[:, ['vesselId', 'time', 'latitude', 'longitude']]
    .rename(columns={
        'time': 'initial_time',
        'latitude': 'initial_latitude',
        'longitude': 'initial_longitude',
    })
    .sort_values('vesselId')
    .reset_index(drop=True)
)

# Drop any initial_* columns left over from a previous run of this cell before
# merging, so re-running the cell does not create '_x'/'_y' suffixed duplicates.
# Vessels that never appear in train_df get NaN/NaT from the left join.
initial_columns = ['initial_time', 'initial_latitude', 'initial_longitude']
test_df = (
    test_df
    .drop(columns=initial_columns, errors='ignore')
    .merge(initial_positions, on='vesselId', how='left')
)

### Feature engineering

### Fit model

In [10]:
# Fit one pair of XGBoost regressors (latitude, longitude) per vessel, using
# the elapsed-time feature 'time_numeric' as the only input.
# NOTE(review): tree ensembles cannot extrapolate; for test timestamps outside
# a vessel's training time range the prediction is expected to be flat —
# confirm this is the intended baseline.
lat_models = {}
lon_models = {}

vessels = train_df['vesselId'].unique()

# A single groupby pass slices the frame once instead of re-scanning all rows
# with a boolean mask for every vessel. sort=False yields the vessels in order
# of first appearance, i.e. the same order as `vessels` above.
for vessel, vessel_data in train_df.groupby('vesselId', sort=False):
    # Vessels with fewer than two observations get no model of their own and
    # fall back to the global models at prediction time.
    if len(vessel_data) < 2:
        continue

    X = vessel_data[['time_numeric']]
    lat_models[vessel] = xgb.XGBRegressor().fit(X, vessel_data['latitude'])
    lon_models[vessel] = xgb.XGBRegressor().fit(X, vessel_data['longitude'])

# Test vessels that have no per-vessel model (unseen in training, or skipped
# above because they had too few observations)
vessels_in_train = set(lat_models.keys())
vessels_in_test = set(test_df['vesselId'].unique())
vessels_not_in_train = vessels_in_test - vessels_in_train

# Global fallback models fitted on all training rows
global_lat_model = xgb.XGBRegressor().fit(train_df[['time_numeric']], train_df['latitude'])
global_lon_model = xgb.XGBRegressor().fit(train_df[['time_numeric']], train_df['longitude'])


### Predict

In [6]:
# Predict positions for every test row.
#
# All rows of a vessel are predicted in one batched call instead of building a
# one-row DataFrame and calling predict() twice per row inside iterrows(); the
# predicted values are the same, only the per-row overhead is removed.
# float32 matches the dtype returned by the XGBoost models. The arrays start
# as NaN so that any row not covered by a group is visible instead of holding
# uninitialised memory.
lat_preds = np.full(len(test_df), np.nan, dtype=np.float32)
lon_preds = np.full(len(test_df), np.nan, dtype=np.float32)

# GroupBy.indices maps each vesselId to the integer *positions* of its rows,
# so the results can be written back without relying on the index labels.
for vessel, positions in test_df.groupby('vesselId', sort=False).indices.items():
    # Use the vessel's own models when available, otherwise the global ones.
    lat_model = lat_models.get(vessel, global_lat_model)
    lon_model = lon_models.get(vessel, global_lon_model)

    # Keep the feature as a DataFrame with the column name used at fit time.
    vessel_times = test_df.iloc[positions][['time_numeric']]

    lat_preds[positions] = lat_model.predict(vessel_times)
    lon_preds[positions] = lon_model.predict(vessel_times)

test_df['latitude_predicted'] = lat_preds
test_df['longitude_predicted'] = lon_preds

### Eksporter til csv

In [7]:
# Keep only the identifier and the two predicted coordinates, then write them
# to 'predictions.csv' without the DataFrame index.
submission_columns = ['ID', 'longitude_predicted', 'latitude_predicted']
submission = test_df.loc[:, submission_columns].copy()
submission.to_csv('predictions.csv', index=False)
print('Få den jævla "predictions.csv" filen inn på Kaggle og se om det funker')

Få den jævla "predictions.csv" filen inn på Kaggle og se om det funker
