# AIS Trajectory

### Importere biblioteker

In [9]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from tqdm import tqdm


### Importer data

In [10]:
train_df = pd.read_csv('data/ais_train.csv', sep='|')
test_df = pd.read_csv('data/ais_test.csv')

ports_df = pd.read_csv('data/ports.csv', sep='|')  
schedule_df = pd.read_csv('data/schedules_to_may_2024.csv', sep='|')
vessels_df = pd.read_csv('data/vessels.csv', sep='|')

vessel0 = train_df['vesselId'][0]
original_train_df = train_df.copy(deep = True)
original_test_df = test_df.copy(deep = True)

### Pre-prosessering

In [11]:
train_df = pd.read_csv('data/ais_train.csv', sep='|')
test_df = pd.read_csv('data/ais_test.csv')

ports_df = pd.read_csv('data/ports.csv', sep='|')  
schedule_df = pd.read_csv('data/schedules_to_may_2024.csv', sep='|')
vessels_df = pd.read_csv('data/vessels.csv', sep='|')

original_train_df = train_df.copy(deep = True)
original_test_df = test_df.copy(deep = True)

# Convert 'time' to datetime
train_df['time'] = pd.to_datetime(train_df['time'])
test_df['time'] = pd.to_datetime(test_df['time'])

# Ensure 'vesselId' is string
train_df['vesselId'] = train_df['vesselId'].astype(str)
test_df['vesselId'] = test_df['vesselId'].astype(str)

# Convert 'time' to numeric format
train_df['time_numeric'] = (train_df['time'] - train_df['time'].min()).dt.total_seconds()
test_df['time_numeric'] = (test_df['time'] - train_df['time'].min()).dt.total_seconds()

# Convert 'portId' to numeric format
le = LabelEncoder()
le.fit(train_df['portId'])
train_df['portId'] = le.transform(train_df['portId'])

# Convert 'vesselId' to numeric format
le_vesselId = LabelEncoder()
le_vesselId.fit(train_df['vesselId'])
train_df['vesselId'] = le_vesselId.transform(train_df['vesselId'])
test_df['vesselId'] = le_vesselId.transform(test_df['vesselId']) 

train_df['etaRaw'] = train_df['etaRaw'].dropna()

train_df['etaRaw'] = pd.to_datetime(train_df['etaRaw'], format='%m-%d %H:%M', errors='coerce')
train_df['etaRaw'] = train_df['etaRaw'].apply(lambda x: x.replace(year=2024) if pd.notnull(x) else x)

train_df['etaRaw_numeric'] = (train_df['etaRaw'] - train_df['time'].min()).dt.total_seconds()

# drop etaRaw og time
train_df = train_df.drop(columns=['etaRaw', 'time'])

# Drop Nan values
train_df = train_df.dropna()    

display(train_df.head())


Unnamed: 0,cog,sog,rot,heading,navstat,latitude,longitude,vesselId,portId,time_numeric,etaRaw_numeric
0,284.0,0.7,0,88,0,-34.7437,-57.8513,50,40,0.0,773975.0
1,109.6,0.0,-6,347,1,8.8944,-79.47939,189,674,11.0,31435175.0
2,111.0,11.0,0,112,0,39.19065,-76.47567,432,353,80.0,118775.0
3,96.4,0.0,0,142,1,-34.41189,151.02067,110,18,166.0,31607975.0
4,214.0,19.7,0,215,0,35.88379,-5.91636,356,605,206.0,2116775.0


### Feature engineering

### Fit model

In [12]:
# Fit linear regression models for each vessel
lat_models = {}
lon_models = {}

vessels = train_df['vesselId'].unique()
for vessel in vessels:
    vessel_data = train_df[train_df['vesselId'] == vessel]
    
    if len(vessel_data) < 2:
        continue
    
    X = vessel_data[['time_numeric']]
    y_lat = vessel_data['latitude']
    y_lon = vessel_data['longitude']
    
    lat_model = xgb.XGBRegressor().fit(X, y_lat)
    lon_model = xgb.XGBRegressor().fit(X, y_lon)
    
    lat_models[vessel] = lat_model
    lon_models[vessel] = lon_model

# Handle vessels not in training data
vessels_in_train = set(lat_models.keys())
vessels_in_test = set(test_df['vesselId'].unique())
vessels_not_in_train = vessels_in_test - vessels_in_train

# Global models
global_lat_model = xgb.XGBRegressor().fit(train_df[['time_numeric']], train_df['latitude'])
global_lon_model = xgb.XGBRegressor().fit(train_df[['time_numeric']], train_df['longitude'])

### Predict

In [13]:
# Predict positions with a progress bar
lat_preds = []
lon_preds = []

# Wrap the iterrows() loop with tqdm to add a progress bar
for index, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Predicting Positions"):
    vessel = row['vesselId']
    time_numeric = row['time_numeric']
    
    if vessel in lat_models:
        lat_model = lat_models[vessel]
        lon_model = lon_models[vessel]
    else:
        lat_model = global_lat_model
        lon_model = global_lon_model
    
    # Convert time_numeric to DataFrame with appropriate column name
    time_numeric_df = pd.DataFrame({'time_numeric': [time_numeric]})
    
    lat_pred = lat_model.predict(time_numeric_df)[0]
    lon_pred = lon_model.predict(time_numeric_df)[0]
    
    lat_preds.append(lat_pred)
    lon_preds.append(lon_pred)

# Store predictions in test_df
test_df['latitude_predicted'] = lat_preds
test_df['longitude_predicted'] = lon_preds




Predicting Positions: 100%|██████████| 51739/51739 [03:15<00:00, 264.22it/s]


 ID  longitude_predicted  latitude_predicted
0   0           -13.511171           41.366013
1   1           -17.383448           19.439676
2   2            -3.541152           34.733105
3   3            71.226295            5.466014
4   4            -9.340018           48.589073

### Eksporter til csv

In [16]:
test_df.to_csv('predictions.csv', index=False)
print('Få den jævla "predictions.csv" filen inn på Kaggle og se om det funker')

Få den jævla "predictions.csv" filen inn på Kaggle og se om det funker
