# AIS Trajectory

### Importere biblioteker

In [24]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from tqdm import tqdm


### Importer data

In [43]:
train_df = pd.read_csv('data/ais_train.csv', sep='|')
test_df = pd.read_csv('data/ais_test.csv')

ports_df = pd.read_csv('data/ports.csv', sep='|')  
schedule_df = pd.read_csv('data/schedules_to_may_2024.csv', sep='|')
vessels_df = pd.read_csv('data/vessels.csv', sep='|')

vessel0 = train_df['vesselId'][0]
original_train_df = train_df.copy(deep = True)
original_test_df = test_df.copy(deep = True)

### Pre-prosessering

In [69]:
train_df = pd.read_csv('data/ais_train.csv', sep='|')
test_df = pd.read_csv('data/ais_test.csv')

ports_df = pd.read_csv('data/ports.csv', sep='|')  
schedule_df = pd.read_csv('data/schedules_to_may_2024.csv', sep='|')
vessels_df = pd.read_csv('data/vessels.csv', sep='|')

original_train_df = train_df.copy(deep = True)
original_test_df = test_df.copy(deep = True)

# Convert 'time' to datetime
train_df['time'] = pd.to_datetime(train_df['time'])
test_df['time'] = pd.to_datetime(test_df['time'])

# Ensure 'vesselId' is string
train_df['vesselId'] = train_df['vesselId'].astype(str)
test_df['vesselId'] = test_df['vesselId'].astype(str)

# Convert 'time' to numeric format
train_df['time_numeric'] = (train_df['time'] - train_df['time'].min()).dt.total_seconds()
test_df['time_numeric'] = (test_df['time'] - train_df['time'].min()).dt.total_seconds()

# Convert 'portId' to numeric format
le = LabelEncoder()
le.fit(train_df['portId'])
train_df['portId'] = le.transform(train_df['portId'])

# Convert 'vesselId' to numeric format
le_vesselId = LabelEncoder()
le_vesselId.fit(vessels_df['vesselId'])
train_df['vesselId'] = le_vesselId.transform(train_df['vesselId'])
test_df['vesselId'] = le_vesselId.transform(test_df['vesselId']) 
vessels_df['vesselId'] = le_vesselId.transform(vessels_df['vesselId'])

#Convert shippingline ID to numeric format
le_shippingLineId = LabelEncoder()
le_shippingLineId.fit(vessels_df['shippingLineId'])
vessels_df['shippingLineId'] = le_shippingLineId.transform(vessels_df['shippingLineId'])

#Convert homeport to numeric format
le_homePort = LabelEncoder()
le_homePort.fit(vessels_df['homePort'])
vessels_df['homePort'] = le_homePort.transform(vessels_df['homePort'])


#Add etaRaw to train_df
train_df['etaRaw'] = train_df['etaRaw'].dropna()
train_df['etaRaw'] = pd.to_datetime(train_df['etaRaw'], format='%m-%d %H:%M', errors='coerce')
train_df['etaRaw'] = train_df['etaRaw'].apply(lambda x: x.replace(year=2024) if pd.notnull(x) else x)
train_df['etaRaw_numeric'] = (train_df['etaRaw'] - train_df['time'].min()).dt.total_seconds()

# drop etaRaw og time
train_df = train_df.drop(columns=['etaRaw', 'time'])

# Drop Nan values
train_df = train_df.dropna() 


display(train_df.head())


Unnamed: 0,cog,sog,rot,heading,navstat,latitude,longitude,vesselId,portId,time_numeric,etaRaw_numeric
0,284.0,0.7,0,88,0,-34.7437,-57.8513,51,40,0.0,773975.0
1,109.6,0.0,-6,347,1,8.8944,-79.47939,198,674,11.0,31435175.0
2,111.0,11.0,0,112,0,39.19065,-76.47567,450,353,80.0,118775.0
3,96.4,0.0,0,142,1,-34.41189,151.02067,114,18,166.0,31607975.0
4,214.0,19.7,0,215,0,35.88379,-5.91636,370,605,206.0,2116775.0


In [70]:
#Filling in missing values in each column with the median of the column
for column in vessels_df.select_dtypes(include=['float64', 'int64']).columns:
    median_value = vessels_df[column].median()
    vessels_df[column].fillna(median_value, inplace=True)


vessels_df.to_csv('cleaned_vessels_df.csv', index=False)

new_test_df = pd.merge(test_df, vessels_df, on='vesselId', how='left')

#display(new_test_df.head())

### Feature engineering

### Fit model

In [72]:
import pandas as pd
from tqdm import tqdm
import xgboost as xgb

# Fit linear regression models for each vessel
lat_models = {}
lon_models = {}

# Get unique vessel IDs from the training DataFrame
vessels = train_df['vesselId'].unique()

for vessel in tqdm(vessels, desc="Fitting models for vessels"):
    # Filter the training data for the current vessel and merge with vessel information
    vessel_data = pd.merge(train_df[train_df['vesselId'] == vessel][['time_numeric', 'vesselId', 'latitude','longitude']], vessels_df, how='left', on='vesselId')
    if len(vessel_data) < 2:
        continue  # Skip if there's not enough data for fitting
    
    X = vessel_data.drop(['latitude', 'longitude'], axis=1)
    y_lat = vessel_data['latitude']
    y_lon = vessel_data['longitude']
    
    # Fit the models for latitude and longitude
    lat_model = xgb.XGBRegressor().fit(X, y_lat)
    lon_model = xgb.XGBRegressor().fit(X, y_lon)
    
    # Store the models for future predictions
    lat_models[vessel] = lat_model
    lon_models[vessel] = lon_model




Fitting models for vessels: 100%|██████████| 688/688 [02:24<00:00,  4.77it/s]


In [29]:
# Markus sin
lat_pred = []
long_pred = []
predictions = []
last_position = {}

for index, row in test_df.iterrows():
    vessel = row['vesselId']
    time_numeric = row['time_numeric']
    if(vessel in last_position):
        sog,lat,long = last_position[vessel]
    else: 
        long = np.array(train_df[train_df['vesselId']  == vessel]['longitude'])[-1]
        lat = np.array(train_df[train_df['vesselId']  == vessel]['latitude'])[-1]
        sog = np.array(train_df[train_df['vesselId']  == vessel]['sog'])[-1]

    X_data = {'sog': sog, 'latitude':lat, 'longitude':long, 'time_numeric':time_numeric}
    X = pd.DataFrame(data = X_data, index = [0])
    display(X.head())

    lat_model = lat_models[vessel]
    lon_model = lon_models[vessel]
    lat_prediction = lat_model.predict(X)
    lon_prediction = lon_model.predict(X)
    

Unnamed: 0,sog,latitude,longitude,time_numeric
0,0.0,31.14647,-81.49789,11059371.0


ValueError: feature_names mismatch: ['time_numeric'] ['sog', 'latitude', 'longitude', 'time_numeric']
training data did not have the following fields: sog, latitude, longitude

### Predict

In [54]:
# Predict positions with a progress bar
lat_preds = []
lon_preds = []

# Wrap the iterrows() loop with tqdm to add a progress bar
for index, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Predicting Positions"):
    vessel = row['vesselId']
    time_numeric = row['time_numeric']
    
    lat_model = lat_models[vessel]
    lon_model = lon_models[vessel]
    
    # Convert time_numeric to DataFrame with appropriate column name
    time_numeric_df = pd.DataFrame({'time_numeric': [time_numeric]})
    
    lat_pred = lat_model.predict(time_numeric_df)[0]
    lon_pred = lon_model.predict(time_numeric_df)[0]
    
    lat_preds.append(lat_pred)
    lon_preds.append(lon_pred)

# Store predictions in test_df
test_df['latitude_predicted'] = lat_preds
test_df['longitude_predicted'] = lon_preds




Predicting Positions:   0%|          | 0/51739 [00:00<?, ?it/s]


ValueError: feature_names mismatch: ['cog', 'sog', 'rot', 'heading', 'navstat', 'latitude', 'longitude', 'vesselId', 'portId', 'time_numeric', 'etaRaw_numeric'] ['time_numeric']
expected navstat, etaRaw_numeric, latitude, rot, heading, portId, longitude, sog, vesselId, cog in input data

 ID  longitude_predicted  latitude_predicted
0   0           -13.511171           41.366013
1   1           -17.383448           19.439676
2   2            -3.541152           34.733105
3   3            71.226295            5.466014
4   4            -9.340018           48.589073

### Eksporter til csv

In [7]:
submission = test_df[['ID', 'longitude_predicted', 'latitude_predicted']].copy()
submission.to_csv('predictions.csv', index=False)
print('Få den jævla "predictions.csv" filen inn på Kaggle og se om det funker')

Få den jævla "predictions.csv" filen inn på Kaggle og se om det funker
