In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from preprocess_and_feature_engineering import feature_engineering, preprocess 

In [3]:
# Read data sets
X_train = pd.read_csv('ais_train.csv', sep='|')
X_test = pd.read_csv('ais_test.csv')

In [4]:
X_train,X_test = preprocess(X_train,X_test)
X_train,X_test = feature_engineering(X_train,X_test)

In [5]:
X_train.head()

Unnamed: 0,cog,sog,rot,heading,latitude_lag1,longitude_lag1,navstat,latitude,longitude,vesselId,...,etaMonth,etaDay,etaHour,etaMinute,timeYear,timeMonth,timeDay,timeHour,timeMinute,timeSecond
0,284.0,0.7,0,88,,,0,-34.7437,-57.8513,0,...,1,9,23,0,24,1,1,0,0,25
1,109.6,0.0,-6,347,-34.7437,-57.8513,1,8.8944,-79.47939,1,...,12,29,20,0,24,1,1,0,0,36
2,111.0,11.0,0,112,8.8944,-79.47939,0,39.19065,-76.47567,2,...,1,2,9,0,24,1,1,0,1,45
3,96.4,0.0,0,142,39.19065,-76.47567,1,-34.41189,151.02067,3,...,12,31,20,0,24,1,1,0,3,11
4,214.0,19.7,0,215,-34.41189,151.02067,0,35.88379,-5.91636,4,...,1,25,12,0,24,1,1,0,3,51


In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1521648 entries, 0 to 1522064
Data columns (total 23 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   cog             1521648 non-null  float64
 1   sog             1521648 non-null  float64
 2   rot             1521648 non-null  int64  
 3   heading         1521648 non-null  int64  
 4   latitude_lag1   1521647 non-null  float64
 5   longitude_lag1  1521647 non-null  float64
 6   navstat         1521648 non-null  int64  
 7   latitude        1521648 non-null  float64
 8   longitude       1521648 non-null  float64
 9   vesselId        1521648 non-null  int64  
 10  portId          1521648 non-null  int64  
 11  not_under_way   1521648 non-null  int64  
 12  under_way       1521648 non-null  int64  
 13  etaMonth        1521648 non-null  int32  
 14  etaDay          1521648 non-null  int32  
 15  etaHour         1521648 non-null  int32  
 16  etaMinute       1521648 non-null  int32  

In [7]:
X_test.head()

Unnamed: 0,ID,vesselId,scaling_factor,timeYear,timeMonth,timeDay,timeHour,timeMinute,timeSecond
0,0,412,0.3,24,5,8,0,3,16
1,1,373,0.3,24,5,8,0,6,17
2,2,181,0.3,24,5,8,0,10,2
3,3,8,0.3,24,5,8,0,10,34
4,4,65,0.3,24,5,8,0,12,27


In [8]:
# Create the trianing inputs
y_train = X_train[['longitude', 'latitude']]
X_train.drop(columns=['longitude', 'latitude'],inplace=True)

In [9]:
# Initialize the model
#rf = RandomForestRegressor(n=-1)

# Train the model
#rf.fit(X_train, y_train)

In [10]:
# Extract the scaling factor from the test data and store it separately
scaling_factor = X_test['scaling_factor'].values  # Extract scaling_factor before prediction

# Identify the columns in X_train that are missing in test
missing_cols = set(X_train.columns) - set(X_test.columns)

# Add missing columns to test, with default values of 0 (or NaN if appropriate)
for col in missing_cols:
    X_test[col] = 0  # Or use NaN if that makes more sense

# Remove extra columns from test that aren't in X_train
extra_cols = set(X_test.columns) - set(X_train.columns)
X_test.drop(columns=extra_cols, inplace=True)

# Now reorder test columns to match the order of X_train columns
X_test = X_test[X_train.columns]  # Rearrange columns in test to match X

# Verify the column order
print("Test columns (after reordering):", X_test.columns)
print("X columns:", X_train.columns)

print(X_test.head())

latitude_lag1
etaMonth
longitude_lag1
rot
etaMinute
not_under_way
heading
portId
navstat
under_way
cog
etaDay
etaHour
sog
Test columns (after reordering): Index(['cog', 'sog', 'rot', 'heading', 'latitude_lag1', 'longitude_lag1',
       'navstat', 'vesselId', 'portId', 'not_under_way', 'under_way',
       'etaMonth', 'etaDay', 'etaHour', 'etaMinute', 'timeYear', 'timeMonth',
       'timeDay', 'timeHour', 'timeMinute', 'timeSecond'],
      dtype='object')
X columns: Index(['cog', 'sog', 'rot', 'heading', 'latitude_lag1', 'longitude_lag1',
       'navstat', 'vesselId', 'portId', 'not_under_way', 'under_way',
       'etaMonth', 'etaDay', 'etaHour', 'etaMinute', 'timeYear', 'timeMonth',
       'timeDay', 'timeHour', 'timeMinute', 'timeSecond'],
      dtype='object')
   cog  sog  rot  heading  latitude_lag1  longitude_lag1  navstat  vesselId  \
0    0    0    0        0              0               0        0       412   
1    0    0    0        0              0               0        0     

In [11]:
# Predict using the Random Forest model (no scaling factor in test anymore)
predictions = rf.predict(X_test)

# Apply the scaling factor to the predictions
predictions_scaled = predictions * scaling_factor[:, None]  # Apply scaling factor to each prediction

# Create a DataFrame for the predictions
predictions_df = pd.DataFrame(predictions_scaled, columns=['latitude_predicted', 'longitude_predicted'])
predictions_df['ID'] = X_test.index  # Assuming ID is the index or you can use test['ID']

# Save the predictions to a CSV file
predictions_df.to_csv('predictions.csv', index=False, columns=['ID', 'longitude_predicted', 'latitude_predicted'])

NameError: name 'rf' is not defined