## Results

No of records : 588   
Ship Type : RoRo Cargo Ship Vehicle Carrier    
Training Accuracy : 0.8617  
Testing Accuracy : 0.9547   

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
df_train = pd.read_excel('Data_2024/ShipType9.xlsx')
df_test = pd.read_excel('Data_2025/ShipType9.xlsx')

pd.set_option('display.max_columns', None)
df_train.columns = df_train.columns.str.replace(' ', '_')
df_train = df_train.fillna(0)
df_train = df_train.loc[:, ~df_train.columns.duplicated()]

df_test.columns = df_test.columns.str.replace(' ', '_')
df_test = df_test.fillna(0)
df_test = df_test.loc[:, ~df_test.columns.duplicated()]

In [3]:
df_train = df_train.select_dtypes(include=['float64', 'int64'])

corr = df_train.corr()['TOTALCO2EMISSION'].sort_values(ascending=False)
corr.head(30)

TOTALCO2EMISSION        1.000000
Distance                0.926036
ME_RunningHour          0.916749
AVGSPEED                0.856661
REQUIRED_CII            0.723094
CII_REFERENCE           0.723094
FCPH                    0.662509
swell_height            0.604878
FCPD_YTD                0.588985
wind_bft                0.545365
wave_height             0.533383
AE_HFO                  0.422745
draft_fwd               0.411239
draft_aft               0.410951
draft_mid               0.396451
ME_HFO                  0.360362
wave_dir                0.275496
TOTALCO2EMISSION_YTD    0.265819
AE_Boiler_HFO           0.256270
AE_LFO                  0.211485
ME_LFO                  0.176553
FCPD                    0.165878
ME_MDO                  0.148295
wind_dir                0.145709
AE_Boiler_LFO           0.128097
ROB_HFO                 0.120207
ROB_MDO                 0.111206
AVGSPEED_YTD            0.068523
ROB_LFO                 0.066025
REQUIRED_CII_YTD        0.045551
Name: TOTA

In [4]:
df_train.rename(columns={'Distance': 'Distance',
                   'ME_RunningHour': 'SteamingTime',
                   'DeadWeight': 'Deadweight',
                   'GrossTonnage': 'GrossTonnage',
                   'ME_MDO/MGO': 'ME_MDO/MGO',
                   'ME_HFO': 'ME_HFO',
                   'ME_LFO': 'ME_LFO',
                   'AE_Boiler_MDO/MGO':'AE_Boiler_MDO/MGO',
                   'AE_Boiler_HFO':'AE_Boiler_HFO',
                   'AE_Boiler_LFO':'AE_Boiler_LFO'}, inplace=True)

X_train = df_train[['Deadweight', 'GrossTonnage',
        'Distance', 'SteamingTime',
        'ME_MDO/MGO', 'ME_HFO',
        'ME_LFO', 'AE_Boiler_MDO/MGO',
        'AE_Boiler_HFO', 'AE_Boiler_LFO']]


y_train = df_train['TOTALCO2EMISSION']

In [5]:
df_test.rename(columns={'Distance': 'Distance',
                   'ME_RunningHour': 'SteamingTime',
                   'DeadWeight': 'Deadweight',
                   'GrossTonnage': 'GrossTonnage',
                   'ME_MDO/MGO': 'ME_MDO/MGO',
                   'ME_HFO': 'ME_HFO',
                   'ME_LFO': 'ME_LFO',
                   'AE_Boiler_MDO/MGO':'AE_Boiler_MDO/MGO',
                   'AE_Boiler_HFO':'AE_Boiler_HFO',
                   'AE_Boiler_LFO':'AE_Boiler_LFO'}, inplace=True)

X_test = df_test[['Deadweight', 'GrossTonnage',
        'Distance', 'SteamingTime',
        'ME_MDO/MGO', 'ME_HFO',
        'ME_LFO', 'AE_Boiler_MDO/MGO',
        'AE_Boiler_HFO', 'AE_Boiler_LFO']]


y_test = df_test['TOTALCO2EMISSION']

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set size:", X_train.shape, y_train.shape)
print("Test set size:", X_test.shape, y_test.shape)

Training set size: (470, 10) (470,)
Test set size: (118, 10) (118,)


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import AdaBoostRegressor 
from sklearn.ensemble import GradientBoostingRegressor 
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [7]:
models = [
    ('LR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('L_SVR', LinearSVR()),
    ('DT', DecisionTreeRegressor()),
    ('ADA', AdaBoostRegressor()),
    ('RF', RandomForestRegressor()),
    ('GB', GradientBoostingRegressor()),
    ('XGB', XGBRegressor())
]

results = []

# Train and evaluate each model
for name, model in models:
    model.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Compute R² scores
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Compute MSE
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    # Compute RMSE
    train_rmse = np.sqrt(train_mse)
    test_rmse = np.sqrt(test_mse)

    # Append results as a new row
    results.append([name, train_r2, test_r2, train_mse, train_rmse, test_mse, test_rmse])

# Create DataFrame for better visualization
results_df = pd.DataFrame(results, columns=['Model', 'Train R² Score', 'Test R² Score', 'Train MSE', 'Train RMSE', 'Test MSE', 'Test RMSE'])

# Print the results
print(results_df)



   Model  Train R² Score  Test R² Score   Train MSE  Train RMSE    Test MSE  \
0     LR        0.861746       0.954776  251.391993   15.855346   68.357487   
1  Ridge        0.861737       0.954721  251.409520   15.855899   68.441159   
2  Lasso        0.858307       0.952510  257.646716   16.051377   71.783300   
3  L_SVR        0.832117       0.910572  305.267571   17.471908  135.173613   
4     DT        0.980790       0.771861   34.930026    5.910163  344.838921   
5    ADA        0.880196       0.926960  217.844645   14.759561  110.402775   
6     RF        0.963587       0.889328   66.210429    8.136979  167.283985   
7     GB        0.946608       0.912942   97.084093    9.853126  131.590509   
8    XGB        0.980042       0.842245   36.289831    6.024104  238.451833   

   Test RMSE  
0   8.267859  
1   8.272917  
2   8.472503  
3  11.626419  
4  18.569839  
5  10.507272  
6  12.933831  
7  11.471291  
8  15.441886  


In [9]:
import optuna

def objective(trial):
    alpha = trial.suggest_float('alpha', 1e-5, 100.0, log=True)

    model = Ridge(alpha=alpha)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)  # RMSE manually

    return rmse

# Run Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Best result
print("✅ Best Hyperparameters:", study.best_params)
print("🏆 Best RMSE:", study.best_value)

[I 2025-04-16 14:59:10,042] A new study created in memory with name: no-name-5471c632-3822-48b2-b96d-253d869409f6
[I 2025-04-16 14:59:10,062] Trial 0 finished with value: 8.316763874535063 and parameters: {'alpha': 12.089371897062923}. Best is trial 0 with value: 8.316763874535063.
[I 2025-04-16 14:59:10,065] Trial 1 finished with value: 8.267859017857116 and parameters: {'alpha': 6.441247500440296e-05}. Best is trial 1 with value: 8.267859017857116.
[I 2025-04-16 14:59:10,074] Trial 2 finished with value: 8.26804340022604 and parameters: {'alpha': 0.03532046680002032}. Best is trial 1 with value: 8.267859017857116.
[I 2025-04-16 14:59:10,081] Trial 3 finished with value: 8.26785933820112 and parameters: {'alpha': 0.000125580967270651}. Best is trial 1 with value: 8.267859017857116.
[I 2025-04-16 14:59:10,085] Trial 4 finished with value: 8.26793751690904 and parameters: {'alpha': 0.015062378036164309}. Best is trial 1 with value: 8.267859017857116.
[I 2025-04-16 14:59:10,095] Trial 5 

✅ Best Hyperparameters: {'alpha': 1.0051003040153852e-05}
🏆 Best RMSE: 8.267858733160697


In [10]:
best_alpha = 1.0051003040153852e-05
best_ridge = Ridge(alpha=best_alpha)
best_ridge.fit(X_train, y_train)

In [11]:
train_preds = best_ridge.predict(X_train)
test_preds = best_ridge.predict(X_test)

print("\n📊 Final Model Evaluation:")
print("Train R² Score:", r2_score(y_train, train_preds))
print("Test R² Score:", r2_score(y_test, test_preds))
print("Train RMSE:", np.sqrt(mean_squared_error(y_train, train_preds)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_preds)))


📊 Final Model Evaluation:
Train R² Score: 0.8617464570465224
Test R² Score: 0.9547760500683877
Train RMSE: 15.85534588508788
Test RMSE: 8.267858733160697


In [12]:
import joblib

joblib.dump(best_ridge, 'Models/ship_model_9.joblib')

['Models/ship_model_9.joblib']