## Results

No of records : 8571   
Ship Type : Container Ship    
Training Accuracy : 0.9655                    
Testing Accuracy: 0.9632     

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
df_train = pd.read_excel('Data_2024/ShipType4.xlsx')
df_test = pd.read_excel('Data_2025/ShipType4.xlsx')

pd.set_option('display.max_columns', None)
df_train.columns = df_train.columns.str.replace(' ', '_')
df_train = df_train.fillna(0)
df_train = df_train.loc[:, ~df_train.columns.duplicated()]

df_test.columns = df_test.columns.str.replace(' ', '_')
df_test = df_test.fillna(0)
df_test = df_test.loc[:, ~df_test.columns.duplicated()]

In [3]:
df_train = df_train.select_dtypes(include=['float64', 'int64'])

corr = df_train.corr()['TOTALCO2EMISSION'].sort_values(ascending=False)
corr.head(30)

TOTALCO2EMISSION              1.000000
Distance                      0.964527
ME_RunningHour                0.902974
AE_HFO                        0.794072
FCPH                          0.784834
AVGSPEED                      0.772352
ME_HFO                        0.537782
swell_height                  0.498566
FCPH_YTD                      0.493087
wave_height                   0.489075
AVGSPEED_YTD                  0.484494
FLAGSTATE                     0.475366
FCPD_YTD                      0.467817
IMONUMBER                     0.456063
GrossTonnage                  0.455850
DeadWeight                    0.454322
AE_Boiler_HFO                 0.397118
wind_bft                      0.384400
ROB_HFO                       0.353806
CLASSID                       0.335785
ISIMODCS_SEEM2UPDATE          0.324568
Displacement                  0.283642
TOTALCO2EMISSION_YTD          0.279406
ISIMODCS_CLASS_SEEM2UPDATE    0.239019
ISIMODCS_SEEM3UPDATE          0.202565
CII_REFERENCE            

In [4]:
df_train.rename(columns={'Distance': 'Distance',
                   'ME_RunningHour': 'SteamingTime',
                   'DeadWeight': 'Deadweight',
                   'GrossTonnage': 'GrossTonnage',
                   'ME_MDO/MGO': 'ME_MDO/MGO',
                   'ME_HFO': 'ME_HFO',
                   'ME_LFO': 'ME_LFO',
                   'AE_Boiler_MDO/MGO':'AE_Boiler_MDO/MGO',
                   'AE_Boiler_HFO':'AE_Boiler_HFO',
                   'AE_Boiler_LFO':'AE_Boiler_LFO'}, inplace=True)

X_train = df_train[['Deadweight', 'GrossTonnage',
        'Distance', 'SteamingTime',
        'ME_MDO/MGO', 'ME_HFO',
        'ME_LFO', 'AE_Boiler_MDO/MGO',
        'AE_Boiler_HFO', 'AE_Boiler_LFO']]


y_train = df_train['TOTALCO2EMISSION']

In [5]:
df_test.rename(columns={'Distance': 'Distance',
                   'ME_RunningHour': 'SteamingTime',
                   'DeadWeight': 'Deadweight',
                   'GrossTonnage': 'GrossTonnage',
                   'ME_MDO/MGO': 'ME_MDO/MGO',
                   'ME_HFO': 'ME_HFO',
                   'ME_LFO': 'ME_LFO',
                   'AE_Boiler_MDO/MGO':'AE_Boiler_MDO/MGO',
                   'AE_Boiler_HFO':'AE_Boiler_HFO',
                   'AE_Boiler_LFO':'AE_Boiler_LFO'}, inplace=True)

X_test = df_test[['Deadweight', 'GrossTonnage',
        'Distance', 'SteamingTime',
        'ME_MDO/MGO', 'ME_HFO',
        'ME_LFO', 'AE_Boiler_MDO/MGO',
        'AE_Boiler_HFO', 'AE_Boiler_LFO']]


y_test = df_test['TOTALCO2EMISSION']

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set size:", X_train.shape, y_train.shape)
print("Test set size:", X_test.shape, y_test.shape)

Training set size: (6856, 10) (6856,)
Test set size: (1715, 10) (1715,)


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import AdaBoostRegressor 
from sklearn.ensemble import GradientBoostingRegressor 
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [7]:
models = [
    ('LR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('L_SVR', LinearSVR()),
    ('DT', DecisionTreeRegressor()),
    ('ADA', AdaBoostRegressor()),
    ('RF', RandomForestRegressor()),
    ('GB', GradientBoostingRegressor()),
    ('XGB', XGBRegressor())
]

results = []

# Train and evaluate each model
for name, model in models:
    model.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Compute R² scores
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Compute MSE
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    # Compute RMSE
    train_rmse = np.sqrt(train_mse)
    test_rmse = np.sqrt(test_mse)

    # Append results as a new row
    results.append([name, train_r2, test_r2, train_mse, train_rmse, test_mse, test_rmse])

# Create DataFrame for better visualization
results_df = pd.DataFrame(results, columns=['Model', 'Train R² Score', 'Test R² Score', 'Train MSE', 'Train RMSE', 'Test MSE', 'Test RMSE'])

# Print the results
print(results_df)

  model = cd_fast.enet_coordinate_descent(


   Model  Train R² Score  Test R² Score   Train MSE  Train RMSE    Test MSE  \
0     LR        0.947536       0.953915  194.981987   13.963595  166.422124   
1  Ridge        0.947536       0.953905  194.982137   13.963600  166.457152   
2  Lasso        0.945555       0.949680  202.346504   14.224855  181.714192   
3  L_SVR        0.869981       0.882573  483.214984   21.982151  424.048876   
4     DT        0.989832       0.937218   37.788627    6.147245  226.717526   
5    ADA        0.948112       0.944892  192.843620   13.886815  199.002582   
6     RF        0.986163       0.953030   51.425505    7.171158  169.614987   
7     GB        0.965256       0.963166  129.127457   11.363426  133.014218   
8    XGB        0.979079       0.949901   77.751330    8.817671  180.915564   

   Test RMSE  
0  12.900470  
1  12.901827  
2  13.480141  
3  20.592447  
4  15.057142  
5  14.106827  
6  13.023632  
7  11.533179  
8  13.450486  


In [8]:
import optuna

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
    }

    # Train Gradient Boosting model
    model = GradientBoostingRegressor(**params)
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    
    return r2  # Maximizing R² score

# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)  # Run for 50 trials

# Print best parameters
print("Best Parameters:", study.best_params)

[I 2025-04-16 14:30:08,974] A new study created in memory with name: no-name-92b5e3b3-0239-47c2-99fc-1762c880f4f3
[I 2025-04-16 14:30:10,583] Trial 0 finished with value: 0.9611142451112497 and parameters: {'n_estimators': 234, 'learning_rate': 0.07242003450661359, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 10, 'subsample': 0.5351986105664861}. Best is trial 0 with value: 0.9611142451112497.
[I 2025-04-16 14:30:11,302] Trial 1 finished with value: 0.9610206308694734 and parameters: {'n_estimators': 69, 'learning_rate': 0.04243722536026805, 'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 10, 'subsample': 0.530314859019585}. Best is trial 0 with value: 0.9611142451112497.
[I 2025-04-16 14:30:19,422] Trial 2 finished with value: 0.9416558093495099 and parameters: {'n_estimators': 477, 'learning_rate': 0.2496417861925946, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 4, 'subsample': 0.9618350232451118}. Best is trial 0 with value: 0.96111424511124

Best Parameters: {'n_estimators': 403, 'learning_rate': 0.03166404838497485, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 7, 'subsample': 0.7760539088548032}


In [9]:
gbr2 = GradientBoostingRegressor(
    n_estimators=403,
    learning_rate=0.03166404838497485,
    max_depth=3,
    min_samples_split=7,
    min_samples_leaf=7,
    subsample=0.7760539088548032
)

gbr2.fit(X_train, y_train)

In [10]:
y_train_predict = gbr2.predict(X_train)
y_test_predict = gbr2.predict(X_test)

training_r2 = r2_score(y_train, y_train_predict)
testing_r2 = r2_score(y_test, y_test_predict)

training_mse = mean_squared_error(y_train, y_train_predict)
testing_mse = mean_squared_error(y_test, y_test_predict)

training_rmse = np.sqrt(training_mse)
testing_rmse = np.sqrt(testing_mse)

print(f"Train R² Score: {training_r2:.4f}")
print(f"Test R² Score: {testing_r2:.4f}")
print(f"Train MSE: {training_mse:.4f}")
print(f"Test MSE: {testing_mse:.4f}")
print(f"Train RMSE: {training_rmse:.4f}")
print(f"Test RMSE: {testing_rmse:.4f}")

Train R² Score: 0.9655
Test R² Score: 0.9632
Train MSE: 128.3564
Test MSE: 132.9396
Train RMSE: 11.3294
Test RMSE: 11.5299


In [11]:
import joblib

joblib.dump(gbr2, 'Models/ship_model_4.joblib')

['Models/ship_model_4.joblib']