## Results

No of records : 3115   
Ship Type : Refrigerated Cargo Carrier   
Training Accuracy : 0.9485      
Testing Accuracy : 0.9625     

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
df_train = pd.read_excel('Data_2024/ShipType6.xlsx')
df_test = pd.read_excel('Data_2025/ShipType6.xlsx')

pd.set_option('display.max_columns', None)
df_train.columns = df_train.columns.str.replace(' ', '_')
df_train = df_train.fillna(0)
df_train = df_train.loc[:, ~df_train.columns.duplicated()]

df_test.columns = df_test.columns.str.replace(' ', '_')
df_test = df_test.fillna(0)
df_test = df_test.loc[:, ~df_test.columns.duplicated()]

In [3]:
df_train = df_train.select_dtypes(include=['float64', 'int64'])

corr = df_train.corr()['TOTALCO2EMISSION'].sort_values(ascending=False)
corr.head(30)

TOTALCO2EMISSION                  1.000000
Distance                          0.955868
ME_RunningHour                    0.951578
AVGSPEED                          0.773904
FCPH                              0.764255
wind_bft                          0.546792
REQUIRED_CII                      0.538067
CII_REFERENCE                     0.538061
swell_height                      0.527119
wave_height                       0.521021
FCPD_YTD                          0.491279
FCPD                              0.380936
AE_HFO                            0.375137
AE_LFO                            0.328311
ME_MDO                            0.319471
ME_HFO                            0.301974
ME_LFO                            0.224712
AE_MDO                            0.167238
AE_MGO/MDO                        0.167238
draft_fwd                         0.130021
wave_dir                          0.128481
ME_MDO/MGO                        0.117650
draft_aft                         0.105827
draft_mid  

In [4]:
df_train.rename(columns={'Distance': 'Distance',
                   'ME_RunningHour': 'SteamingTime',
                   'DeadWeight': 'Deadweight',
                   'GrossTonnage': 'GrossTonnage',
                   'ME_MDO/MGO': 'ME_MDO/MGO',
                   'ME_HFO': 'ME_HFO',
                   'ME_LFO': 'ME_LFO',
                   'AE_Boiler_MDO/MGO':'AE_Boiler_MDO/MGO',
                   'AE_Boiler_HFO':'AE_Boiler_HFO',
                   'AE_Boiler_LFO':'AE_Boiler_LFO'}, inplace=True)

X_train = df_train[['Deadweight', 'GrossTonnage',
        'Distance', 'SteamingTime',
        'ME_MDO/MGO', 'ME_HFO',
        'ME_LFO', 'AE_Boiler_MDO/MGO',
        'AE_Boiler_HFO', 'AE_Boiler_LFO']]


y_train = df_train['TOTALCO2EMISSION']

In [5]:
df_test.rename(columns={'Distance': 'Distance',
                   'ME_RunningHour': 'SteamingTime',
                   'DeadWeight': 'Deadweight',
                   'GrossTonnage': 'GrossTonnage',
                   'ME_MDO/MGO': 'ME_MDO/MGO',
                   'ME_HFO': 'ME_HFO',
                   'ME_LFO': 'ME_LFO',
                   'AE_Boiler_MDO/MGO':'AE_Boiler_MDO/MGO',
                   'AE_Boiler_HFO':'AE_Boiler_HFO',
                   'AE_Boiler_LFO':'AE_Boiler_LFO'}, inplace=True)

X_test = df_test[['Deadweight', 'GrossTonnage',
        'Distance', 'SteamingTime',
        'ME_MDO/MGO', 'ME_HFO',
        'ME_LFO', 'AE_Boiler_MDO/MGO',
        'AE_Boiler_HFO', 'AE_Boiler_LFO']]


y_test = df_test['TOTALCO2EMISSION']

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set size:", X_train.shape, y_train.shape)
print("Test set size:", X_test.shape, y_test.shape)

Training set size: (2492, 10) (2492,)
Test set size: (623, 10) (623,)


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import AdaBoostRegressor 
from sklearn.ensemble import GradientBoostingRegressor 
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [7]:
models = [
    ('LR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('L_SVR', LinearSVR()),
    ('DT', DecisionTreeRegressor()),
    ('ADA', AdaBoostRegressor()),
    ('RF', RandomForestRegressor()),
    ('GB', GradientBoostingRegressor()),
    ('XGB', XGBRegressor())
]

results = []

# Train and evaluate each model
for name, model in models:
    model.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Compute R² scores
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Compute MSE
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    # Compute RMSE
    train_rmse = np.sqrt(train_mse)
    test_rmse = np.sqrt(test_mse)

    # Append results as a new row
    results.append([name, train_r2, test_r2, train_mse, train_rmse, test_mse, test_rmse])

# Create DataFrame for better visualization
results_df = pd.DataFrame(results, columns=['Model', 'Train R² Score', 'Test R² Score', 'Train MSE', 'Train RMSE', 'Test MSE', 'Test RMSE'])

# Print the results
print(results_df)

  model = cd_fast.enet_coordinate_descent(


   Model  Train R² Score  Test R² Score  Train MSE  Train RMSE    Test MSE  \
0     LR        0.931112       0.963266  79.549791    8.919069   46.037417   
1  Ridge        0.931111       0.963260  79.550454    8.919106   46.044766   
2  Lasso        0.921938       0.956585  90.143241    9.494379   54.410164   
3  L_SVR        0.916348       0.948369  96.599160    9.828487   64.706020   
4     DT        0.986530       0.936952  15.554716    3.943947   79.015245   
5    ADA        0.915499       0.916226  97.578944    9.878205  104.990095   
6     RF        0.979578       0.958717  23.582755    4.856208   51.737489   
7     GB        0.956486       0.961752  50.248852    7.088642   47.934781   
8    XGB        0.984306       0.954694  18.122438    4.257046   56.779671   

   Test RMSE  
0   6.785088  
1   6.785629  
2   7.376325  
3   8.044005  
4   8.889052  
5  10.246467  
6   7.192878  
7   6.923495  
8   7.535229  


In [8]:
import optuna

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
    }

    # Train Gradient Boosting model
    model = GradientBoostingRegressor(**params)
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    
    return r2  # Maximizing R² score

# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)  # Run for 50 trials

# Print best parameters
print("Best Parameters:", study.best_params)

[I 2025-04-16 14:55:03,213] A new study created in memory with name: no-name-e03b6a9e-f0a8-49b9-b614-7278cdd2ecd2
[I 2025-04-16 14:55:03,753] Trial 0 finished with value: 0.9429141013470669 and parameters: {'n_estimators': 136, 'learning_rate': 0.2643186278599591, 'max_depth': 9, 'min_samples_split': 4, 'min_samples_leaf': 7, 'subsample': 0.5407469925594708}. Best is trial 0 with value: 0.9429141013470669.
[I 2025-04-16 14:55:04,642] Trial 1 finished with value: 0.9453273224608039 and parameters: {'n_estimators': 225, 'learning_rate': 0.16755752574715588, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 5, 'subsample': 0.6396180183676133}. Best is trial 1 with value: 0.9453273224608039.
[I 2025-04-16 14:55:05,799] Trial 2 finished with value: 0.9570395161917995 and parameters: {'n_estimators': 460, 'learning_rate': 0.12869373764446407, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 1, 'subsample': 0.5331053220366198}. Best is trial 2 with value: 0.95703951619179

Best Parameters: {'n_estimators': 53, 'learning_rate': 0.1520116588564339, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 10, 'subsample': 0.5000357427958474}


In [9]:
gbr3 = GradientBoostingRegressor(
    n_estimators=53,
    learning_rate=0.1520116588564339,
    max_depth=3,
    min_samples_split=9,
    min_samples_leaf=10,
    subsample=0.5000357427958474
)

gbr3.fit(X_train, y_train)

In [10]:
y_train_predict = gbr3.predict(X_train)
y_test_predict = gbr3.predict(X_test)

training_r2 = r2_score(y_train, y_train_predict)
testing_r2 = r2_score(y_test, y_test_predict)

training_mse = mean_squared_error(y_train, y_train_predict)
testing_mse = mean_squared_error(y_test, y_test_predict)

training_rmse = np.sqrt(training_mse)
testing_rmse = np.sqrt(testing_mse)

print(f"Train R² Score: {training_r2:.4f}")
print(f"Test R² Score: {testing_r2:.4f}")
print(f"Train MSE: {training_mse:.4f}")
print(f"Test MSE: {testing_mse:.4f}")
print(f"Train RMSE: {training_rmse:.4f}")
print(f"Test RMSE: {testing_rmse:.4f}")

Train R² Score: 0.9485
Test R² Score: 0.9625
Train MSE: 59.4651
Test MSE: 47.0524
Train RMSE: 7.7114
Test RMSE: 6.8595


In [11]:
import joblib

joblib.dump(gbr3, 'Models/ship_model_6.joblib')

['Models/ship_model_6.joblib']