## Results

No of records : 19480   
Ship Type : Tanker    
Training Accuracy : 0.8775  
Testing Accuracy : 0.8336   

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [12]:
df_train = pd.read_excel('Data_2024/ShipType3.xlsx')
df_test = pd.read_excel('Data_2025/ShipType3.xlsx')

pd.set_option('display.max_columns', None)
df_train.columns = df_train.columns.str.replace(' ', '_')
df_train = df_train.fillna(0)
df_train = df_train.loc[:, ~df_train.columns.duplicated()]

df_test.columns = df_test.columns.str.replace(' ', '_')
df_test = df_test.fillna(0)
df_test = df_test.loc[:, ~df_test.columns.duplicated()]

In [13]:
df_train = df_train.select_dtypes(include=['float64', 'int64'])

corr = df_train.corr()['TOTALCO2EMISSION'].sort_values(ascending=False)
corr.head(30)

TOTALCO2EMISSION        1.000000
Distance                0.777659
ME_RunningHour          0.756392
AE_HFO                  0.542189
FCPH                    0.495353
AVGSPEED                0.475713
OPERATORID              0.396575
FCPD_YTD                0.381194
ME_HFO                  0.373053
ME_MDO                  0.370010
AE_Boiler_HFO           0.348013
AVGSPEED_YTD            0.314454
ROB_HFO                 0.313860
FCPH_YTD                0.312745
wave_height             0.284632
IGG_MDO                 0.276382
wind_bft                0.252655
ISIMODCS_SEEM2REVIEW    0.248570
isCCIIEnabled           0.244260
Boiler_HFO              0.218282
Boiler_MDO              0.200568
Boiler_MDO/MGO          0.200568
RID.1                   0.189008
RID                     0.189008
TOTALCO2EMISSION_YTD    0.187962
swell_height            0.186305
FRM_MDO                 0.182645
GrossTonnage            0.180429
CII_REFERENCE           0.178836
REQUIRED_CII            0.178736
Name: TOTA

In [14]:
df_train.rename(columns={'Distance': 'Distance',
                   'ME_RunningHour': 'SteamingTime',
                   'DeadWeight': 'Deadweight',
                   'GrossTonnage': 'GrossTonnage',
                   'ME_MDO/MGO': 'ME_MDO/MGO',
                   'ME_HFO': 'ME_HFO',
                   'ME_LFO': 'ME_LFO',
                   'AE_Boiler_MDO/MGO':'AE_Boiler_MDO/MGO',
                   'AE_Boiler_HFO':'AE_Boiler_HFO',
                   'AE_Boiler_LFO':'AE_Boiler_LFO'}, inplace=True)

X_train = df_train[['Deadweight', 'GrossTonnage',
        'Distance', 'SteamingTime',
        'ME_MDO/MGO', 'ME_HFO',
        'ME_LFO', 'AE_Boiler_MDO/MGO',
        'AE_Boiler_HFO', 'AE_Boiler_LFO']]


y_train = df_train['TOTALCO2EMISSION']

In [15]:
df_test.rename(columns={'Distance': 'Distance',
                   'ME_RunningHour': 'SteamingTime',
                   'DeadWeight': 'Deadweight',
                   'GrossTonnage': 'GrossTonnage',
                   'ME_MDO/MGO': 'ME_MDO/MGO',
                   'ME_HFO': 'ME_HFO',
                   'ME_LFO': 'ME_LFO',
                   'AE_Boiler_MDO/MGO':'AE_Boiler_MDO/MGO',
                   'AE_Boiler_HFO':'AE_Boiler_HFO',
                   'AE_Boiler_LFO':'AE_Boiler_LFO'}, inplace=True)

X_test = df_test[['Deadweight', 'GrossTonnage',
        'Distance', 'SteamingTime',
        'ME_MDO/MGO', 'ME_HFO',
        'ME_LFO', 'AE_Boiler_MDO/MGO',
        'AE_Boiler_HFO', 'AE_Boiler_LFO']]


y_test = df_test['TOTALCO2EMISSION']

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print("Training set size:", X_train.shape, y_train.shape)
print("Test set size:", X_test.shape, y_test.shape)

Training set size: (13051, 10) (13051,)
Test set size: (6429, 10) (6429,)


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import AdaBoostRegressor 
from sklearn.ensemble import GradientBoostingRegressor 
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [16]:
models = [
    ('LR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('L_SVR', LinearSVR()),
    ('DT', DecisionTreeRegressor()),
    ('ADA', AdaBoostRegressor()),
    ('RF', RandomForestRegressor()),
    ('GB', GradientBoostingRegressor()),
    ('XGB', XGBRegressor())
]

results = []

# Train and evaluate each model
for name, model in models:
    model.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Compute R² scores
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Compute MSE
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)

    # Compute RMSE
    train_rmse = np.sqrt(train_mse)
    test_rmse = np.sqrt(test_mse)

    # Append results as a new row
    results.append([name, train_r2, test_r2, train_mse, train_rmse, test_mse, test_rmse])

# Create DataFrame for better visualization
results_df = pd.DataFrame(results, columns=['Model', 'Train R² Score', 'Test R² Score', 'Train MSE', 'Train RMSE', 'Test MSE', 'Test RMSE'])

# Print the results
print(results_df)

  model = cd_fast.enet_coordinate_descent(


   Model  Train R² Score  Test R² Score    Train MSE  Train RMSE     Test MSE  \
0     LR        0.731138       0.758958    97.829260    9.890868    85.268622   
1  Ridge        0.731138       0.758982    97.829297    9.890869    85.260132   
2  Lasso        0.702499       0.742071   108.249824   10.404318    91.242378   
3  L_SVR      -20.479574     -23.104338  7815.647872   88.406153  8526.907619   
4     DT        0.940664       0.753567    21.590215    4.646527    87.175627   
5    ADA        0.745464       0.712747    92.616400    9.623741   101.615601   
6     RF        0.929999       0.817047    25.470697    5.046850    64.719565   
7     GB        0.849615       0.828201    54.719678    7.397275    60.773950   
8    XGB        0.901313       0.817041    35.908679    5.992385    64.721590   

   Test RMSE  
0   9.234101  
1   9.233641  
2   9.552088  
3  92.341256  
4   9.336789  
5  10.080456  
6   8.044847  
7   7.795765  
8   8.044973  


In [17]:
import optuna

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
        "max_depth": trial.suggest_int("max_depth", 5, 50, step=5),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "random_state": 42
    }
    
    # Train Random Forest Model
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    
    # Predict & Evaluate
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)  # Optuna maximizes R²
    
    return score  # Higher R² is better

# Run Optuna Optimization
study = optuna.create_study(direction="maximize")  # Maximize R² score
study.optimize(objective, n_trials=50)  # Run for 50 trials

# Best Parameters
print("Best Parameters:", study.best_params)

[I 2025-04-16 15:09:10,792] A new study created in memory with name: no-name-177dd1dd-e1bf-4078-817c-03e0ebff61af
[I 2025-04-16 15:09:15,567] Trial 0 finished with value: 0.8042319323640175 and parameters: {'n_estimators': 400, 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 7, 'max_features': 'log2'}. Best is trial 0 with value: 0.8042319323640175.
[I 2025-04-16 15:09:21,650] Trial 1 finished with value: 0.8332650815860982 and parameters: {'n_estimators': 450, 'max_depth': 35, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 1 with value: 0.8332650815860982.
[I 2025-04-16 15:09:22,821] Trial 2 finished with value: 0.8284304955202884 and parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 7, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.8332650815860982.
[I 2025-04-16 15:09:27,785] Trial 3 finished with value: 0.8314219266714585 and parameters: {'n_estimators': 500, 'max_depth': 40, 'min

Best Parameters: {'n_estimators': 450, 'max_depth': 45, 'min_samples_split': 11, 'min_samples_leaf': 4, 'max_features': 'log2'}


In [18]:
rf = RandomForestRegressor(
    n_estimators=450,
    max_depth=45,
    min_samples_split=11,
    min_samples_leaf=4,
    max_features='log2',
    random_state=42
)

# Train the model
rf.fit(X_train, y_train)

In [19]:
y_train_predict = rf.predict(X_train)
y_test_predict = rf.predict(X_test)

training_r2 = r2_score(y_train, y_train_predict)
testing_r2 = r2_score(y_test, y_test_predict)

training_mse = mean_squared_error(y_train, y_train_predict)
testing_mse = mean_squared_error(y_test, y_test_predict)

training_rmse = np.sqrt(training_mse)
testing_rmse = np.sqrt(testing_mse)

print(f"Train R² Score: {training_r2:.4f}")
print(f"Test R² Score: {testing_r2:.4f}")
print(f"Train MSE: {training_mse:.4f}")
print(f"Test MSE: {testing_mse:.4f}")
print(f"Train RMSE: {training_rmse:.4f}")
print(f"Test RMSE: {testing_rmse:.4f}")

Train R² Score: 0.8775
Test R² Score: 0.8336
Train MSE: 44.5563
Test MSE: 58.8623
Train RMSE: 6.6750
Test RMSE: 7.6722


In [20]:
import joblib

joblib.dump(rf, 'Models/ship_model_3.joblib')

['Models/ship_model_3.joblib']