In [7]:
import os
import json
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats

fn = "ship_fuel_efficiency.csv"
df = pd.read_csv(fn)
print("Loaded:", fn, "shape:", df.shape)

keep_cols = ['distance', 'fuel_consumption', 'CO2_emissions', 'ship_type', 'engine_efficiency']
df = df[keep_cols].copy()
print("After keep_cols, shape:", df.shape)

print("\nMissing values per column:")
print(df.isnull().sum())

numeric_cols = ['distance', 'fuel_consumption', 'CO2_emissions']
zs = np.abs(stats.zscore(df[numeric_cols], nan_policy='omit'))
outlier_counts = (zs > 3).sum(axis=0)
print("\nOutlier counts (|z|>3):")
print(dict(zip(numeric_cols, outlier_counts)))

X_for_vif = pd.get_dummies(df.drop(columns=['engine_efficiency']), drop_first=True)
for c in X_for_vif.columns:
    X_for_vif[c] = pd.to_numeric(X_for_vif[c], errors='coerce')
X_for_vif = X_for_vif.dropna(axis=1, how='all')
X_for_vif = X_for_vif.loc[:, X_for_vif.std() > 0]
non_numeric = [c for c in X_for_vif.columns if not np.issubdtype(X_for_vif[c].dtype, np.number)]
if non_numeric:
    print("\n⚠️ Non-numeric columns detected and dropped:", non_numeric)
    X_for_vif = X_for_vif.drop(columns=non_numeric)
print("\n✅ Cleaned for VIF. Final shape:", X_for_vif.shape)
print("Columns used for VIF:", list(X_for_vif.columns))

def calculate_vif(df_numeric):
    vif_data = []
    arr = df_numeric.values
    n_cols = arr.shape[1]
    for i in range(n_cols):
        try:
            vif = variance_inflation_factor(arr, i)
        except Exception:
            vif = np.nan
        vif_data.append(vif)
    return pd.DataFrame({"Feature": df_numeric.columns, "VIF": vif_data})

vif_result = calculate_vif(X_for_vif)
vif_result_sorted = vif_result.sort_values(by="VIF", ascending=False).reset_index(drop=True)
print("\nTop VIFs:")
print(vif_result_sorted.head(20))

y = df['engine_efficiency']
X = df.drop(columns=['engine_efficiency'])
categorical_cols = ['ship_type']
cat_values = {c: sorted(list(X[c].dropna().unique())) for c in categorical_cols}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print("\nTrain/Test shapes:", X_train.shape, X_test.shape)

try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', ohe, categorical_cols)
    ],
    remainder='drop'
)

pipe_linear = Pipeline([('preproc', preprocessor), ('model', LinearRegression())])
pipe_lasso = Pipeline([('preproc', preprocessor), ('model', LassoCV(cv=5, random_state=42, max_iter=20000))])
pipe_ridge = Pipeline([('preproc', preprocessor), ('model', RidgeCV(alphas=np.logspace(-3,3,25), cv=5))])

print("\nTraining Linear...")
pipe_linear.fit(X_train, y_train)
print("Training LassoCV (tuning alphas)...")
pipe_lasso.fit(X_train, y_train)
print("Training RidgeCV (tuning alphas)...")
pipe_ridge.fit(X_train, y_train)

def eval_pipe(pipe, X_test, y_test):
    y_pred = pipe.predict(X_test)
    rmse_val = np.sqrt(mean_squared_error(y_test, y_pred))
    return dict(R2=float(r2_score(y_test, y_pred)),
                MAE=float(mean_absolute_error(y_test, y_pred)),
                RMSE=float(rmse_val))

results = {
    'Linear': eval_pipe(pipe_linear, X_test, y_test),
    'Lasso': eval_pipe(pipe_lasso, X_test, y_test),
    'Ridge': eval_pipe(pipe_ridge, X_test, y_test)
}

print("\nModel evaluation results:")
for k, v in results.items():
    print(k, v)

best_name = max(results.items(), key=lambda it: it[1]['R2'])[0]
best_pipeline = {'Linear': pipe_linear, 'Lasso': pipe_lasso, 'Ridge': pipe_ridge}[best_name]
print("\n✅ Selected best model:", best_name, results[best_name])

os.makedirs("artifacts", exist_ok=True)
joblib.dump(best_pipeline, "artifacts/best_model.pkl")

meta = {
    "numeric_cols": numeric_cols,
    "categorical_cols": categorical_cols,
    "cat_values": cat_values,
    "target": "engine_efficiency",
    "model_name": best_name
}
with open("artifacts/model_columns.json", "w") as f:
    json.dump(meta, f, indent=2)

df_test = X_test.copy()
df_test['true_engine_efficiency'] = y_test.values
df_test['pred_' + best_name] = best_pipeline.predict(X_test)
df_test.to_csv("artifacts/test_predictions_sample.csv", index=False)

print("\nArtifacts saved ✅")
print("→ artifacts/best_model.pkl")
print("→ artifacts/model_columns.json")
print("→ artifacts/test_predictions_sample.csv")

pipe = best_pipeline
preproc = pipe.named_steps['preproc']
model_inner = pipe.named_steps['model']

feature_names = []
if hasattr(preproc, "get_feature_names_out"):
    feature_names = list(preproc.get_feature_names_out())
else:
    feature_names = numeric_cols.copy()
    for name, transformer, cols in preproc.transformers_:
        if name == 'cat':
            cats = transformer.categories_
            for colname, catvals in zip(cols, cats):
                for v in catvals:
                    feature_names.append(f"{colname}_{v}")

coefs = np.ravel(model_inner.coef_)
intercept = model_inner.intercept_
print("\nIntercept:", intercept)
coef_df = pd.DataFrame({"feature": feature_names, "coef": np.round(coefs, 6)})
print("\nCoefficients:")
print(coef_df.to_string(index=False))

print("\nSample X_test rows:")
print(X_test.head().to_string(index=False))
print("\nSample predictions:")
print(best_pipeline.predict(X_test.head()).tolist())

print("\nNotebook 2 complete ✅")
print("Best model selected:", best_name)
print("Artifacts ready for Streamlit.")


Loaded: ship_fuel_efficiency.csv shape: (1440, 10)
After keep_cols, shape: (1440, 5)

Missing values per column:
distance             0
fuel_consumption     0
CO2_emissions        0
ship_type            0
engine_efficiency    0
dtype: int64

Outlier counts (|z|>3):
{'distance': np.int64(17), 'fuel_consumption': np.int64(29), 'CO2_emissions': np.int64(28)}

⚠️ Non-numeric columns detected and dropped: ['ship_type_Oil Service Boat', 'ship_type_Surfer Boat', 'ship_type_Tanker Ship']

✅ Cleaned for VIF. Final shape: (1440, 3)
Columns used for VIF: ['distance', 'fuel_consumption', 'CO2_emissions']

Top VIFs:
            Feature         VIF
0  fuel_consumption  372.565986
1     CO2_emissions  358.286668
2          distance   13.752275

Train/Test shapes: (1152, 4) (288, 4)

Training Linear...
Training LassoCV (tuning alphas)...
Training RidgeCV (tuning alphas)...

Model evaluation results:
Linear {'R2': -0.0017518607872868408, 'MAE': 6.037245881025557, 'RMSE': 7.044450455700925}
Lasso {'R2':