In [9]:
import pandas as pd
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
import joblib
import numpy as np

# Read the flight measurements from the Excel workbook into a DataFrame.
data = pd.read_excel('Flight Data - V1.xlsx')

# Column to predict (the weight of the clay mass). Fail early, listing the
# columns that were actually read, if the workbook does not contain it.
target_column = "Weight of Clay Mass"
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found in the dataset. Available columns: {data.columns.tolist()}")

# Split into the target series (y) and the feature frame (X): every column
# other than the target becomes a feature.
y = data[target_column]
X = data.drop([target_column], axis="columns")

# Candidate regressors, keyed by the display name used in the report.
# Insertion order is the order in which they are evaluated and printed.
# The tree-based models get a fixed random_state so reruns are repeatable.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR(),
}

# Leave-One-Out Cross Validation (LOOCV): each sample is held out once
# while the model is fit on all the remaining samples.
loo = LeaveOneOut()
cv_scores = {}

# Score every candidate with negative mean squared error (scikit-learn's
# "higher is better" form of MSE) and record the mean over all held-out
# samples under the model's display name.
for name in models:
    fold_scores = cross_val_score(
        models[name],
        X,
        y,
        scoring='neg_mean_squared_error',
        cv=loo,
    )
    mean_score = fold_scores.mean()
    cv_scores[name] = mean_score
    print(f"{name} LOOCV Negative MSE: {mean_score:.4f}")

# Determine the best model (highest negative MSE, i.e. closest to 0 is best).
# A model whose fit failed during cross-validation ends up with a NaN mean
# (cross_val_score's default error_score is NaN). NaN is neither greater nor
# smaller than any number, so max() over the raw scores could keep a failed
# model purely because of its position in the dict. Rank finite scores only.
finite_scores = {
    name: score for name, score in cv_scores.items() if np.isfinite(score)
}
if not finite_scores:
    raise ValueError(
        "No model produced a finite LOOCV score; cannot select a best model. "
        f"Scores: {cv_scores}"
    )
best_model_name = max(finite_scores, key=finite_scores.get)
best_score = finite_scores[best_model_name]
print(f"\nBest Model: {best_model_name} with LOOCV Negative MSE: {best_score:.4f}")

# Cross-validation fits clones of each estimator, so the selected one is
# still unfitted here; train it on the entire dataset before exporting.
best_model = models[best_model_name]
best_model.fit(X, y)

# Export the fitted model to a file using joblib.
# NOTE(review): the file is a pickle; it should only be loaded from a trusted
# source and presumably with a compatible scikit-learn version — confirm.
joblib.dump(best_model, 'best_regression_model.pkl')
print("\nBest model exported as 'best_regression_model.pkl'")


Linear Regression LOOCV Negative MSE: -104.8742
Ridge Regression LOOCV Negative MSE: -101.6232
Decision Tree LOOCV Negative MSE: -1213.6667
Random Forest LOOCV Negative MSE: -1651.0002
Gradient Boosting LOOCV Negative MSE: -1344.0436
SVR LOOCV Negative MSE: -2119.7269

Best Model: Ridge Regression with LOOCV Negative MSE: -101.6232

Best model exported as 'best_regression_model.pkl'
