# Modeling

This notebook implements nested cross-validation for multiple models to predict milk yield.
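Models are run in the order listed below, which was intended to go from fastest to slowest (in the recorded run, KNN actually took far longer per fold than the neural network):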
1. Decision Trees (fastest)
2. Random Forests
3. XGBoost
4. K-Nearest Neighbors (KNN)
5. Neural Networks
6. Support Vector Machine (SVM) Regressor (slowest)

We'll use nested CV to find optimal hyperparameters and create an ensemble of the top 3 models.
All models (including CV fold models) are saved in the models folder.

In [3]:
# Imports
import os
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Ensure the folder that receives the serialized models is present
# (exist_ok=True makes re-running this cell harmless)
MODELS_DIR = 'models'
os.makedirs(MODELS_DIR, exist_ok=True)


## Load and Prepare Data


In [4]:
# Read the cleaned train/test splits from disk.
# (An earlier export of the same data used the file names
#  "cattle_data_cleaned_train.csv" / "cattle_data_cleaned_test.csv".)
TRAIN_PATH = "cleaned_train.csv"
TEST_PATH = "cleaned_test.csv"

train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Quick sanity report: row/column counts of both splits and the training schema
for split_label, split_frame in (("Train", train), ("Test", test)):
    print(f"{split_label} shape: {split_frame.shape}")
print(f"\nTrain columns: {list(train.columns)}")


Train shape: (209926, 34)
Test shape: (40000, 33)

Train columns: ['Cattle_ID', 'Breed', 'Climate_Zone', 'Management_System', 'Age_Months', 'Weight_kg', 'Parity', 'Lactation_Stage', 'Days_in_Milk', 'Feed_Type', 'Feed_Quantity_kg', 'Feeding_Frequency', 'Water_Intake_L', 'Walking_Distance_km', 'Grazing_Duration_hrs', 'Resting_Hours', 'Ambient_Temperature_C', 'Humidity_percent', 'Housing_Score', 'FMD_Vaccine', 'Brucellosis_Vaccine', 'HS_Vaccine', 'BQ_Vaccine', 'Anthrax_Vaccine', 'IBR_Vaccine', 'BVD_Vaccine', 'Rabies_Vaccine', 'Previous_Week_Avg_Yield', 'Body_Condition_Score', 'Milking_Interval_hrs', 'Date', 'Farm_ID', 'Mastitis', 'Milk_Yield_L']


In [5]:
# Split the training frame into a feature matrix and the regression target.
# 'Cattle_ID' is treated as an identifier, not a predictor, so it is removed
# from both splits whenever the column is present.
target_col = 'Milk_Yield_L'
id_cols = [name for name in ['Cattle_ID'] if name in train.columns]

y_train = train[target_col]
# errors='ignore' keeps the drop from failing if a listed column is absent
X_train = train.drop(columns=id_cols + [target_col], errors='ignore')
X_test = test.drop(columns=id_cols, errors='ignore')

for shape_label, shaped_object in (
    ("Training features", X_train),
    ("Training target", y_train),
    ("Test features", X_test),
):
    print(f"{shape_label} shape: {shaped_object.shape}")


Training features shape: (209926, 32)
Training target shape: (209926,)
Test features shape: (40000, 32)


In [6]:
# Identify categorical and numerical columns.
# 'number' matches every numeric dtype (int32, float32, nullable ints, ...), not only
# int64/float64. This matters because the preprocessors below use remainder='drop':
# a numeric column that is not listed here would be silently removed from the features.
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X_train.select_dtypes(include='number').columns.tolist()

print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"\nNumerical columns ({len(numerical_cols)}): {numerical_cols}")

# Surface any column that fell into neither group (e.g. bool or datetime dtypes),
# since such columns will not reach the models.
classified_cols = set(categorical_cols) | set(numerical_cols)
unclassified_cols = [name for name in X_train.columns if name not in classified_cols]
if unclassified_cols:
    print(f"\nWARNING: columns with unhandled dtypes will be dropped by the preprocessors: {unclassified_cols}")


Categorical columns (7): ['Breed', 'Climate_Zone', 'Management_System', 'Lactation_Stage', 'Feed_Type', 'Date', 'Farm_ID']

Numerical columns (25): ['Age_Months', 'Weight_kg', 'Parity', 'Days_in_Milk', 'Feed_Quantity_kg', 'Feeding_Frequency', 'Water_Intake_L', 'Walking_Distance_km', 'Grazing_Duration_hrs', 'Resting_Hours', 'Ambient_Temperature_C', 'Humidity_percent', 'Housing_Score', 'FMD_Vaccine', 'Brucellosis_Vaccine', 'HS_Vaccine', 'BQ_Vaccine', 'Anthrax_Vaccine', 'IBR_Vaccine', 'BVD_Vaccine', 'Rabies_Vaccine', 'Previous_Week_Avg_Yield', 'Body_Condition_Score', 'Milking_Interval_hrs', 'Mastitis']


## Nested Cross-Validation Setup

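We'll use nested cross-validation: a 5-fold outer loop to estimate each model's generalization performance, and a 3-fold inner loop (grid search) for hyperparameter tuning.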


In [7]:
# Nested CV configuration
OUTER_CV = 5  # folds of the outer loop
INNER_CV = 3  # folds of the inner loop

# Number of rows randomly sampled for the CV stage to keep the runtime manageable
# (set to None to use the full data). Author's rule of thumb: for the ~210k-row
# training set, 50k-100k rows is usually sufficient for hyperparameter tuning.
CV_SAMPLE_SIZE = 75000

# Both splitters shuffle with the same fixed seed so the folds are reproducible
outer_cv, inner_cv = (
    KFold(n_splits=fold_count, shuffle=True, random_state=42)
    for fold_count in (OUTER_CV, INNER_CV)
)

print(f"Outer CV folds: {OUTER_CV}")
print(f"Inner CV folds: {INNER_CV}")
cv_scope_message = (
    f"CV evaluation will use {CV_SAMPLE_SIZE:,} samples (subset of full data)"
    if CV_SAMPLE_SIZE
    else "CV evaluation will use full dataset"
)
print(cv_scope_message)


Outer CV folds: 5
Inner CV folds: 3
CV evaluation will use 75,000 samples (subset of full data)


## Preprocessing Pipeline


In [8]:
# Create preprocessing pipelines.
# OneHotEncoder comes from the notebook's top import cell together with the other
# dependencies, so this cell no longer carries its own import.
#
# Categorical columns are one-hot encoded for every model family (tree-based models
# included) so that all models see the same encoding.
# NOTE(review): categorical_cols includes 'Date' and 'Farm_ID'. With sparse_output=False
# each distinct value becomes a dense column - confirm their cardinality is small enough,
# or consider engineering date features instead.

# Preprocessor for models that need scaling (Neural Networks, KNN, SVM):
# median-impute + standardize numeric columns, mode-impute + one-hot categorical columns.
preprocessor_scaled = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # handle_unknown='ignore': categories unseen during fit encode as all zeros
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), categorical_cols)
    ],
    remainder='drop'  # any column not listed above is discarded
)

# Preprocessor for tree-based models (no scaling needed):
# identical to the one above except that numeric columns are only imputed.
preprocessor_tree = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numerical_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), categorical_cols)
    ],
    remainder='drop'
)

print("Preprocessing pipelines created")


Preprocessing pipelines created


## Model Definitions and Hyperparameter Grids


In [9]:
# Define models and their hyperparameter grids.
# Each entry maps a model name to:
#   'pipeline'   - preprocessing + estimator (the estimator step is named 'model',
#                  hence the 'model__' prefix on every grid key)
#   'param_grid' - grid handed to GridSearchCV (a dict, or a list of dicts)
# The order is the author's expected fastest-to-slowest order for ~210k data points.
models_config = {
    'DecisionTree': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor_tree),
            ('model', DecisionTreeRegressor(random_state=42))
        ]),
        'param_grid': {
            'model__max_depth': [10, 20, 30, 40, None],
            'model__min_samples_split': [2, 5, 10, 20],
            'model__min_samples_leaf': [1, 2, 4, 8],
            'model__max_features': ['sqrt', 'log2', None]
        }
    },
    'RandomForest': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor_tree),
            ('model', RandomForestRegressor(random_state=42, n_jobs=-1))
        ]),
        'param_grid': {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [10, 20, 30, None],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4],
            'model__max_features': ['sqrt', 'log2']
        }
    },
    'XGBoost': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor_tree),
            ('model', XGBRegressor(random_state=42, n_jobs=-1, eval_metric='rmse'))
        ]),
        'param_grid': {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [3, 5, 7, 9],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__subsample': [0.8, 0.9, 1.0],
            'model__colsample_bytree': [0.8, 0.9, 1.0]
        }
    },
    'KNN': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor_scaled),
            ('model', KNeighborsRegressor())
        ]),
        'param_grid': {
            'model__n_neighbors': [3, 5, 7, 9, 11, 15],
            'model__weights': ['uniform', 'distance'],
            'model__p': [1, 2]  # 1 for Manhattan, 2 for Euclidean
        }
    },
    'NeuralNetwork': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor_scaled),
            ('model', MLPRegressor(random_state=42, max_iter=500, early_stopping=True))
        ]),
        'param_grid': {
            'model__hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100)],
            'model__alpha': [0.0001, 0.001, 0.01],
            'model__learning_rate_init': [0.001, 0.01],
            'model__activation': ['relu', 'tanh']
        }
    },
    'SVM': {
        'pipeline': Pipeline([
            ('preprocessor', preprocessor_scaled),
            ('model', SVR())
        ]),
        # Two sub-grids: SVR's gamma only applies to the 'rbf'/'poly'/'sigmoid' kernels,
        # so crossing it with kernel='linear' would refit the same linear model once
        # per gamma value. The searched values are otherwise unchanged.
        # NOTE(review): SVR training cost grows steeply with sample count; confirm this
        # grid is feasible at the configured CV sample size.
        'param_grid': [
            {
                'model__kernel': ['linear'],
                'model__C': [0.1, 1, 10, 100],
                'model__epsilon': [0.01, 0.1, 0.5, 1.0]
            },
            {
                'model__kernel': ['rbf', 'poly'],
                'model__C': [0.1, 1, 10, 100],
                'model__gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
                'model__epsilon': [0.01, 0.1, 0.5, 1.0]
            }
        ]
    }
}

print(f"Configured {len(models_config)} models for evaluation (ordered from fastest to slowest)")


Configured 6 models for evaluation (ordered from fastest to slowest)


## Nested Cross-Validation Function


In [10]:
def nested_cv_evaluation(X, y, model_name, pipeline, param_grid, outer_cv, inner_cv, sample_size=None):
    """
    Run nested cross-validation for one model and report per-fold metrics.

    For every outer fold a GridSearchCV (scored by negative MSE, using `inner_cv`)
    is fitted on the fold's training part, its best estimator is scored on the
    held-out part, and that estimator is written to
    'models/<model_name lowercased>_fold_<k>.pkl'.

    Args:
        X: Feature DataFrame (indexed positionally via .iloc)
        y: Target Series aligned with X
        model_name: Label used in log output and in the saved file names
        pipeline: Estimator/pipeline to tune
        param_grid: Hyperparameter grid passed to GridSearchCV
        outer_cv: Splitter for the outer (evaluation) loop
        inner_cv: Splitter for the inner (tuning) loop
        sample_size: If truthy and smaller than len(X), evaluate on a random
            subset of this many rows (drawn without replacement, fixed seed 42)

    Returns:
        dict with keys:
            'outer_scores' - list of per-fold dicts {'mse', 'rmse', 'mae', 'r2'}
            'best_params'  - list of the best parameter dict found in each fold
            'best_models'  - list of the refitted best estimator of each fold
            'avg_rmse', 'avg_mae', 'avg_r2' - means over the outer folds
            'std_rmse'     - standard deviation of RMSE over the outer folds
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Evaluating {model_name}")
    print(f"{rule}")

    # Default to the full data; swap in a fixed-seed random subset when requested
    X_cv, y_cv = X, y
    if sample_size and len(X) > sample_size:
        print(f"Sampling {sample_size:,} from {len(X):,} data points for CV evaluation...")
        picked_rows = np.random.RandomState(42).choice(len(X), size=sample_size, replace=False)
        X_cv = X.iloc[picked_rows].reset_index(drop=True)
        y_cv = y.iloc[picked_rows].reset_index(drop=True)

    outer_scores, best_params_list, best_models = [], [], []

    for fold_number, (fit_rows, holdout_rows) in enumerate(outer_cv.split(X_cv, y_cv), start=1):
        print(f"\nOuter Fold {fold_number}/{outer_cv.n_splits}")

        X_fit, y_fit = X_cv.iloc[fit_rows], y_cv.iloc[fit_rows]
        X_holdout, y_holdout = X_cv.iloc[holdout_rows], y_cv.iloc[holdout_rows]

        # Inner loop: exhaustive grid search on this fold's training part only
        search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            cv=inner_cv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=0
        )

        tic = time.time()
        search.fit(X_fit, y_fit)
        fit_time = time.time() - tic

        # Score the tuned model on the rows it has not seen
        holdout_pred = search.predict(X_holdout)
        mse = mean_squared_error(y_holdout, holdout_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_holdout, holdout_pred)
        r2 = r2_score(y_holdout, holdout_pred)

        outer_scores.append({'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2})
        best_params_list.append(search.best_params_)
        best_models.append(search.best_estimator_)

        # Persist the winning estimator of this fold
        model_path = f'models/{model_name.lower()}_fold_{fold_number}.pkl'
        joblib.dump(search.best_estimator_, model_path)
        print(f"  Saved model to {model_path}")

        print(f"  Best params: {search.best_params_}")
        print(f"  Validation RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
        print(f"  Fit time: {fit_time:.2f}s")

    # Aggregate the per-fold metrics
    rmse_per_fold = [fold_score['rmse'] for fold_score in outer_scores]
    summary = {
        'outer_scores': outer_scores,
        'best_params': best_params_list,
        'best_models': best_models,
        'avg_rmse': np.mean(rmse_per_fold),
        'avg_mae': np.mean([fold_score['mae'] for fold_score in outer_scores]),
        'avg_r2': np.mean([fold_score['r2'] for fold_score in outer_scores]),
        'std_rmse': np.std(rmse_per_fold)
    }

    print(f"\n{model_name} Results:")
    print(f"  Average RMSE: {summary['avg_rmse']:.4f} (±{summary['std_rmse']:.4f})")
    print(f"  Average MAE: {summary['avg_mae']:.4f}")
    print(f"  Average R²: {summary['avg_r2']:.4f}")

    return summary


In [11]:
# Run nested CV for all models, checkpointing each model's summary to disk.
#
# The full search takes many hours, so a finished model's metrics and best parameters
# are written to 'models/<name>_cv_summary.pkl'. On a re-run (e.g. after an interrupted
# or restarted kernel) a model is skipped when a summary exists whose recorded
# configuration matches the current one; its fold models are re-read from the
# 'models/<name>_fold_<k>.pkl' files that nested_cv_evaluation wrote.
# Only load summary/model pickles that this notebook produced itself.
FORCE_RERUN_CV = False  # set to True to ignore existing checkpoints and recompute

results = {}

for model_name, config in models_config.items():
    slug = model_name.lower()
    cache_path = f'models/{slug}_cv_summary.pkl'
    fold_paths = [f'models/{slug}_fold_{fold}.pkl' for fold in range(1, outer_cv.n_splits + 1)]

    # Everything that determines the CV outcome for this model; a checkpoint is
    # reused only if it was produced under exactly these settings.
    cache_key = {
        'param_grid': config['param_grid'],
        'sample_size': CV_SAMPLE_SIZE,
        'outer_folds': outer_cv.n_splits,
        'inner_folds': inner_cv.n_splits,
        'n_rows': len(X_train),
    }

    if os.path.exists(cache_path):
        cached = None if FORCE_RERUN_CV else joblib.load(cache_path)
        reusable = (
            isinstance(cached, dict)
            and cached.get('cache_key') == cache_key
            and all(os.path.exists(path) for path in fold_paths)
        )
        if reusable:
            result = dict(cached['summary'])
            result['best_models'] = [joblib.load(path) for path in fold_paths]
            results[model_name] = result
            print(f"\nLoaded cached nested-CV results for {model_name} from {cache_path} "
                  f"(Average RMSE: {result['avg_rmse']:.4f})")
            continue
        # Stale or unwanted checkpoint: remove it before the fold files get overwritten,
        # so an interrupted recomputation cannot leave a summary that mismatches them.
        os.remove(cache_path)

    result = nested_cv_evaluation(
        X_train, y_train,
        model_name,
        config['pipeline'],
        config['param_grid'],
        outer_cv,
        inner_cv,
        sample_size=CV_SAMPLE_SIZE  # Use subset for faster CV evaluation
    )
    results[model_name] = result

    # The fitted fold models are already on disk, so the checkpoint stores only the
    # lightweight part of the result. Write to a temp file and rename so that an
    # interruption during the write cannot leave a truncated checkpoint behind.
    payload = {
        'cache_key': cache_key,
        'summary': {key: value for key, value in result.items() if key != 'best_models'},
    }
    tmp_path = cache_path + '.tmp'
    joblib.dump(payload, tmp_path)
    os.replace(tmp_path, cache_path)
    print(f"Checkpointed {model_name} nested-CV summary to {cache_path}")



Evaluating DecisionTree
Sampling 75,000 from 209,926 data points for CV evaluation...

Outer Fold 1/5




  Saved model to models/decisiontree_fold_1.pkl
  Best params: {'model__max_depth': 10, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 20}
  Validation RMSE: 4.6551, MAE: 3.6222, R²: 0.2401
  Fit time: 358.53s

Outer Fold 2/5




  Saved model to models/decisiontree_fold_2.pkl
  Best params: {'model__max_depth': 10, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 20}
  Validation RMSE: 4.6247, MAE: 3.6079, R²: 0.2611
  Fit time: 347.84s

Outer Fold 3/5




  Saved model to models/decisiontree_fold_3.pkl
  Best params: {'model__max_depth': 10, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 20}
  Validation RMSE: 4.5484, MAE: 3.5649, R²: 0.2657
  Fit time: 399.33s

Outer Fold 4/5




  Saved model to models/decisiontree_fold_4.pkl
  Best params: {'model__max_depth': 10, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 20}
  Validation RMSE: 4.6101, MAE: 3.6011, R²: 0.2630
  Fit time: 360.99s

Outer Fold 5/5
  Saved model to models/decisiontree_fold_5.pkl
  Best params: {'model__max_depth': 10, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 20}
  Validation RMSE: 4.5953, MAE: 3.5743, R²: 0.2717
  Fit time: 339.44s

DecisionTree Results:
  Average RMSE: 4.6067 (±0.0352)
  Average MAE: 3.5941
  Average R²: 0.2603

Evaluating RandomForest
Sampling 75,000 from 209,926 data points for CV evaluation...

Outer Fold 1/5




  Saved model to models/randomforest_fold_1.pkl
  Best params: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200}
  Validation RMSE: 4.4930, MAE: 3.5190, R²: 0.2921
  Fit time: 1549.27s

Outer Fold 2/5




  Saved model to models/randomforest_fold_2.pkl
  Best params: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200}
  Validation RMSE: 4.5136, MAE: 3.5285, R²: 0.2961
  Fit time: 1559.22s

Outer Fold 3/5




  Saved model to models/randomforest_fold_3.pkl
  Best params: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 200}
  Validation RMSE: 4.4215, MAE: 3.4787, R²: 0.3061
  Fit time: 1568.43s

Outer Fold 4/5




  Saved model to models/randomforest_fold_4.pkl
  Best params: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 200}
  Validation RMSE: 4.5012, MAE: 3.5138, R²: 0.2975
  Fit time: 1572.93s

Outer Fold 5/5




  Saved model to models/randomforest_fold_5.pkl
  Best params: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 200}
  Validation RMSE: 4.4923, MAE: 3.4975, R²: 0.3040
  Fit time: 1541.96s

RandomForest Results:
  Average RMSE: 4.4843 (±0.0323)
  Average MAE: 3.5075
  Average R²: 0.2992

Evaluating XGBoost
Sampling 75,000 from 209,926 data points for CV evaluation...

Outer Fold 1/5




  Saved model to models/xgboost_fold_1.pkl
  Best params: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__subsample': 0.8}
  Validation RMSE: 4.2641, MAE: 3.3210, R²: 0.3624
  Fit time: 1596.73s

Outer Fold 2/5




  Saved model to models/xgboost_fold_2.pkl
  Best params: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__subsample': 0.8}
  Validation RMSE: 4.2326, MAE: 3.2963, R²: 0.3810
  Fit time: 1589.62s

Outer Fold 3/5




  Saved model to models/xgboost_fold_3.pkl
  Best params: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__subsample': 0.8}
  Validation RMSE: 4.1636, MAE: 3.2612, R²: 0.3847
  Fit time: 1587.44s

Outer Fold 4/5




  Saved model to models/xgboost_fold_4.pkl
  Best params: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__subsample': 0.8}
  Validation RMSE: 4.2382, MAE: 3.2942, R²: 0.3771
  Fit time: 1575.39s

Outer Fold 5/5




  Saved model to models/xgboost_fold_5.pkl
  Best params: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__subsample': 0.8}
  Validation RMSE: 4.2220, MAE: 3.2801, R²: 0.3852
  Fit time: 1577.70s

XGBoost Results:
  Average RMSE: 4.2241 (±0.0333)
  Average MAE: 3.2906
  Average R²: 0.3781

Evaluating KNN
Sampling 75,000 from 209,926 data points for CV evaluation...

Outer Fold 1/5
  Saved model to models/knn_fold_1.pkl
  Best params: {'model__n_neighbors': 15, 'model__p': 2, 'model__weights': 'distance'}
  Validation RMSE: 4.6749, MAE: 3.6606, R²: 0.2336
  Fit time: 5411.23s

Outer Fold 2/5
  Saved model to models/knn_fold_2.pkl
  Best params: {'model__n_neighbors': 15, 'model__p': 2, 'model__weights': 'distance'}
  Validation RMSE: 4.6721, MAE: 3.6577, R²: 0.2458
  Fit time: 5958.01s

Outer Fold 3/5
  Saved model to models/knn_fold_3.pkl
  Best params: {'model__n_neighbors': 15, 'model__p': 2, 'model__weights': 'd



  Saved model to models/knn_fold_4.pkl
  Best params: {'model__n_neighbors': 15, 'model__p': 2, 'model__weights': 'distance'}
  Validation RMSE: 4.6546, MAE: 3.6468, R²: 0.2487
  Fit time: 5863.74s

Outer Fold 5/5
  Saved model to models/knn_fold_5.pkl
  Best params: {'model__n_neighbors': 15, 'model__p': 2, 'model__weights': 'distance'}
  Validation RMSE: 4.6825, MAE: 3.6537, R²: 0.2438
  Fit time: 5748.52s

KNN Results:
  Average RMSE: 4.6611 (±0.0219)
  Average MAE: 3.6540
  Average R²: 0.2428

Evaluating NeuralNetwork
Sampling 75,000 from 209,926 data points for CV evaluation...

Outer Fold 1/5




  Saved model to models/neuralnetwork_fold_1.pkl
  Best params: {'model__activation': 'relu', 'model__alpha': 0.01, 'model__hidden_layer_sizes': (50,), 'model__learning_rate_init': 0.001}
  Validation RMSE: 4.2712, MAE: 3.3259, R²: 0.3603
  Fit time: 395.19s

Outer Fold 2/5
  Saved model to models/neuralnetwork_fold_2.pkl
  Best params: {'model__activation': 'relu', 'model__alpha': 0.001, 'model__hidden_layer_sizes': (100,), 'model__learning_rate_init': 0.001}
  Validation RMSE: 4.2460, MAE: 3.3087, R²: 0.3771
  Fit time: 394.58s

Outer Fold 3/5
  Saved model to models/neuralnetwork_fold_3.pkl
  Best params: {'model__activation': 'relu', 'model__alpha': 0.01, 'model__hidden_layer_sizes': (100,), 'model__learning_rate_init': 0.001}
  Validation RMSE: 4.1702, MAE: 3.2649, R²: 0.3827
  Fit time: 380.86s

Outer Fold 4/5
  Saved model to models/neuralnetwork_fold_4.pkl
  Best params: {'model__activation': 'relu', 'model__alpha': 0.01, 'model__hidden_layer_sizes': (100,), 'model__learning_ra

KeyboardInterrupt: 

## Compare Model Performance


In [None]:
# Collect the nested-CV summary of every evaluated model into one table,
# ordered by average RMSE (lowest, i.e. best, first)
summary_columns = ['Model', 'Avg_RMSE', 'Std_RMSE', 'Avg_MAE', 'Avg_R2']
summary_rows = [
    (name, outcome['avg_rmse'], outcome['std_rmse'], outcome['avg_mae'], outcome['avg_r2'])
    for name, outcome in results.items()
]
comparison_df = pd.DataFrame(summary_rows, columns=summary_columns).sort_values('Avg_RMSE')

banner = "=" * 60
print("\n" + banner)
print("MODEL COMPARISON (sorted by RMSE)")
print(banner)
print(comparison_df.to_string(index=False))


## Select Top 3 Models and Create Ensemble


In [None]:
# Keep the three best-ranked models; comparison_df is already sorted by ascending RMSE
top_3_models = comparison_df['Model'].head(3).tolist()
print(f"Top 3 models selected: {top_3_models}")

# Get the most common best parameters for each top model
def get_most_common_params(best_params_list):
    """Return the hyperparameter combination chosen most often across folds.

    The vote is taken over complete parameter dicts rather than per parameter.
    Voting on each parameter independently can assemble a combination that no
    fold ever selected (e.g. one fold's learning rate with another fold's tree
    count), which was therefore never validated together.

    Args:
        best_params_list: list of parameter dicts, one per outer fold
            (e.g. the `best_params_` of each fold's grid search).

    Returns:
        A new dict equal to the most frequent parameter dict. Ties are broken in
        favour of the combination seen first. Returns {} for an empty list.
    """
    if not best_params_list:
        return {}

    combo_counts = {}   # signature -> number of folds that chose it
    combo_params = {}   # signature -> the first parameter dict with that signature
    for params in best_params_list:
        # repr() gives a hashable, order-independent fingerprint even for
        # parameter values that are themselves unhashable (lists, dicts)
        signature = tuple(sorted((name, repr(value)) for name, value in params.items()))
        combo_counts[signature] = combo_counts.get(signature, 0) + 1
        combo_params.setdefault(signature, params)

    # max() returns the first maximal key, and dicts preserve insertion order,
    # so the earliest-seen combination wins a tie
    winner = max(combo_counts, key=combo_counts.get)
    return dict(combo_params[winner])

# Build final models with best hyperparameters
final_models = {}
for model_name in top_3_models:
    config = models_config[model_name]
    best_params = get_most_common_params(results[model_name]['best_params'])

    # clone() yields an unfitted deep copy of the configured pipeline. Reusing the
    # step objects from models_config directly would let set_params() overwrite the
    # search configuration and make final models that use the same preprocessor
    # share (and refit) a single transformer instance.
    final_pipeline = clone(config['pipeline'])

    # Set best parameters
    final_pipeline.set_params(**best_params)

    print(f"\n{model_name} final parameters: {best_params}")
    final_models[model_name] = final_pipeline

print(f"\nFinal models created: {list(final_models.keys())}")


In [None]:
# Fit each selected pipeline on the complete training set (no sub-sampling here),
# reporting the wall-clock time of every fit
print("\nTraining final models on full training set...")
for final_name, final_pipeline in final_models.items():
    print(f"Training {final_name}...")
    tic = time.time()
    final_pipeline.fit(X_train, y_train)
    elapsed = time.time() - tic
    print(f"  Completed in {elapsed:.2f}s")

print("\nAll models trained successfully!")


## Create Ensemble Model

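We'll average the predictions of the selected models with a small custom wrapper class (the same idea as scikit-learn's `VotingRegressor` with equal weights) for the ensemble.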


In [None]:
# Ensemble by plain averaging.
# The selected models are complete pipelines kept in a dict, so instead of wiring them
# into a VotingRegressor a small wrapper averages their predictions directly.

class EnsembleModel:
    """Average the predictions of several models.

    `models` is a mapping of name -> estimator; every estimator must provide
    fit(X, y) and predict(X). The mapping is stored as given (not copied).
    """
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Fit every member on (X, y) in mapping order and return self."""
        for member in self.models.values():
            member.fit(X, y)
        return self

    def predict(self, X):
        """Return the element-wise (unweighted) mean of the members' predictions."""
        member_predictions = [member.predict(X) for member in self.models.values()]
        return np.mean(member_predictions, axis=0)

# Create ensemble
# The pipelines in final_models were already fitted on the full training set in the
# "Train final models" cell above. Calling ensemble.fit(X_train, y_train) here would
# refit every one of them on the same data, doubling the training time without
# changing the models' configuration, so the fitted pipelines are wrapped as they are.
ensemble = EnsembleModel(final_models)

print("Ensemble model created and trained!")


In [None]:
# Score the ensemble on the data it was trained on.
# These are in-sample numbers, shown for reference only; the nested-CV metrics above
# are the estimate of out-of-sample performance.
y_train_pred_ensemble = ensemble.predict(X_train)

ensemble_train_mse = mean_squared_error(y_train, y_train_pred_ensemble)
ensemble_train_rmse = np.sqrt(ensemble_train_mse)
ensemble_train_mae = mean_absolute_error(y_train, y_train_pred_ensemble)
ensemble_train_r2 = r2_score(y_train, y_train_pred_ensemble)

print(f"\nEnsemble Performance on Training Data:")
for metric_line in (
    f"  RMSE: {ensemble_train_rmse:.4f}",
    f"  MAE: {ensemble_train_mae:.4f}",
    f"  R²: {ensemble_train_r2:.4f}",
):
    print(metric_line)


## Inference: Generate Predictions for Test Data


In [None]:
# Predict the target for the test features with the averaging ensemble
print("Generating predictions on test data...")
test_predictions = ensemble.predict(X_test)

# Summary statistics as a quick plausibility check of the predicted values
prediction_low, prediction_high = test_predictions.min(), test_predictions.max()
print(f"Predictions shape: {test_predictions.shape}")
print(f"Predictions range: [{prediction_low:.2f}, {prediction_high:.2f}]")
print(f"Predictions mean: {test_predictions.mean():.2f}")
print(f"Predictions std: {test_predictions.std():.2f}")


In [None]:
# Build the submission table: one row per test record with its predicted yield.
# IDs come from the test file's 'Cattle_ID' column when it exists; otherwise
# sequential IDs starting at 1 are generated.
if 'Cattle_ID' in test.columns:
    submission_ids = test['Cattle_ID']
else:
    submission_ids = range(1, len(test_predictions) + 1)

submission = pd.DataFrame({
    'Cattle_ID': submission_ids,
    'Milk_Yield_L': test_predictions
})

# Write the table to CSV (without the index column) and show a preview
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission file saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
print("\nFirst few predictions:")
print(submission.head(10))


## Save Models (Optional)

Save the trained models for future use.


In [None]:
# Persist every final pipeline plus the ensemble wrapper under models/.
# The per-fold CV models are not written here; nested_cv_evaluation saved them
# while the cross-validation ran.
for final_name, final_pipeline in final_models.items():
    final_path = f'models/{final_name.lower()}_final.pkl'
    joblib.dump(final_pipeline, final_path)
    print(f"Saved {final_name} final model to {final_path}")

ensemble_path = 'models/ensemble_model.pkl'
joblib.dump(ensemble, ensemble_path)
print(f"Saved ensemble to {ensemble_path}")

print("\nAll final models saved successfully!")
print("Note: Cross-validation fold models were saved during CV evaluation.")
