# Catboost

We train a CatBoost model to predict milk yield.

*   Reverse one-hot encoding to increase categorical variables since CatBoost handles categorical variables efficiently and internally.
*   Randomized hyperparameter search over 1600 combinations
*   3-fold cross validation with early stopping

Catboost is great for structued tabular data, and especially helpful because it can internally encode categorical features.





In [1]:
!pip install -q catboost


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
import pandas as pd
import numpy as np
import time
import tensorflow as tf
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import KFold, RandomizedSearchCV

from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import time


### Load and Prepare Data


In [4]:
# Option 1: local files
# train_path = "/content/cleaned_train_data.csv"
# test_path  = "/content/cleaned_test_data.csv"

# OPTION 2: Google Drive/Google Colab
from google.colab import drive
drive.mount("/content/drive")


train_path = "/content/drive/MyDrive/cleaned_train_data.csv"
test_path  = "/content/drive/MyDrive/cleaned_test_data.csv"

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)

print("Train shape:", train.shape)
print("Test shape :", test.shape)




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train shape: (209926, 41)
Test shape : (40000, 41)


We seperate features/targets, encode object columns, convert boolean columns to integers

In [5]:
TARGET = "Milk_Yield_L"
ID_COL = "Cattle_ID"

X = train.drop(columns=[TARGET, ID_COL])
y = train[TARGET]

X_test = test.drop(columns=[ID_COL], errors="ignore")

print("\nDtypes before encoding:")
print(X.dtypes)

obj_cols = X.select_dtypes(include=["object"]).columns.tolist()
print("\nObject columns to encode:", obj_cols)

for col in obj_cols:
    combined = pd.concat([X[col], X_test[col]], axis=0)
    codes, uniques = pd.factorize(combined)
    X[col] = codes[:len(X)]
    X_test[col] = codes[len(X):]

bool_cols = X.select_dtypes(include=["bool"]).columns.tolist()
if bool_cols:
    X[bool_cols] = X[bool_cols].astype(int)
    X_test[bool_cols] = X_test[bool_cols].astype(int)

print("\nDtypes after encoding:")
print(X.dtypes)


Dtypes before encoding:
Age_Months                            int64
Weight_kg                           float64
Parity                                int64
Lactation_Stage                       int64
Days_in_Milk                          int64
Feed_Type                            object
Feed_Quantity_kg                    float64
Feeding_Frequency                     int64
Water_Intake_L                      float64
Walking_Distance_km                 float64
Grazing_Duration_hrs                float64
Resting_Hours                       float64
Ambient_Temperature_C               float64
Humidity_percent                    float64
Housing_Score                       float64
FMD_Vaccine                           int64
Brucellosis_Vaccine                   int64
HS_Vaccine                            int64
BQ_Vaccine                            int64
Anthrax_Vaccine                       int64
IBR_Vaccine                           int64
BVD_Vaccine                           int64
Rabies_

Since CatBoost functions well with categorical data, we reverse some one-hot ecoding and convert breed/management system data into categorical columns. After restructuring, we construct the final feature matrices and align the training data with the test data with the correct shapes and formatting.

In [6]:
train_cb = train.copy()
test_cb  = test.copy()

print("Starting shapes (copies):")
print(train_cb.shape, test_cb.shape)

breed_cols = [
    'Breed_Brown Swiss',
    'Breed_Brown Swiss ',
    'Breed_Guernsey',
    'Breed_Holstein',
    'Breed_Holstien',
    'Breed_Jersey'
]

present_breed_cols = [c for c in breed_cols if c in train_cb.columns]

if len(present_breed_cols) > 0:
    train_cb['Breed'] = train_cb[present_breed_cols].idxmax(axis=1)
    train_cb['Breed'] = train_cb['Breed'].str.replace('Breed_', '', regex=False).str.strip()
    train_cb = train_cb.drop(columns=present_breed_cols)

    test_cb['Breed'] = test_cb[present_breed_cols].idxmax(axis=1)
    test_cb['Breed'] = test_cb['Breed'].str.replace('Breed_', '', regex=False).str.strip()
    test_cb = test_cb.drop(columns=present_breed_cols)

mgmt_cols = [
    'Management_System_Intensive',
    'Management_System_Mixed',
    'Management_System_Pastoral',
    'Management_System_Semi_Intensive'
]

present_mgmt_cols = [c for c in mgmt_cols if c in train_cb.columns]

if len(present_mgmt_cols) > 0:
    train_cb['Management_System'] = train_cb[present_mgmt_cols].idxmax(axis=1)
    train_cb['Management_System'] = train_cb['Management_System'].str.replace('Management_System_', '', regex=False)
    train_cb = train_cb.drop(columns=present_mgmt_cols)

    test_cb['Management_System'] = test_cb[present_mgmt_cols].idxmax(axis=1)
    test_cb['Management_System'] = test_cb['Management_System'].str.replace('Management_System_', '', regex=False)
    test_cb = test_cb.drop(columns=present_mgmt_cols)

X_cat = train_cb.drop(columns=[TARGET, ID_COL])
y_cat = train_cb[TARGET]

X_test_cat = test_cb.drop(columns=[ID_COL], errors="ignore")

print("\nReversed OHE shapes for CatBoost:")
print("X_cat:", X_cat.shape)
print("y_cat:", y_cat.shape)
print("X_test_cat:", X_test_cat.shape)

print("\nCategorical columns for CatBoost:")
print(X_cat.select_dtypes(include=['object']).columns.tolist())


X_cat = train_cb.drop(columns=[TARGET, ID_COL])
y_cat = train_cb[TARGET]

X_test_cat = test_cb.drop(columns=[ID_COL], errors="ignore")

print("\nReversed OHE shapes for CatBoost (before align):")
print("X_cat:", X_cat.shape)
print("y_cat:", y_cat.shape)
print("X_test_cat:", X_test_cat.shape)

X_test_cat = X_test_cat.reindex(columns=X_cat.columns, fill_value=0)

print("\nReversed OHE shapes for CatBoost (after align):")
print("X_cat:", X_cat.shape)
print("X_test_cat:", X_test_cat.shape)

print("\nCategorical columns for CatBoost:")
print(X_cat.select_dtypes(include=['object']).columns.tolist())


Starting shapes (copies):
(209926, 41) (40000, 41)

Reversed OHE shapes for CatBoost:
X_cat: (209926, 31)
y_cat: (209926,)
X_test_cat: (40000, 32)

Categorical columns for CatBoost:
['Feed_Type', 'Breed', 'Management_System']

Reversed OHE shapes for CatBoost (before align):
X_cat: (209926, 31)
y_cat: (209926,)
X_test_cat: (40000, 32)

Reversed OHE shapes for CatBoost (after align):
X_cat: (209926, 31)
X_test_cat: (40000, 31)

Categorical columns for CatBoost:
['Feed_Type', 'Breed', 'Management_System']


We generate a randomized hyperparameter search space, and then take a randomized sample instead of going through the entire search space.

In [7]:
from itertools import product
import random


depth_values          = [5, 6, 7]
learning_rate_values  = [0.04, 0.05, 0.06]
l2_leaf_reg_values    = [3, 5, 7]
bagging_temp_values   = [0.0, 1.0, 2.0]
random_strength_values= [0.5, 1.0, 1.5]

all_combos = list(product(
    depth_values,
    learning_rate_values,
    l2_leaf_reg_values,
    bagging_temp_values,
    random_strength_values,
))

print("Total possible combos:", len(all_combos))

N_CONFIGS = 10

random.seed(42)
random.shuffle(all_combos)
chosen_combos = all_combos[:N_CONFIGS]

param_grid = []
for (depth, lr, l2, bt, rs) in chosen_combos:
    param_grid.append({
        "depth": depth,
        "learning_rate": lr,
        "l2_leaf_reg": l2,
        "bagging_temperature": bt,
        "random_strength": rs,
    })

print(f"Sampling {len(param_grid)} configs from 1600 possible.")


Total possible combos: 243
Sampling 10 configs from 1600 possible.


We use 3-fold cross validation to search for the best hyperparameters. We use Pools to handle different types of features and cv to enable early stopping.

In [8]:
from catboost import Pool, cv
best_params_cat = None
best_rmse_cat = float("inf")

cat_cols = X_cat.select_dtypes(include=["object", "category"]).columns.tolist()
cat_feature_indices = [X_cat.columns.get_loc(c) for c in cat_cols]

train_pool = Pool(
    data=X_cat,
    label=y_cat,
    cat_features=cat_feature_indices,
)

cv_results_list = []

print("\nStarting LARGE CatBoost CV search...\n")

for i, params in enumerate(param_grid, start=1):
    print(f"Config {i}/{len(param_grid)}: {params}")

    cat_params = {
        "loss_function": "RMSE",
        "eval_metric": "RMSE",
        "iterations": 500,
        "depth": params["depth"],
        "learning_rate": params["learning_rate"],
        "l2_leaf_reg": params["l2_leaf_reg"],
        "bootstrap_type": "Bayesian",
        "bagging_temperature": params["bagging_temperature"],
        "random_strength": params["random_strength"],
        "random_seed": 42,
        "verbose": False,
    }

    start_time = time.time()
    cv_results = cv(
        params=cat_params,
        pool=train_pool,
        fold_count=3,
        shuffle=True,
        partition_random_seed=42,
        verbose=False,
        early_stopping_rounds=100,
    )
    elapsed = time.time() - start_time

    rmse_mean = cv_results["test-RMSE-mean"].iloc[-1]
    rmse_std  = cv_results["test-RMSE-std"].iloc[-1]

    print(f"  CV RMSE: {rmse_mean:.4f} ± {rmse_std:.4f} (time {elapsed:.1f}s)\n")

    cv_results_list.append({
        "params": cat_params,
        "rmse_mean": rmse_mean,
        "rmse_std": rmse_std,
        "time_sec": elapsed,
    })

    if rmse_mean < best_rmse_cat:
        best_rmse_cat = rmse_mean
        best_params_cat = cat_params

print("Best CatBoost CV RMSE:", best_rmse_cat)
print("Best CatBoost params:")
for k, v in best_params_cat.items():
    print(f"  {k}: {v}")



Starting LARGE CatBoost CV search...

Config 1/10: {'depth': 5, 'learning_rate': 0.05, 'l2_leaf_reg': 7, 'bagging_temperature': 0.0, 'random_strength': 0.5}
Training on fold [0/3]

bestTest = 4.103129118
bestIteration = 499

Training on fold [1/3]

bestTest = 4.114352629
bestIteration = 498

Training on fold [2/3]

bestTest = 4.122889194
bestIteration = 496

  CV RMSE: 4.1135 ± 0.0099 (time 167.1s)

Config 2/10: {'depth': 6, 'learning_rate': 0.05, 'l2_leaf_reg': 3, 'bagging_temperature': 1.0, 'random_strength': 1.5}
Training on fold [0/3]

bestTest = 4.101863726
bestIteration = 499

Training on fold [1/3]

bestTest = 4.114209615
bestIteration = 499

Training on fold [2/3]

bestTest = 4.124347091
bestIteration = 498

  CV RMSE: 4.1135 ± 0.0113 (time 198.6s)

Config 3/10: {'depth': 5, 'learning_rate': 0.04, 'l2_leaf_reg': 5, 'bagging_temperature': 0.0, 'random_strength': 0.5}
Training on fold [0/3]

bestTest = 4.104730684
bestIteration = 498

Training on fold [1/3]

bestTest = 4.1166747

We setup the training, validation, and test pools using the best possible parameters.

In [11]:


X_cat_train = X_cat.copy()
y_cat_train = y_cat.copy()
X_cat_test  = X_test_cat.copy()

print("CatBoost Train shape:", X_cat_train.shape)
print("CatBoost Test shape :", X_cat_test.shape)

X_tr, X_val, y_tr, y_val = train_test_split(
    X_cat_train,
    y_cat_train,
    test_size=0.2,
    random_state=42
)

cat_cols = X_tr.select_dtypes(include=["object", "category"]).columns.tolist()
print("Categorical columns for CatBoost:", cat_cols)
cat_feature_indices = [X_tr.columns.get_loc(c) for c in cat_cols]

train_pool_cat = Pool(
    data=X_tr,
    label=y_tr,
    cat_features=cat_feature_indices,
)

val_pool_cat = Pool(
    data=X_val,
    label=y_val,
    cat_features=cat_feature_indices,
)

test_pool_cat = Pool(
    data=X_cat_test,
    cat_features=cat_feature_indices,
)


CatBoost Train shape: (209926, 31)
CatBoost Test shape : (40000, 31)
Categorical columns for CatBoost: ['Feed_Type', 'Breed', 'Management_System']


In [12]:
# best CatBoost params
best_params_cat = {
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "iterations": 1500,
    "depth": 7,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3,
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 0.0,
    "random_strength": 1.0,
    "random_seed": 42,

    "verbose": 100,
    "od_type": "Iter",
    "od_wait": 100
}

We train the final model using the best possible hyperparameters

In [13]:
final_cat = CatBoostRegressor(**best_params_cat)

print("\nTraining final CatBoost model on training split...")
start_time = time.time()
final_cat.fit(
    train_pool_cat,
    eval_set=val_pool_cat,
    use_best_model=True
)

fit_time = time.time() - start_time
print(f"Training completed in {fit_time:.2f} seconds ({fit_time/60:.2f} minutes)")

# check for overfitting
y_tr_pred  = final_cat.predict(train_pool_cat)
y_val_pred = final_cat.predict(val_pool_cat)

rmse_train = mean_squared_error(y_tr, y_tr_pred) ** 0.5
rmse_val   = mean_squared_error(y_val, y_val_pred) ** 0.5

print(f"Train RMSE: {rmse_train:.4f}")
print(f"Valid RMSE: {rmse_val:.4f}")

y_test_pred_cat = final_cat.predict(test_pool_cat)


Training final CatBoost model on training split...
0:	learn: 5.2680538	test: 5.2870592	best: 5.2870592 (0)	total: 163ms	remaining: 4m 5s
100:	learn: 4.1764583	test: 4.1956821	best: 4.1956821 (100)	total: 23.6s	remaining: 5m 26s
200:	learn: 4.1013740	test: 4.1295347	best: 4.1295347 (200)	total: 48.5s	remaining: 5m 13s
300:	learn: 4.0729761	test: 4.1137656	best: 4.1137635 (296)	total: 1m 5s	remaining: 4m 19s
400:	learn: 4.0536182	test: 4.1103752	best: 4.1103595 (387)	total: 1m 20s	remaining: 3m 41s
500:	learn: 4.0383475	test: 4.1099861	best: 4.1098064 (472)	total: 1m 36s	remaining: 3m 11s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 4.109806429
bestIteration = 472

Shrink model to first 473 iterations.
Training completed in 110.07 seconds (1.83 minutes)
Train RMSE: 4.0435
Valid RMSE: 4.1098


Create and save our submission

In [14]:
print("Generating CatBoost predictions on test data...")
test_predictions = y_test_pred_cat  # Already computed earlier

print(f"Predictions shape: {test_predictions.shape}")
print(f"Range: [{test_predictions.min():.2f}, {test_predictions.max():.2f}]")
print(f"Mean: {test_predictions.mean():.2f}")
print(f"Std: {test_predictions.std():.2f}")

if 'Cattle_ID' in test.columns:
    submission = pd.DataFrame({
        'Cattle_ID': test['Cattle_ID'],
        'Milk_Yield_L': test_predictions
    })
else:
    submission = pd.DataFrame({
        'Cattle_ID': range(1, len(test_predictions) + 1),
        'Milk_Yield_L': test_predictions
    })

submission_path = 'submission_catboost.csv'
submission.to_csv(submission_path, index=False)

print(f"\nSubmission file saved to: {submission_path}")
print("Submission shape:", submission.shape)
print("\nSample predictions:")
print(submission.head(10))


Generating CatBoost predictions on test data...
Predictions shape: (40000,)
Range: [4.93, 28.02]
Mean: 15.60
Std: 3.36

Submission file saved to: submission_catboost.csv
Submission shape: (40000, 2)

Sample predictions:
   Cattle_ID  Milk_Yield_L
0          1     19.334382
1          2     10.779332
2          3     22.833837
3          4     15.042348
4          5     18.337105
5          6     19.596349
6          7     15.494489
7          8     18.232735
8          9     22.497323
9         10     14.370754
