# Neural Network (MLP) for Cattle Milk Yield Prediction

This notebook trains a fully connected neural network (MLP) to predict milk yield from a cleaned feature set.



*   Standardized feature matrix with z score normalization
*   Uses EarlyStopping and ReduceLROnPlateau to optimize training duration and learning rate
*   80/20 train-validation split
*   Retrains final model on full dataset with optimal epoch count

Since neural networks can easily overfit, especially on tabular data, we implemented regularization and validation monitoring to ensure model stability and predictive accuracy.



In [4]:
import pandas as pd
import numpy as np
import time
import tensorflow as tf

from sklearn.model_selection import KFold, RandomizedSearchCV
from xgboost import XGBRegressor

### Load and Prepare Data


In [5]:
# Where the cleaned CSVs live. Two setups are supported:
#
# Option 1: local files
# train_path = "/content/cleaned_train_data.csv"
# test_path  = "/content/cleaned_test_data.csv"
#
# OPTION 2: Google Drive/Google Colab (the active setup below)
from google.colab import drive

drive.mount("/content/drive")

train_path = "/content/drive/MyDrive/ml final/cleaned_train_data.csv"
test_path = "/content/drive/MyDrive/ml final/cleaned_test_data.csv"

# Read both CSVs (train first, then test) into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Quick sanity check on what was loaded.
print("Train shape:", train.shape)
print("Test shape :", test.shape)




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train shape: (209926, 41)
Test shape : (40000, 41)


We separate features from the target, integer-encode object columns, and convert boolean columns to integers.

In [6]:
TARGET = "Milk_Yield_L"
ID_COL = "Cattle_ID"

# Target and feature matrix come from the training frame; the ID column is
# removed from both frames (silently skipped if the test frame lacks it).
y = train[TARGET]
X = train.drop(columns=[TARGET, ID_COL])
X_test = test.drop(columns=[ID_COL], errors="ignore")

print("\nDtypes before encoding:")
print(X.dtypes)

obj_cols = list(X.select_dtypes(include=["object"]).columns)
print("\nObject columns to encode:", obj_cols)

# Factorize every string column over train and test stacked together, so a
# given category receives the same integer code in both frames.
n_train_rows = len(X)
for col in obj_cols:
    stacked = pd.concat([X[col], X_test[col]], axis=0)
    codes = pd.factorize(stacked)[0]
    X[col], X_test[col] = codes[:n_train_rows], codes[n_train_rows:]

# Boolean columns (as detected on the training features) become 0/1 integers.
bool_cols = list(X.select_dtypes(include=["bool"]).columns)
if bool_cols:
    for frame in (X, X_test):
        frame[bool_cols] = frame[bool_cols].astype(int)

print("\nDtypes after encoding:")
print(X.dtypes)


Dtypes before encoding:
Age_Months                            int64
Weight_kg                           float64
Parity                                int64
Lactation_Stage                       int64
Days_in_Milk                          int64
Feed_Type                            object
Feed_Quantity_kg                    float64
Feeding_Frequency                     int64
Water_Intake_L                      float64
Walking_Distance_km                 float64
Grazing_Duration_hrs                float64
Resting_Hours                       float64
Ambient_Temperature_C               float64
Humidity_percent                    float64
Housing_Score                       float64
FMD_Vaccine                           int64
Brucellosis_Vaccine                   int64
HS_Vaccine                            int64
BQ_Vaccine                            int64
Anthrax_Vaccine                       int64
IBR_Vaccine                           int64
BVD_Vaccine                           int64
Rabies_

### Neural Network

We set up NumPy arrays and apply feature standardization.

In [7]:

# Put the test columns in the same order as the training features, then
# cast features and target to float32 NumPy arrays.
X_test_nn_df = X_test[X.columns]
X_nn = X.astype("float32").to_numpy()
X_test_nn = X_test_nn_df.astype("float32").to_numpy()
y_nn = np.asarray(y, dtype="float32")

# standardize features: per-column mean/std are taken from the training
# matrix and applied to both train and test; the 1e-8 keeps the divisor
# non-zero when a column has zero variance.
# NOTE(review): these statistics are computed on all training rows before
# the train/validation split further down, so validation rows contribute
# to the scaling.
feat_mean = X_nn.mean(axis=0, keepdims=True)
feat_std = X_nn.std(axis=0, keepdims=True) + 1e-8

X_nn, X_test_nn = (
    (block - feat_mean) / feat_std for block in (X_nn, X_test_nn)
)

input_dim = X_nn.shape[1]
print("Input dim:", input_dim)



Input dim: 39


We set up an 80/20 train/validation split.

In [8]:
# Shuffle row positions with a fixed seed, then take the first 20% of the
# shuffled positions as the validation set and the rest as training.
val_frac = 0.2
rng = np.random.default_rng(42)
indices = rng.permutation(len(X_nn))
val_size = int(len(X_nn) * val_frac)

val_idx, train_idx = np.split(indices, [val_size])

X_val_nn, y_val_nn = X_nn[val_idx], y_nn[val_idx]
X_train_nn, y_train_nn = X_nn[train_idx], y_nn[train_idx]

print("Train samples:", X_train_nn.shape[0])
print("Val samples  :", X_val_nn.shape[0])



Train samples: 167941
Val samples  : 41985


We set up a fully connected multi-layer perceptron that stacks dense layers with ReLU activations to learn nonlinear relationships, while dropout and batch normalization are used to reduce overfitting.



In [9]:

def build_mlp(input_dim,
              hidden_units=(256, 128, 64),
              dropout=0.2,
              lr=1e-3):
    """Build and compile a fully connected regression network.

    Every entry of ``hidden_units`` adds one stage consisting of
    Dense (ReLU) -> BatchNormalization -> Dropout. A single linear unit
    on top produces the prediction.

    Args:
        input_dim: Number of input features per sample.
        hidden_units: Iterable of layer widths, one per hidden stage.
        dropout: Dropout rate used after every hidden stage.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` that minimises MSE and reports a
        root-mean-squared-error metric under the name ``"rmse"``.
    """
    layers = tf.keras.layers

    net_in = tf.keras.Input(shape=(input_dim,))
    hidden = net_in
    for width in hidden_units:
        hidden = layers.Dense(width, activation="relu")(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(dropout)(hidden)
    net_out = layers.Dense(1, activation="linear")(hidden)

    model = tf.keras.Model(net_in, net_out)

    adam = tf.keras.optimizers.Adam(learning_rate=lr)
    rmse_metric = tf.keras.metrics.RootMeanSquaredError(name="rmse")
    model.compile(optimizer=adam, loss="mse", metrics=[rmse_metric])
    return model



Initialize model and set hyperparameters/callbacks to improve training stability.

In [10]:

# Seed Python, NumPy and TensorFlow RNGs before the model is built so that
# weight initialisation, dropout masks and batch shuffling start from the
# same state on every Restart & Run All (GPU kernels may still introduce
# small run-to-run differences).
tf.keras.utils.set_random_seed(42)

model = build_mlp(
    input_dim=input_dim,
    hidden_units=(256, 128, 64),
    dropout=0.2,
    lr=1e-3,
)

model.summary()

batch_size = 1024
epochs = 50  # upper bound; EarlyStopping may end training sooner

callbacks = [
    # Stop once validation RMSE has not improved for 5 epochs and roll the
    # weights back to the best epoch, to limit overfitting.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_rmse",
        mode="min",  # RMSE is an error: lower is better (explicit, not inferred)
        patience=5,
        restore_best_weights=True,
        verbose=1,
    ),
    # Halve the learning rate after 3 epochs without validation improvement,
    # never going below 1e-5.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_rmse",
        mode="min",
        factor=0.5,
        patience=3,
        min_lr=1e-5,
        verbose=1,
    ),
]

print("Training NN...")
start_time = time.time()

history = model.fit(
    X_train_nn,
    y_train_nn,
    validation_data=(X_val_nn, y_val_nn),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callbacks,
    verbose=1,
)

elapsed_min = (time.time() - start_time) / 60
print(f"\nNN training finished in {elapsed_min:.2f} minutes.")

# evaluate() returns [loss, rmse] in the order given to compile().
val_loss, val_rmse = model.evaluate(X_val_nn, y_val_nn, verbose=0)
print(f"\nValidation RMSE (NN): {val_rmse:.4f}")

Training NN...
Epoch 1/50
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - loss: 247.0498 - rmse: 15.7106 - val_loss: 146.1953 - val_rmse: 12.0911 - learning_rate: 0.0010
Epoch 2/50
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 113.1106 - rmse: 10.5887 - val_loss: 30.9410 - val_rmse: 5.5625 - learning_rate: 0.0010
Epoch 3/50
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 29.2592 - rmse: 5.4044 - val_loss: 19.4142 - val_rmse: 4.4062 - learning_rate: 0.0010
Epoch 4/50
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 23.1953 - rmse: 4.8161 - val_loss: 19.0851 - val_rmse: 4.3686 - learning_rate: 0.0010
Epoch 5/50
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 22.6277 - rmse: 4.7567 - val_loss: 18.8459 - val_rmse: 4.3412 - learning_rate: 0.0010
Epoch 6/50
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6

We determine the number of training epochs from the validation run above and retrain a fresh model on the entire dataset for that many epochs.

In [11]:

# Number of epochs to refit for: the 1-based epoch at which validation RMSE
# was lowest. len(history.history[...]) would instead count every epoch that
# ran, including the patience epochs after the best one (or simply the
# `epochs` cap when early stopping never fired).
val_rmse_per_epoch = history.history["val_rmse"]
best_epochs = int(np.argmin(val_rmse_per_epoch)) + 1

# Seed all RNGs so the refit is repeatable.
tf.keras.utils.set_random_seed(42)

print(f"\nTraining final NN on all data for {best_epochs} epochs...")
final_model_nn = build_mlp(
    input_dim=input_dim,
    hidden_units=(256, 128, 64),
    dropout=0.2,
    lr=1e-3,
)

# NOTE: the validation run also used ReduceLROnPlateau, which needs a
# validation metric; this refit has no validation data and keeps a constant
# learning rate, so the epoch count only approximates the same schedule.
# The trailing semicolon keeps the History repr out of the cell output.
final_model_nn.fit(
    X_nn,
    y_nn,
    epochs=best_epochs,
    batch_size=batch_size,
    verbose=1,
);




Training final NN on all data for 50 epochs...
Epoch 1/50
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 238.9706 - rmse: 15.4451
Epoch 2/50
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 73.0874 - rmse: 8.4925
Epoch 3/50
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 23.8789 - rmse: 4.8863
Epoch 4/50
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 22.2230 - rmse: 4.7141
Epoch 5/50
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 21.6425 - rmse: 4.6521
Epoch 6/50
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 21.0069 - rmse: 4.5833
Epoch 7/50
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 20.5897 - rmse: 4.5376
Epoch 8/50
[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 20.2929 - rmse: 4.5047
Epoch 9/50
[

<keras.src.callbacks.history.History at 0x79009018c4a0>

We produce and save our submission

In [12]:

# Error of the final model on the same rows it was fitted on.
nn_train_preds = np.squeeze(final_model_nn.predict(X_nn, batch_size=1024))
squared_residuals = (y - nn_train_preds) ** 2
print("  NN Train RMSE:       ", np.sqrt(np.mean(squared_residuals)))


# Predictions for the standardized test features.
test_preds_nn = np.squeeze(
    final_model_nn.predict(X_test_nn, batch_size=batch_size)
)

# Two-column submission: ID from the raw test frame plus the prediction.
nn_submission = test[["Cattle_ID"]].assign(Milk_Yield_L=test_preds_nn)

nn_submission_path = "/content/nn_submission.csv"
nn_submission.to_csv(nn_submission_path, index=False)

print(f"\nSaved NN submission to: {nn_submission_path}")
nn_submission.head()

[1m206/206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
  NN Train RMSE:        4.17654700219103
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step

Saved NN submission to: /content/nn_submission.csv


Unnamed: 0,Cattle_ID,Milk_Yield_L
0,1,18.925587
1,2,10.408438
2,3,22.020819
3,4,14.712931
4,5,17.246048
