### This notebook will serve to firstly treat the variables that we want to include in the model and start trying out different predictive algorithms. 

In [1]:
import kagglehub
import os
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder 
from sklearn.model_selection import KFold
from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
)

In [2]:
# Download latest version
path = kagglehub.dataset_download("shahhet2812/cattle-health-and-feeding-data")

print("Path to dataset files:", path)

# Print the path (you already have it)
print("Path to dataset files:", path)

# List all files in the downloaded directory
files = os.listdir(path)
print("\nFiles in dataset directory:")
for f in files:
    print(f"  {f}")


Path to dataset files: C:\Users\henib\.cache\kagglehub\datasets\shahhet2812\cattle-health-and-feeding-data\versions\1
Path to dataset files: C:\Users\henib\.cache\kagglehub\datasets\shahhet2812\cattle-health-and-feeding-data\versions\1

Files in dataset directory:
  global_cattle_disease_detection_dataset.csv
  global_cattle_milk_yield_prediction_dataset.csv


In [3]:
# Load the CSV file into a DataFrame
cattle = pd.read_csv(os.path.join(path, 'global_cattle_milk_yield_prediction_dataset.csv'))
df = cattle.copy()


In [4]:
df.shape

(250000, 37)

In [5]:
# Getting rid of unnecessary columns 
health_features = ['FMD_Vaccine', 'Brucellosis_Vaccine', 'HS_Vaccine', 'BQ_Vaccine','Anthrax_Vaccine', 'IBR_Vaccine', 'BVD_Vaccine', 'Rabies_Vaccine']
non_relevant_features = ['Date', 'Farm_ID', 'Cattle_ID', 'Previous_Week_Avg_Yield'] #Previous Week Avg Yield serves as a proxy for the target so we must drop otherwise it'll take the highest weight since it's highly correlated with the target
df.drop(columns=health_features+non_relevant_features, inplace=True)

In [6]:
y = df['Milk_Yield_L']
X = df.drop(columns=['Milk_Yield_L'])

In [7]:
# ----------------------------- #
#       Encoding Function       #
# ----------------------------- #

def one_hot_encoding(train_df, test_df, categorical_cols):
    """
    Fit OneHotEncoder on training data and transform both training and test sets.
    Ensures no data leakage and consistent columns between train and test.
    
    Parameters
    ----------
    train_df : pd.DataFrame
        Training DataFrame.
    test_df : pd.DataFrame
        Test DataFrame.
    categorical_cols : list
        List of categorical columns to encode.
        
    Returns
    -------
    X_train_encoded : pd.DataFrame
        Encoded training data.
    X_test_encoded : pd.DataFrame
        Encoded test data.
    encoder : OneHotEncoder
        The fitted encoder (for potential reuse or inverse transform).
    """
    
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    
    # Fit on train, transform both
    train_encoded = encoder.fit_transform(train_df[categorical_cols])
    test_encoded  = encoder.transform(test_df[categorical_cols])
    
    # Get encoded feature names
    encoded_cols = encoder.get_feature_names_out(categorical_cols)
    
    # Convert to DataFrame and keep indices aligned
    train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_cols, index=train_df.index)
    test_encoded_df  = pd.DataFrame(test_encoded, columns=encoded_cols, index=test_df.index)
    
    # Drop original categorical columns and concatenate
    X_train_encoded = pd.concat([train_df.drop(columns=categorical_cols), train_encoded_df], axis=1)
    X_test_encoded  = pd.concat([test_df.drop(columns=categorical_cols), test_encoded_df], axis=1)
    
    return X_train_encoded, X_test_encoded, encoder

# ----------------------------- #
#        Scaling Function       #
# ----------------------------- #

def scale_numerical(train_df, test_df, numerical_cols):
    """
    Fit MinMaxScaler on training numerical columns and transform both training and test sets.
    Prevents data leakage and keeps columns consistent.
    
    Parameters
    ----------
    train_df : pd.DataFrame
        Training DataFrame.
    test_df : pd.DataFrame
        Test DataFrame.
    numerical_cols : list
        List of numerical column names to scale.
        
    Returns
    -------
    X_train_scaled : pd.DataFrame
        Scaled training data.
    X_test_scaled : pd.DataFrame
        Scaled test data.
    scaler : MinMaxScaler
        The fitted scaler (for reuse or inverse transformation).
    """
    
    scaler = MinMaxScaler()
    
    # Fit on train, transform both
    train_scaled = scaler.fit_transform(train_df[numerical_cols])
    test_scaled  = scaler.transform(test_df[numerical_cols])
    
    # Convert back to DataFrames with proper column names and index
    train_scaled_df = pd.DataFrame(train_scaled, columns=numerical_cols, index=train_df.index)
    test_scaled_df  = pd.DataFrame(test_scaled, columns=numerical_cols, index=test_df.index)
    
    # Replace original columns with scaled versions
    X_train_scaled = train_df.copy()
    X_test_scaled  = test_df.copy()
    
    X_train_scaled[numerical_cols] = train_scaled_df
    X_test_scaled[numerical_cols]  = test_scaled_df
    
    return X_train_scaled, X_test_scaled, scaler

# ----------------------------- #
#   Regression CV Function      #
# ----------------------------- #

def cross_validate_regression(model, X, y, categorical_features, numerical_features, n_splits=5):
    """
    Perform K-Fold cross-validation with OneHotEncoder and MinMaxScaler preprocessing
    for regression tasks only.

    Returns R², MAE, MSE, RMSE, and MAPE across folds.
    """

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    metrics = {
        "r2": [],
        "mae": [],
        "mse": [],
        "rmse": [],
        "mape": []
    }

    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        # Split
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Encode + scale
        X_train_enc, X_test_enc, _ = one_hot_encoding(X_train, X_test, categorical_features)
        X_train_final, X_test_final, _ = scale_numerical(X_train_enc, X_test_enc, numerical_features)

        # Train + predict
        model.fit(X_train_final, y_train)
        preds = model.predict(X_test_final)

        # Compute metrics
        r2 = r2_score(y_test, preds)
        mae = mean_absolute_error(y_test, preds)
        mse = mean_squared_error(y_test, preds)
        rmse = np.sqrt(mse)
        mape = mean_absolute_percentage_error(y_test, preds)

        # Save metrics
        metrics["r2"].append(r2)
        metrics["mae"].append(mae)
        metrics["mse"].append(mse)
        metrics["rmse"].append(rmse)
        metrics["mape"].append(mape)

        print(f"Fold {fold}: R²={r2:.4f}, MAE={mae:.4f}, RMSE={rmse:.4f}, MAPE={mape:.4f}")

    # Average metrics
    avg_metrics = {k: np.mean(v) for k, v in metrics.items()}

    print("\n✅ Average metrics across folds:")
    for k, v in avg_metrics.items():
        print(f"{k.upper()}: {v:.4f}")

    return avg_metrics, metrics

In [8]:
categorical_features = [
    'Breed', 'Region', 'Country', 'Climate_Zone',
    'Management_System', 'Lactation_Stage', 'Feed_Type', 'Season', 'Milking_Interval_hrs'
]

numerical_features = [
    'Age_Months', 'Weight_kg', 'Feed_Quantity_kg',
    'Water_Intake_L', 'Walking_Distance_km', 'Grazing_Duration_hrs',
    'Rumination_Time_hrs', 'Resting_Hours', 'Ambient_Temperature_C', 'Days_in_Milk', 'Humidity_percent', 'Housing_Score']

### Linear Regression 

In [9]:
from sklearn.linear_model import LinearRegression 
model = LinearRegression()
avg_metrics, all_metrics = cross_validate_regression(
    model=model,
    X=X,
    y=y,
    categorical_features=categorical_features,
    numerical_features=numerical_features,
    n_splits=5
)

Fold 1: R²=0.5906, MAE=2.8355, RMSE=3.6844, MAPE=446307627500792.9375
Fold 2: R²=0.5976, MAE=2.8143, RMSE=3.6613, MAPE=454211984906556.1875
Fold 3: R²=0.5939, MAE=2.8354, RMSE=3.6872, MAPE=465863069597894.4375
Fold 4: R²=0.5929, MAE=2.8233, RMSE=3.6724, MAPE=452285898466502.3125
Fold 5: R²=0.5948, MAE=2.8062, RMSE=3.6568, MAPE=460054564505736.6250

✅ Average metrics across folds:
R2: 0.5940
MAE: 2.8229
MSE: 13.4867
RMSE: 3.6724
MAPE: 455744628995496.5000


### Decision Tree Regressor 

In [10]:
from sklearn.tree import DecisionTreeRegressor

# Initialize model (you can tweak max_depth or other hyperparameters)
model = DecisionTreeRegressor(max_depth=30, random_state=42)

# Run cross-validation
avg_metrics, all_metrics = cross_validate_regression(
    model=model,
    X=X,
    y=y,
    categorical_features=categorical_features,
    numerical_features=numerical_features,
    n_splits=5
)

Fold 1: R²=0.4040, MAE=3.2466, RMSE=4.4457, MAPE=497996561119688.3125
Fold 2: R²=0.4084, MAE=3.2421, RMSE=4.4393, MAPE=485571044813711.2500
Fold 3: R²=0.4037, MAE=3.2569, RMSE=4.4680, MAPE=502393154298185.6250
Fold 4: R²=0.4021, MAE=3.2477, RMSE=4.4507, MAPE=490620062338136.9375
Fold 5: R²=0.4050, MAE=3.2237, RMSE=4.4312, MAPE=493001246965202.4375

✅ Average metrics across folds:
R2: 0.4046
MAE: 3.2434
MSE: 19.7756
RMSE: 4.4470
MAPE: 493916413906984.8750


### Random Forest Regressor 

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Initialize model
model = RandomForestRegressor(
    n_estimators=100,       # number of trees
    max_depth=None,         # allow trees to expand fully
    min_samples_leaf=100,   # prevent overfitting on leaves
    n_jobs=-1,              # use all CPU cores
    random_state=42
)

# Run cross-validation
avg_metrics, all_metrics = cross_validate_regression(
    model=model,
    X=X,
    y=y,
    categorical_features=categorical_features,
    numerical_features=numerical_features,
    n_splits=3  # fewer folds for faster computation on large dataset
)

Fold 1: R²=0.6452, MAE=2.6445, RMSE=3.4359, MAPE=471019603137895.6875
Fold 2: R²=0.6468, MAE=2.6388, RMSE=3.4327, MAPE=474712644662331.0625
Fold 3: R²=0.6441, MAE=2.6308, RMSE=3.4276, MAPE=467894907766088.9375

✅ Average metrics across folds:
R2: 0.6454
MAE: 2.6380
MSE: 11.7789
RMSE: 3.4320
MAPE: 471209051855438.5625
