# Regression with a Wild Blueberry Yield Dataset

This notebook trains an XGBoost regression model to predict wild blueberry yields. It loads the training and test data, standardizes the features with StandardScaler, trains the model, generates predictions, creates a Kaggle submission file, and saves the model and the fitted scaler for web deployment. The model uses basic hyperparameters, and its performance is validated with RMSE on a held-out 20% split of the training data.

Dataset: https://www.kaggle.com/competitions/playground-series-s3e14/data

Hugging Face: https://huggingface.co/spaces/alperugurcan/blueberry-yield-predictor

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import joblib

def train_model(
    *,
    train_path: str = "/kaggle/input/playground-series-s3e14/train.csv",
    test_path: str = "/kaggle/input/playground-series-s3e14/test.csv",
    submission_path: str = 'submission.csv',
    model_path: str = 'model.joblib',
    scaler_path: str = 'scaler.joblib',
    test_size: float = 0.2,
    random_state: int = 42,
) -> float:
    """Train an XGBoost yield regressor, write a submission, and save artifacts.

    The training CSV is split into train/validation parts, features are
    standardized with a scaler fitted on the training part only, and an
    ``XGBRegressor`` is fitted on the scaled training features. Predictions
    for the test CSV are written as a two-column (``id``, ``yield``) file,
    and the fitted model and scaler are dumped with joblib.

    All parameters are keyword-only and default to the values that were
    previously hard-coded, so ``train_model()`` behaves as before.

    Args:
        train_path: CSV with an ``id`` column, a ``yield`` target column,
            and feature columns.
        test_path: CSV with an ``id`` column and the same feature columns.
        submission_path: Where the prediction CSV is written.
        model_path: Where the fitted model is dumped.
        scaler_path: Where the fitted scaler is dumped.
        test_size: Fraction of the training rows held out for validation.
        random_state: Seed for the train/validation split.

    Returns:
        The RMSE of the model on the held-out validation split (this value
        is also printed).
    """
    # Load data
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Keep the test IDs; they are needed for the submission file but must
    # not be used as a feature.
    test_ids = test['id']

    # Drop ID column and prepare features
    X = train.drop(['id', 'yield'], axis=1)
    y = train['yield']
    X_test = test.drop(['id'], axis=1)

    # Split first, then scale: the scaler is fitted on the training part
    # only so validation/test statistics do not leak into it.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    model = XGBRegressor(
        learning_rate=0.1,
        max_depth=3,
        n_estimators=100
    )
    model.fit(X_train_scaled, y_train)

    # Make predictions on test set
    test_predictions = model.predict(X_test_scaled)

    # Create submission file
    submission = pd.DataFrame({
        'id': test_ids,
        'yield': test_predictions
    })
    submission.to_csv(submission_path, index=False)

    # Save model and scaler for web app; the scaler must be saved too
    # because the model was trained on scaled features.
    joblib.dump(model, model_path)
    joblib.dump(scaler, scaler_path)

    # Report validation RMSE and hand it back to the caller
    val_predictions = model.predict(X_val_scaled)
    val_score = float(np.sqrt(np.mean((y_val - val_predictions) ** 2)))
    print(f"Validation RMSE: {val_score:.4f}")
    return val_score

# Run the full training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    train_model()

Validation RMSE: 554.3770
