<a href="https://colab.research.google.com/github/anushka827/model-1/blob/main/Stacking_Regressor_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split # Added for potential future use or validation

# Step 1: Load the datasets
# Load training data from 'train.csv'
train_df = pd.read_csv('train.csv')
# Load test data from 'test.csv'
test_df = pd.read_csv('test.csv')

# Step 2: Define features and target variable
# These are the columns used as input for the model
features = ['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia',
            'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange',
            'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange',
            'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass', 'seeds']
# This is the column the model will try to predict
target = 'output'

# Separate features (X) and target (y) for the training set
X_train = train_df[features]
y_train = train_df[target]

# Prepare the features for the test set (no 'output' column in test data)
X_test = test_df[features]

# Step 3: Define Base Models for Stacking
# We'll use a few different types of regressors as our base learners.
# These models will make predictions, and their predictions will then be fed into the final estimator.
estimators = [
    ('gb', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('ridge', Ridge(random_state=42))
]

# Step 4: Initialize and train the StackingRegressor model
# The StackingRegressor takes a list of (name, estimator) tuples for its base models.
# final_estimator: This is the meta-model that learns to combine the predictions of the base estimators.
# cv: Determines the cross-validation splitting strategy. 5-fold cross-validation is a good default.
# n_jobs: Number of jobs to run in parallel (-1 means use all available processors).
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(random_state=42), # Using Ridge as the meta-model
    cv=5, # 5-fold cross-validation for generating meta-features
    n_jobs=-1 # Use all available CPU cores for parallel processing
)

print("Training Stacking Regressor model...")
# Train the stacking model using the training data
stacking_model.fit(X_train, y_train)
print("Stacking Regressor model training complete.")

# Step 5: Make predictions on the training data
# These predictions are used to evaluate how well the model learned from the data it was trained on.
y_train_pred_stack = stacking_model.predict(X_train)

# Step 6: Evaluate the model on the training data
# Calculate common regression metrics:
# MAE (Mean Absolute Error)
# MSE (Mean Squared Error)
# RMSE (Root Mean Squared Error)
# R2 (R-squared)
mae_train_stack = mean_absolute_error(y_train, y_train_pred_stack)
mse_train_stack = mean_squared_error(y_train, y_train_pred_stack)
rmse_train_stack = mse_train_stack**0.5
r2_train_stack = r2_score(y_train, y_train_pred_stack)

# Print the evaluation metrics for the training set
print("\nStacking Regressor Training Metrics:")
print(f"Mean Absolute Error (MAE): {mae_train_stack:.2f}")
print(f"Mean Squared Error (MSE): {mse_train_stack:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_train_stack:.2f}")
print(f"R-squared (R2): {r2_train_stack:.2f}")

# Step 7: Make predictions on the test data
# These are predictions on unseen data.
y_test_pred_stack = stacking_model.predict(X_test)

# Create a DataFrame to store the test predictions
test_predictions_stack_df = pd.DataFrame({
    'id': test_df['id'],
    'predicted_output_stack': y_test_pred_stack
})

# Display the first few rows of the generated test predictions
print("\nStacking Regressor Test Predictions Head:")
print(test_predictions_stack_df.head())

# Save the test predictions to a new CSV file
test_predictions_stack_df.to_csv('test_predictions_stacking.csv', index=False)
print("\nTest predictions saved to 'test_predictions_stacking.csv'")

Training Stacking Regressor model...
Stacking Regressor model training complete.

Stacking Regressor Training Metrics:
Mean Absolute Error (MAE): 193.87
Mean Squared Error (MSE): 84793.81
Root Mean Squared Error (RMSE): 291.19
R-squared (R2): 0.95

Stacking Regressor Test Predictions Head:
   id  predicted_output_stack
0   1             4225.726124
1   2             3456.484551
2   3             6365.235795
3   4             4189.143457
4   5             7386.185374

Test predictions saved to 'test_predictions_stacking.csv'
