In [8]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the training features and drop rows whose 'Bags' target is missing
train_df = pd.read_csv('train_area_features.csv').dropna(subset=['Bags'])

# Single predictor column (kept 2-D by the double brackets) and the target
X = train_df[['Area']]
y = train_df['Bags']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Write the filtered frame to 'train_features.csv' without the index column
train_df.to_csv('train_features.csv', index=False)

In [9]:
# STACKING REGRESSION ENSEMBLE MODEL

# Seed passed to every estimator that draws random numbers while fitting
# (forests, gradient boosting, MLP weight initialisation). Without it the
# reported metrics and the model used for the final predictions change on
# every run of the notebook.
RANDOM_STATE = 42

# Level-0 models: each entry is a (name, estimator) pair whose predictions are
# fed to the meta-learner by StackingRegressor.
base_learners = [
    ('lr', LinearRegression()),
    ('svr', SVR(kernel='rbf', C=0.1, epsilon=0.1)),
    ('knn', KNeighborsRegressor(n_neighbors=10, p=1)),
    ('rf', RandomForestRegressor(
        n_estimators=50,
        max_depth=20,
        min_samples_split=5,
        random_state=RANDOM_STATE,
    )),
]

# Candidate level-1 (final) estimators, keyed by the label used when reporting.
meta_learners = {
    'Linear Regression': LinearRegression(),
    'RidgeCV': RidgeCV(),
    'Random Forest': RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        random_state=RANDOM_STATE,
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        random_state=RANDOM_STATE,
    ),
    'MLP Regressor': MLPRegressor(
        hidden_layer_sizes=(50, 50),
        max_iter=500,
        random_state=RANDOM_STATE,
    ),
}

# Grid of valid prediction values: 0.05, 0.10, ..., 6.00 (120 points, step 0.05).
# np.linspace is used instead of np.arange because arange with a fractional
# step relies on floating-point rounding for both its length and its last
# element; linspace fixes the count and makes the endpoint exactly 6.0.
valid_range = np.linspace(0.05, 6.0, 120)

# Function to snap predictions to the nearest valid value
def snap_to_nearest_valid(predictions, valid_range):
    """Replace each prediction with the closest entry of `valid_range`.

    Parameters
    ----------
    predictions : 1-D sequence of numbers
        Values to snap.
    valid_range : numpy.ndarray
        1-D array of allowed values.

    Returns
    -------
    numpy.ndarray
        One entry per prediction, each taken from `valid_range`. When two
        allowed values are equally close, the one that appears first in
        `valid_range` is chosen.
    """
    preds = np.asarray(predictions)
    # Row i holds |valid_range - preds[i]| for every allowed value
    distances = np.abs(valid_range - preds[:, np.newaxis])
    # argmin returns the first minimum in each row
    closest = distances.argmin(axis=1)
    return valid_range[closest]


def _snapped_scores(model, features, targets):
    """Return (MSE, R^2) for `model` on one split, scored on snapped predictions.

    The model's predictions for `features` are snapped onto `valid_range`
    before being compared with `targets`.
    """
    snapped = snap_to_nearest_valid(model.predict(features), valid_range)
    return mean_squared_error(targets, snapped), r2_score(targets, snapped)


# Fit one stacked ensemble per candidate meta-learner and report its scores.
# After the loop, `stacking_regressor` stays bound to the ensemble built in the
# final iteration.
for name, meta_learner in meta_learners.items():
    stacking_regressor = StackingRegressor(
        estimators=base_learners,
        final_estimator=meta_learner,
    )
    stacking_regressor.fit(X_train, y_train)

    # Score the training split first, then the held-out split
    train_mse, train_r2 = _snapped_scores(stacking_regressor, X_train, y_train)
    test_mse, test_r2 = _snapped_scores(stacking_regressor, X_test, y_test)

    print(
        f"Meta-Learner: {name}",
        f"Training MSE: {train_mse:.4f}, R^2: {train_r2:.4f}",
        f"Test MSE: {test_mse:.4f}, R^2: {test_r2:.4f}",
        "-" * 50,
        sep="\n",
    )

Meta-Learner: Linear Regression
Training MSE: 1.6339, R^2: 0.0075
Test MSE: 0.4673, R^2: -0.2381
--------------------------------------------------
Meta-Learner: RidgeCV
Training MSE: 1.5875, R^2: 0.0357
Test MSE: 0.4593, R^2: -0.2169
--------------------------------------------------
Meta-Learner: Random Forest
Training MSE: 1.6082, R^2: 0.0231
Test MSE: 0.5675, R^2: -0.5035
--------------------------------------------------
Meta-Learner: Gradient Boosting
Training MSE: 1.7322, R^2: -0.0522
Test MSE: 0.6579, R^2: -0.7431
--------------------------------------------------
Meta-Learner: MLP Regressor
Training MSE: 1.8012, R^2: -0.0942
Test MSE: 0.4685, R^2: -0.2414
--------------------------------------------------


In [10]:
import pandas as pd

# `stacking_regressor` is the ensemble left over from the last iteration of the
# meta-learner comparison loop (the final entry of `meta_learners`).

# Load the dataset to predict on; it must provide the 'Area' feature used for
# training and an 'ID' column identifying each row.
their_test_df = pd.read_csv('test_area_features.csv')

# Predict, then snap onto the valid-value grid exactly as was done when the
# model was scored, so the exported values match the reported metrics.
# Rounding to 2 decimals strips floating-point noise (e.g. 0.15000000000000002)
# from the grid values before they are written out.
raw_predictions = stacking_regressor.predict(their_test_df[['Area']])
predictions = np.round(snap_to_nearest_valid(raw_predictions, valid_range), 2)

# Output columns: the source 'ID' is exported under the header 'Pothole number'
results_df = pd.DataFrame({
    'Pothole number': their_test_df['ID'],
    'Bags used': predictions
})

# Write the results to a .csv file
results_df.to_csv('their_test_predictions.csv', index=False)

print("Predictions have been saved to 'their_test_predictions.csv'")

Predictions have been saved to 'their_test_predictions.csv'
