In [8]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
train_df = pd.read_csv('data-v6/train/train.csv')
test_df = pd.read_csv('data-v6/test/test.csv')
valid_df = pd.read_csv('data-v6/valid/valid.csv')

# Combine train and valid datasets for training
train_df = pd.concat([train_df, valid_df])

train_df = train_df.dropna()  
test_df = test_df.dropna()

total_train_df = pd.concat([train_df, test_df])

train_df.to_csv('train_features.csv', index=False)

In [7]:
X_train = train_df.drop(columns=['Bags used '])  # Replace 'target' with your target column name
y_train = train_df['Bags used ']
X_test = test_df.drop(columns=['Bags used '])  # Replace 'target' with your target column name
y_test = test_df['Bags used ']

In [9]:
# STACKING REGRESSION ENSEMBLE MODEL

base_learners = [
    ('lr', LinearRegression()),
    ('svr', SVR(kernel='rbf', C=0.1, epsilon=0.1)),
    ('knn', KNeighborsRegressor(n_neighbors=10, p=1)),
    ('rf', RandomForestRegressor(n_estimators=50, max_depth=20, min_samples_split=5))
]

meta_learners = {
    'Linear Regression': LinearRegression(),
    'RidgeCV': RidgeCV(),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1),
    'MLP Regressor': MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=500),
}

for name, meta_learner in meta_learners.items():
    # Build the stacking regressor
    stacking_regressor = StackingRegressor(estimators=base_learners, final_estimator=meta_learner)
    
    # Fit the stacking regressor
    stacking_regressor.fit(X_train, y_train)
    
    # Predict on training data
    y_train_pred = stacking_regressor.predict(X_train)
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    
    # Predict on test data
    y_test_pred = stacking_regressor.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    # Print results
    print(f"Meta-Learner: {name}")
    print(f"Training MSE: {train_mse:.4f}, R^2: {train_r2:.4f}")
    print(f"Test MSE: {test_mse:.4f}, R^2: {test_r2:.4f}")
    print("-" * 50)

Meta-Learner: Linear Regression
Training MSE: 0.5826, R^2: 0.5110
Test MSE: 9.9917, R^2: -0.0904
--------------------------------------------------
Meta-Learner: RidgeCV
Training MSE: 0.5650, R^2: 0.5258
Test MSE: 9.9606, R^2: -0.0870
--------------------------------------------------
Meta-Learner: Random Forest
Training MSE: 0.8349, R^2: 0.2992
Test MSE: 10.3480, R^2: -0.1292
--------------------------------------------------
Meta-Learner: Gradient Boosting
Training MSE: 1.1089, R^2: 0.0693
Test MSE: 11.7726, R^2: -0.2847
--------------------------------------------------
Meta-Learner: MLP Regressor
Training MSE: 0.5639, R^2: 0.5267
Test MSE: 10.2171, R^2: -0.1150
--------------------------------------------------


In [13]:
import pandas as pd

# Assuming `stacking_regressor` is the trained model with MLPRegressor as the meta-learner
# Load the new dataset
their_test_df = pd.read_csv('their_test.csv')

# Preprocess the new data if necessary
# Ensure the feature columns in 'their_test.csv' match those used during training
# X_their_test = their_test_df.drop(columns=['pothole_id'])  # Drop the 'Pothole number' column if it exists

# Make predictions
predictions = stacking_regressor.predict(their_test_df)

# Create a DataFrame with 'Pothole number' and 'Bags used'
results_df = pd.DataFrame({
    'Pothole number': their_test_df['pothole_id'],  # Assuming 'Pothole number' is a column in their_test_df
    'Bags used': predictions
})

# Write the results to a .csv file
results_df.to_csv('their_test_predictions.csv', index=False)

print("Predictions have been saved to 'their_test_predictions.csv'")


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- pothole_id
