In [13]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score


# Load the data
train_df = pd.read_csv('data-v6/train/train.csv')
test_df = pd.read_csv('data-v6/test/test.csv')
valid_df = pd.read_csv('data-v6/valid/valid.csv')

# Combine train and valid datasets for training
train_df = pd.concat([train_df, valid_df])
train_df = train_df[train_df['pothole_area_mm2'] <= 100000000]
train_df = train_df.dropna()  
test_df = test_df.dropna()

total_train_df = pd.concat([train_df, test_df])

train_df.to_csv('train_features.csv', index=False)

X_train = train_df.drop(columns=['Bags used '])  # Replace 'target' with your target column name
y_train = train_df['Bags used ']
X_test = test_df.drop(columns=['Bags used '])  # Replace 'target' with your target column name
y_test = test_df['Bags used ']

print("Data ready!")

Data ready!


In [8]:
X_train_poly = PolynomialFeatures(degree=1).fit_transform(X_train)
X_test_poly = PolynomialFeatures(degree=1).fit_transform(X_test)

# Define hyperparameter configurations to test
alpha_values = [0.1, 1.0, 10.0, 50.0, 100.0]
degree_values = [1, 2, 3]

# Initialize variables to store the best model
best_model = None
best_train_mse = float('inf')
best_test_mse = float('inf')

# Loop over different configurations
for alpha in alpha_values:
    for degree in degree_values:
        # Create polynomial features
        poly = PolynomialFeatures(degree=degree)
        X_train_poly = poly.fit_transform(X_train)
        X_test_poly = poly.transform(X_test)

        # Initialize and train the Ridge model
        model = Ridge(alpha=alpha)
        model.fit(X_train_poly, y_train)

        # Predict on training data
        y_train_pred = model.predict(X_train_poly)
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_r2 = r2_score(y_train, y_train_pred)

        # Predict on test data
        y_test_pred = model.predict(X_test_poly)
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        # Update the best model if this one is better
        if test_mse < best_test_mse:
            best_model = {
                'Model': 'Ridge Regression',
                'Optimal Configuration': {'alpha': alpha, 'degree': degree},
                'Train MSE': train_mse,
                'Train R2': train_r2,
                'Test MSE': test_mse,
                'Test R2': test_r2
            }
            best_train_mse = train_mse
            best_test_mse = test_mse

# Convert the best model's results to a DataFrame
best_model_df = pd.DataFrame([best_model])

print("Best Model Configuration:")
print(best_model_df)

# Optional: Save the best model's results to a CSV file
best_model_df.to_csv('best_ridge_regression_result.csv', index=False)

Best Model Configuration:
              Model        Optimal Configuration  Train MSE  Train R2  \
0  Ridge Regression  {'alpha': 0.1, 'degree': 1}   0.814519  0.316907   

       Test MSE        Test R2  
0  1.847207e+06 -201580.164586  


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [9]:
print(best_model_df)

              Model        Optimal Configuration  Train MSE  Train R2  \
0  Ridge Regression  {'alpha': 0.1, 'degree': 1}   0.814519  0.316907   

       Test MSE        Test R2  
0  1.847207e+06 -201580.164586  


In [15]:
# LINEAR LASSO REGRESSION

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define hyperparameter configurations to test
alpha_values = [0.01, 0.1, 1.0, 10.0]
degree_values = [1, 2, 3]

# Initialize variables to store the best model
best_model = None
best_train_mse = float('inf')
best_test_mse = float('inf')

# Loop over different configurations
for alpha in alpha_values:
    for degree in degree_values:
        # Create polynomial features
        poly = PolynomialFeatures(degree=degree)
        X_train_poly = poly.fit_transform(X_train_scaled)
        X_test_poly = poly.transform(X_test_scaled)

        # Initialize and train the Lasso model
        model = Lasso(alpha=alpha, random_state=42, max_iter=10000)
        model.fit(X_train_poly, y_train)

        # Predict on training data
        y_train_pred = model.predict(X_train_poly)
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_r2 = r2_score(y_train, y_train_pred)

        # Predict on test data
        y_test_pred = model.predict(X_test_poly)
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        # Update the best model if this one is better
        if test_mse < best_test_mse:
            best_model = {
                'Model': 'Lasso Regression',
                'Optimal Configuration': {'alpha': alpha, 'degree': degree},
                'Train MSE': train_mse,
                'Train R2': train_r2,
                'Test MSE': test_mse,
                'Test R2': test_r2
            }
            best_train_mse = train_mse
            best_test_mse = test_mse

# Append the best model's results to the existing best_model_df
if best_model is not None:
    best_model_df = pd.concat([best_model_df, pd.DataFrame([best_model])], ignore_index=True)


print("Best Model Configuration Appended:")
print(best_model_df)

# Optional: Save the updated best_model_df to a CSV file
best_model_df.to_csv('best_model_df.csv', index=False)



Best Model Configuration Appended:
              Model        Optimal Configuration  Train MSE  Train R2  \
0  Ridge Regression  {'alpha': 0.1, 'degree': 1}   0.814519  0.316907   
1  Lasso Regression  {'alpha': 1.0, 'degree': 1}   1.192399  0.000000   

       Test MSE        Test R2  
0  1.847207e+06 -201580.164586  
1  9.339497e+00      -0.019196  
