In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import classification_report, accuracy_score

In [56]:
# Load data
data = pd.read_csv('all_potholes_cm.csv')

# Handle NaN values
data['Bags used '] = data['Bags used '].fillna(data['Bags used '].median())
data = data.dropna()

In [57]:
# Calculate Z-scores
data['z_score_area'] = stats.zscore(data['pothole_area_cm2'])
data['z_score_bags'] = stats.zscore(data['Bags used '])

In [58]:
# Define Z-score threshold for outliers
z_threshold = 3

# Label outliers
data['is_outlier'] = data['z_score_bags'].abs() > z_threshold

# Prepare features and labels for outlier detection
X_outlier_classification = data[['pothole_area_mm2']]
y_outlier_classification = data['is_outlier']

In [59]:
# Split data into training and testing sets for outlier classification
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_outlier_classification, y_outlier_classification, test_size=0.2, random_state=42)


# Train an outlier classifier
outlier_classifier = RandomForestClassifier(random_state=42)
outlier_classifier.fit(X_train_class, y_train_class)

# Predict outliers
data['predicted_outlier'] = outlier_classifier.predict(X_outlier_classification)

# Evaluate the classifier
accuracy = accuracy_score(y_outlier_classification, data['predicted_outlier'])
report = classification_report(y_outlier_classification, data['predicted_outlier'])

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Accuracy: 0.9954441913439636
Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       866
        True       0.83      0.83      0.83        12

    accuracy                           1.00       878
   macro avg       0.92      0.92      0.92       878
weighted avg       1.00      1.00      1.00       878



In [60]:
# Separate outliers and non-outliers
outliers = data[data['predicted_outlier'] == 1]
non_outliers = data[data['predicted_outlier'] == 0]

# Features and targets for non-outliers
X_non_outliers = non_outliers[['pothole_area_cm2']]
y_non_outliers = non_outliers['Bags used ']

In [61]:
# Split into training and testing sets for non-outliers
X_train_non, X_test_non, y_train_non, y_test_non = train_test_split(
    X_non_outliers, y_non_outliers, test_size=0.2, random_state=42)

# Define models to test
models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Random Forest': RandomForestRegressor()
}

# Train and evaluate each model
for name, model_non_outliers in models.items():
    # Train a model for non-outliers
    model_non_outliers.fit(X_train_non, y_train_non)

    # Predict Bags used for non-outliers
    non_outlier_predictions = model_non_outliers.predict(X_test_non)

    # Evaluate the non-outlier model
    mae_non = mean_absolute_error(y_test_non, non_outlier_predictions)
    mse_non = mean_squared_error(y_test_non, non_outlier_predictions)
    r2_non = r2_score(y_test_non, non_outlier_predictions)

    print(f"\n{name} Model Performance:")
    print(f"Mean Absolute Error: {mae_non}")
    print(f"Mean Squared Error: {mse_non}")
    print(f"R² Score: {r2_non}")


Linear Regression Model Performance:
Mean Absolute Error: 0.5163226296543371
Mean Squared Error: 0.4625106723087785
R² Score: -0.00017432299954189823

SVR Model Performance:
Mean Absolute Error: 0.45793995358841605
Mean Squared Error: 0.5198765969246317
R² Score: -0.1242275140956357

Gradient Boosting Model Performance:
Mean Absolute Error: 0.38803955181160527
Mean Squared Error: 0.2957236268308798
R² Score: 0.36050085766868356

Random Forest Model Performance:
Mean Absolute Error: 0.35142528735632195
Mean Squared Error: 0.27677437068965516
R² Score: 0.40147841898156356


In [62]:
# Features and targets for outliers
X_outliers = outliers[['pothole_area_cm2']]
y_outliers = outliers['Bags used ']

# Split into training and testing sets for outliers
X_train_out, X_test_out, y_train_out, y_test_out = train_test_split(
    X_outliers, y_outliers, test_size=0.2, random_state=42)

# Train a model for outliers
model_outliers = GradientBoostingRegressor()
model_outliers.fit(X_train_out, y_train_out)

# Predict Bags used for outliers
outlier_predictions = model_outliers.predict(X_test_out)

# Evaluate the outlier model
mae_out = mean_absolute_error(y_test_out, outlier_predictions)
mse_out = mean_squared_error(y_test_out, outlier_predictions)
r2_out = r2_score(y_test_out, outlier_predictions)

print("Outlier Model Performance:")
print(f"Mean Absolute Error: {mae_out}")
print(f"Mean Squared Error: {mse_out}")
print(f"R² Score: {r2_out}")

Outlier Model Performance:
Mean Absolute Error: 2.7669753434045066
Mean Squared Error: 13.565702477418855
R² Score: 0.08115655844387826


In [63]:
# Load the test data
test_data = pd.read_csv('their_test.csv')

# Calculate 'pothole_area_cm2'
test_data['pothole_area_cm2'] = test_data['pothole_area_mm2'] / 1000

# Predict outliers
test_data['predicted_outlier'] = outlier_classifier.predict(test_data[['pothole_area_mm2']])

print(test_data)
# Predict Bags used based on whether they are outliers or not
def predict_bags(row):
    if row['predicted_outlier'] == 1:
        return model_outliers.predict([[row['pothole_area_cm2']]])[0]
    else:
        return model_non_outliers.predict([[row['pothole_area_cm2']]])[0]

test_data['Bags used'] = test_data.apply(predict_bags, axis=1)
test_data['Pothole number'] = test_data['pothole_id']

# Save predictions
test_data[['Pothole number', 'Bags used']].to_csv('their_test_predictions.csv', index=False)

print("Predictions saved to 'their_test_predictions.csv'")

    pothole_id  pothole_area_mm2  pothole_area_cm2  predicted_outlier
0          103      34177.806619         34.177807              False
1          104      32175.824176         32.175824              False
2         1040      10178.471248         10.178471              False
3          105      43517.744288         43.517744              False
4          108      88526.946108         88.526946              False
5         1086       9549.806576          9.549807              False
6         1115       6041.379310          6.041379              False
7         1134       8403.361345          8.403361              False
8          114      42108.291708         42.108292              False
9         1161       3136.645963          3.136646              False
10        1162        794.573946          0.794574              False
11        1181      17981.257143         17.981257              False
12        1198      17786.548530         17.786549              False
13        1205      

