Dataset preparation

In [2]:
import pandas as pd
import numpy as np

# Parameters for synthetic data generation
num_houses = 100
days_to_record = 365  # Number of days to record waste collection
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
neighborhood_types = ['Residential', 'Commercial']
weather_conditions = ['Sunny', 'Rainy', 'Cloudy']

# Generate a range of dates
date_range = pd.date_range(start='2023-01-01', periods=days_to_record)

# Generate synthetic data
np.random.seed(42)
data = []

# Assign each house to a neighborhood type
house_neighborhood = {
    house_id: np.random.choice(neighborhood_types, p=[0.8, 0.2])  # 80% residential, 20% commercial
    for house_id in range(1, num_houses + 1)
}

for current_date in date_range:
    day_name = current_date.strftime('%A')
    is_holiday = 1 if day_name in ['Saturday', 'Sunday'] else (1 if np.random.random() < 0.1 else 0)
    weather = np.random.choice(weather_conditions, p=[0.6, 0.2, 0.2])  # 60% Sunny, 20% Rainy, 20% Cloudy
    for house_id in range(1, num_houses + 1):
        neighborhood_type = house_neighborhood[house_id]
        base_waste = 8 if neighborhood_type == 'Residential' else 15  # Avg waste: 8kg for residential, 15kg for commercial
        weather_adjustment = -2 if weather == 'Rainy' else (1 if weather == 'Cloudy' else 0)
        waste_weight = np.random.normal(loc=base_waste + weather_adjustment, scale=2)  # Add variability
        waste_weight = max(waste_weight, 0)  # Ensure no negative values
        data.append({
            'house_id': house_id,
            'date': current_date.strftime('%Y-%m-%d'),
            'day': day_name,
            'isholiday': is_holiday,
            'weather': weather,
            'neighborhood_type': neighborhood_type,
            'waste_weight': waste_weight
        })

# Convert to DataFrame
df = pd.DataFrame(data)
    
# Encoding categorical variables
df['day_encoded'] = df['day'].astype('category').cat.codes
df['neighborhood_encoded'] = df['neighborhood_type'].astype('category').cat.codes
df['weather_encoded'] = df['weather'].astype('category').cat.codes

# Add previous day's waste (lag feature)
df['previous_day_waste'] = df.groupby('house_id')['waste_weight'].shift(1).fillna(df['waste_weight'].mean())



print(df)
# Save to CSV
df.to_csv("enhanced_dataset.csv", index=False)

# Print the first few rows
# print(df.head(10))


       house_id        date     day  isholiday weather neighborhood_type  \
0             1  2023-01-01  Sunday          1   Sunny       Residential   
1             2  2023-01-01  Sunday          1   Sunny        Commercial   
2             3  2023-01-01  Sunday          1   Sunny       Residential   
3             4  2023-01-01  Sunday          1   Sunny       Residential   
4             5  2023-01-01  Sunday          1   Sunny       Residential   
...         ...         ...     ...        ...     ...               ...   
36495        96  2023-12-31  Sunday          1   Sunny       Residential   
36496        97  2023-12-31  Sunday          1   Sunny       Residential   
36497        98  2023-12-31  Sunday          1   Sunny       Residential   
36498        99  2023-12-31  Sunday          1   Sunny       Residential   
36499       100  2023-12-31  Sunday          1   Sunny       Residential   

       waste_weight  day_encoded  neighborhood_encoded  weather_encoded  \
0          5

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Features and target variable
X = df[['house_id', 'day_encoded', 'isholiday', 'neighborhood_encoded', 'weather_encoded', 'previous_day_waste']]
y = df['waste_weight']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.2f}")

# Function to predict waste for all houses given inputs
def predict_waste_for_today(house_ids, day_encoded, is_holiday, neighborhood_encoded, weather_encoded, previous_day_waste):
    # Create a DataFrame for today's inputs
    today_data = pd.DataFrame({
        'house_id': house_ids,
        'day_encoded': [day_encoded] * len(house_ids),
        'isholiday': [is_holiday] * len(house_ids),
        'neighborhood_encoded': neighborhood_encoded,
        'weather_encoded': [weather_encoded] * len(house_ids),
        'previous_day_waste': previous_day_waste
    })
    # Predict waste for each house
    predictions = model.predict(today_data)
    today_data['predicted_waste_weight'] = predictions
    return today_data

# Example usage
today_day = 'Monday'
today_day_encoded = days_of_week.index(today_day)
is_today_holiday = 0  # Example: not a holiday
today_weather = 'Sunny'
today_weather_encoded = weather_conditions.index(today_weather)

# Neighborhoods and previous day waste
neighborhood_encoded = [house_neighborhood[house_id] == 'Residential' for house_id in range(1, num_houses + 1)]
previous_day_waste = df.groupby('house_id')['waste_weight'].last().values  # Use the last day's waste for each house

house_ids = np.arange(1, num_houses + 1)  # All house IDs
predicted_waste = predict_waste_for_today(
    house_ids, 
    today_day_encoded, 
    is_today_holiday, 
    neighborhood_encoded, 
    today_weather_encoded, 
    previous_day_waste
)

print(predicted_waste.head())


Mean Absolute Error: 1.71
R2 Score: 0.70
   house_id  day_encoded  isholiday  neighborhood_encoded  weather_encoded  \
0         1            0          0                  True                0   
1         2            0          0                 False                0   
2         3            0          0                  True                0   
3         4            0          0                  True                0   
4         5            0          0                  True                0   

   previous_day_waste  predicted_waste_weight  
0            5.162499                8.750476  
1           17.084983               16.698657  
2            9.807065                8.750476  
3            8.038001                8.184376  
4            6.931167                7.081026  


In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Features and target variable
X = df[['house_id', 'day_encoded', 'isholiday', 'neighborhood_encoded', 'weather_encoded', 'previous_day_waste']]
y = df['waste_weight']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'SVR': SVR(kernel='rbf')
}

# Dictionary to store results
results = {}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MAE': mae, 'R2 Score': r2}
    print(f"{name}:")
    print(f"  Mean Absolute Error: {mae:.2f}")
    print(f"  R2 Score: {r2:.2f}")
    print()

# Compare results
results_df = pd.DataFrame(results).T
print("Model Comparison:\n")
print(results_df)

# Predict waste using the best-performing model (e.g., Random Forest)
best_model = models['Random Forest']  # Replace with the best model based on R² Score or MAE
def predict_waste_for_today(house_ids, day_encoded, is_holiday, neighborhood_encoded, weather_encoded, previous_day_waste):
    today_data = pd.DataFrame({
        'house_id': house_ids,
        'day_encoded': [day_encoded] * len(house_ids),
        'isholiday': [is_holiday] * len(house_ids),
        'neighborhood_encoded': neighborhood_encoded,
        'weather_encoded': [weather_encoded] * len(house_ids),
        'previous_day_waste': previous_day_waste
    })
    predictions = best_model.predict(today_data)
    today_data['predicted_waste_weight'] = predictions
    return today_data

# Example usage
today_day = 'Monday'
today_day_encoded = days_of_week.index(today_day)
is_today_holiday = 0  # Example: not a holiday
today_weather = 'Sunny'
today_weather_encoded = weather_conditions.index(today_weather)

neighborhood_encoded = [house_neighborhood[house_id] == 'Residential' for house_id in range(1, num_houses + 1)]
previous_day_waste = df.groupby('house_id')['waste_weight'].last().values

house_ids = np.arange(1, num_houses + 1)  # All house IDs
predicted_waste = predict_waste_for_today(
    house_ids, 
    today_day_encoded, 
    is_today_holiday, 
    neighborhood_encoded, 
    today_weather_encoded, 
    previous_day_waste
)

# print(predicted_waste.head())


Random Forest:
  Mean Absolute Error: 1.72
  R2 Score: 0.62

Linear Regression:
  Mean Absolute Error: 1.81
  R2 Score: 0.58

Decision Tree:
  Mean Absolute Error: 2.34
  R2 Score: 0.30

Gradient Boosting:
  Mean Absolute Error: 1.61
  R2 Score: 0.67

SVR:
  Mean Absolute Error: 2.05
  R2 Score: 0.45

Model Comparison:

                        MAE  R2 Score
Random Forest      1.720535  0.620613
Linear Regression  1.812317  0.579707
Decision Tree      2.336760  0.296155
Gradient Boosting  1.605859  0.669541
SVR                2.053695  0.452890


In [3]:
import joblib

# Train the Random Forest model


# Save the model to a file
joblib.dump(model, 'random_forest_model.joblib')

# Now you can load the model later using joblib.load() if needed
# model = joblib.load('random_forest_model.joblib')

# Evaluate the model
y_pred = model.predict(X_test)
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R2 Score: {r2_score(y_test, y_pred):.2f}")


Mean Absolute Error: 1.72
R2 Score: 0.62
