In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [2]:
# Load cleaned data.
# The path is a raw string so the Windows backslashes stay literal: '\w', '\d'
# and '\p' are invalid escape sequences, which are deprecated in normal string
# literals (SyntaxWarning on Python 3.12+). The resulting path is unchanged.
df = pd.read_csv(r'D:\waste_management\data\processed\waste_data_cleaned.csv')

Log-transform skewed numeric features (Waste Generated, Cost of Waste Management)

In [3]:

# Add a log1p-transformed copy of each listed column under "<column>_log".
# np.log1p(x) computes log(1 + x), so a value of 0 maps to 0 rather than -inf.
skewed_columns = ('Waste Generated (Tons/Day)', 'Cost of Waste Management (₹/Ton)')
for source_name in skewed_columns:
    log_name = f"{source_name}_log"
    df[log_name] = np.log1p(df[source_name])
    print(f"Transformed {source_name} with log1p into {log_name}")

Transformed Waste Generated (Tons/Day) with log1p into Waste Generated (Tons/Day)_log
Transformed Cost of Waste Management (₹/Ton) with log1p into Cost of Waste Management (₹/Ton)_log


Interaction Features

In [4]:
# Composite categorical key of the form "<City/District>_<Waste Type>".
# Both parts are cast to str first, so missing values become the text 'nan'.
city_labels = df['City/District'].astype(str)
waste_labels = df['Waste Type'].astype(str)
df['City_WasteType'] = city_labels + '_' + waste_labels

# Product of the municipal efficiency score and the awareness campaign count.
efficiency_score = df['Municipal Efficiency Score (1-10)']
campaign_count = df['Awareness Campaigns Count']
df['Eff_Campaign_Interaction'] = efficiency_score * campaign_count

Geospatial Clustering of Landfills to capture regional effects

In [5]:
# Cluster landfill coordinates into regions with KMeans and store the label
# as a categorical column, 'Landfill_Region_Cluster'.
geo_cols = ['Landfill_Lat', 'Landfill_Long']

if all(col in df.columns for col in geo_cols):
    # Fit only on rows where both coordinates are present; rows dropped here
    # are never assigned a label and stay NaN in the new column.
    geo_data = df[geo_cols].dropna()

    # Choose number of clusters (for example k=4).
    # n_init is passed explicitly: scikit-learn changed its default from 10 to
    # 'auto', so leaving it implicit makes the labels depend on the installed
    # version (and emits a FutureWarning on 1.2/1.3).
    kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
    df.loc[geo_data.index, 'Landfill_Region_Cluster'] = kmeans.fit_predict(geo_data)

    # Convert to category type
    df['Landfill_Region_Cluster'] = df['Landfill_Region_Cluster'].astype('category')
    print("Created Landfill_Region_Cluster categorical feature using KMeans clustering.")
else:
    # Report the skip instead of staying silent: code that later expects
    # 'Landfill_Region_Cluster' would otherwise fail without explanation.
    missing_geo_cols = [col for col in geo_cols if col not in df.columns]
    print(f"Skipped landfill clustering; missing columns: {missing_geo_cols}")



Created Landfill_Region_Cluster categorical feature using KMeans clustering.


Target encoding for the high-cardinality categorical feature City_WasteType

In [6]:
# Overall mean of the target column across all rows of df.
target_col = 'Recycling Rate (%)'
global_mean = df[target_col].mean()

# Per-City_WasteType mean and non-null count of the target.
# NOTE(review): these statistics use every row of df; if some of these rows are
# later held out for evaluation, the encoding would leak the target — confirm.
target_by_city_waste = df.groupby('City_WasteType')[target_col]
city_waste_target_mean = target_by_city_waste.mean()
city_waste_target_count = target_by_city_waste.count()

In [7]:
# Smoothing function to reduce noise for rare categories
def smooth_target_encoding(count, mean, global_mean=global_mean, alpha=10):
    """Blend a category mean with the global mean, weighted by category size.

    Computes (count * mean + alpha * global_mean) / (count + alpha).

    Args:
        count: Number of observations in the category.
        mean: Mean of the target within the category.
        global_mean: Value the result is pulled towards. The default is the
            module-level ``global_mean`` as it was when this function was
            defined; re-define the function after recomputing that value.
        alpha: Smoothing weight; larger values pull the result further
            towards ``global_mean``.

    Returns:
        The smoothed mean.
    """
    weighted_total = count * mean + alpha * global_mean
    total_weight = count + alpha
    return weighted_total / total_weight

# One row per City_WasteType value: raw target mean and observation count.
city_waste_encoded = pd.DataFrame({
    'mean_target': city_waste_target_mean,
    'count': city_waste_target_count
})

# smooth_target_encoding is plain arithmetic, so it can be applied to whole
# columns at once; this yields the same values as a row-by-row apply without
# a Python-level call per row.
city_waste_encoded['smooth_mean'] = smooth_target_encoding(
    city_waste_encoded['count'], city_waste_encoded['mean_target'])

In [8]:
# Look up each row's City_WasteType in the smoothed-mean table and store the
# result as the target-encoded feature.
smooth_mean_lookup = city_waste_encoded['smooth_mean']
df['City_WasteType_TE'] = df['City_WasteType'].map(smooth_mean_lookup)

Outlier Flags for key numeric features

In [9]:
# For each listed column, add a 0/1 "<column>_OutlierFlag" marking values that
# lie more than 1.5 * IQR below the first quartile or above the third quartile.
outlier_columns = [
    'Waste Generated (Tons/Day)',
    'Cost of Waste Management (₹/Ton)',
    'Population Density (People/km²)',
]
for feature in outlier_columns:
    values = df[feature]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    # Both comparisons are False for missing values, so those are flagged 0.
    below_fence = values < lower_fence
    above_fence = values > upper_fence
    flag_name = feature + '_OutlierFlag'
    df[flag_name] = (below_fence | above_fence).astype(int)
    print(f"Created outlier flag {flag_name} for {feature}")

Created outlier flag Waste Generated (Tons/Day)_OutlierFlag for Waste Generated (Tons/Day)
Created outlier flag Cost of Waste Management (₹/Ton)_OutlierFlag for Cost of Waste Management (₹/Ton)
Created outlier flag Population Density (People/km²)_OutlierFlag for Population Density (People/km²)


Encoding of new categorical variables

In [10]:
# One-hot encode Landfill_Region_Cluster for modeling (label encoding is an
# alternative). drop_first=True omits the indicator for the first category.
# The source column is replaced by the indicators, so this cell cannot be
# re-run on its own output.
cluster_feature = ['Landfill_Region_Cluster']
df = pd.get_dummies(df, columns=cluster_feature, drop_first=True)

Save the updated data for next modeling stage

In [11]:

# Write the feature-engineered frame to CSV without the index column.
# The path is a raw string so the Windows backslashes stay literal: '\w', '\d'
# and '\p' are invalid escape sequences, which are deprecated in normal string
# literals (SyntaxWarning on Python 3.12+). The resulting path is unchanged.
df.to_csv(r'D:\waste_management\data\processed\waste_data_feature_engineered.csv', index=False)
print("Feature engineered dataset saved as 'waste_data_feature_engineered.csv'")

Feature engineered dataset saved as 'waste_data_feature_engineered.csv'
