In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [26]:
# Load the raw renovation-cost dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="renovation_cost_dataset_india_v2.csv")

In [27]:
# Parse the date column, reading ambiguous dates as day-first (DD/MM/YYYY).
df["As_Of_Date"] = df["As_Of_Date"].pipe(pd.to_datetime, dayfirst=True)

In [28]:
# Remove the Material_Price_Index column from the working frame.
df = df.drop(labels=["Material_Price_Index"], axis="columns")

In [29]:
# Step 2: Encode City_Tier
# ------------------------
# Ordinal codes: a larger number means a larger city tier.
# Any label not listed here maps to NaN.
tier_map = {
    "Metro": 3,
    "Tier-2": 2,
    "Tier-3": 1,
}
df["City_Tier"] = df["City_Tier"].map(tier_map)

In [30]:
# Step 3: Boolean to Numeric
# ------------------------
# Single pipeline over the column:
#   1. normalise to upper-case, whitespace-trimmed text,
#   2. translate "TRUE"/"FALSE" to 1/0 (anything else becomes NaN),
#   3. treat whatever is still missing as 0 (no electrical),
#   4. store the flag as an integer.
df["Has_Electrical"] = (
    df["Has_Electrical"]
    .astype(str)
    .str.upper()
    .str.strip()
    .map({"TRUE": 1, "FALSE": 0})
    .fillna(0)
    .astype(int)
)

In [31]:
# Step 4: Replace None / Missing Values
# ------------------------
# Cost/total columns are converted first and on their own. Filling the whole
# frame with the string "NA" beforehand would put "NA" into any cost column
# that has a missing value, and the float conversion below would then raise
# ValueError.
cost_cols = [col for col in df.columns if "Cost" in col or "Total" in col]
df[cost_cols] = (
    df[cost_cols]
    .replace("None", 0)   # literal "None" text means "no cost"
    .astype(float)        # still raises on genuinely non-numeric text
    .fillna(0.0)          # a missing cost is treated the same as "None": 0
)

# The "NA" placeholder is only meaningful for categorical (text) columns.
# Numeric and datetime columns keep their native NaN/NaT so their dtype is
# not silently degraded to a mixed object column.
text_cols = [
    col for col in df.columns
    if col not in cost_cols
    and not pd.api.types.is_numeric_dtype(df[col])
    and not pd.api.types.is_datetime64_any_dtype(df[col])
]
df[text_cols] = df[text_cols].fillna("NA")   # categorical missing values

In [32]:
# Step 5: Encode Categorical Columns
# ------------------------
cat_cols = ["City", "Room_Type", "Renovation_Level", "Paint_Quality",
            "Floor_Type", "Floor_Quality", "Ceiling_Type", "Ceiling_Quality",
            "Furniture_Level", "Kitchen_Package", "Bathroom_Package"]

# Fit a separate encoder for every column and keep it in `encoders`.
# Refitting one shared LabelEncoder would overwrite its learned classes on
# each iteration, leaving no way to invert the codes or to encode new data
# consistently for any column except the last one.
# The integer codes written to `df` are the same either way.
encoders = {}
for col in cat_cols:
    le = LabelEncoder()   # `le` ends up bound to the last column's encoder
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le


In [34]:
# Save Outputs
# ------------------------
# Write the preprocessed frame to disk; the row index is not written.
df.to_csv(path_or_buf="renovation_preprocessed.csv", index=False)