In [None]:
# Names of columns to drop from the housing data; the same names appear in
# the drop list inside preprocess_housing_data.
cols_to_drop = [
    'sheating_energy_eff',
    'sheating_env_eff',
    'floor_env_eff',
    'flat_storey_count',
    'record_status',
    'town_or_city',
    'county',
    'report_type',
    'mechanical_ventilation',
    'wind_turbine_count',
    'ppd_category_type',
    'heat_loss_corridor',
    'number_open_fireplaces',
    'unheated_corridor_length',
]

In [None]:
import pandas as pd
import numpy as np

def preprocess_housing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and feature-engineer a housing DataFrame.

    The frame is modified in place and the same object is returned.

    Steps, in order:
      1. Impute ``floor_height`` (0 is treated as missing, flagged in
         ``floor_height_missing`` and filled with the median) and fill
         missing ``mains_gas_flag`` values with ``'Unknown'``.
      2. Derive aggregate efficiency, cost, consumption and CO2 features.
      3. Encode ``duration``, ``is_new`` and ``floor_level`` as numbers.
      4. Parse the date columns that are present, add year/month/day/weekday
         parts, and compute day gaps to ``date_of_transfer``.
      5. Copy the property structure/category columns and, when
         ``sale_price`` is present, add ``log_sale_price``.
      6. Drop a fixed list of columns (absent ones are ignored).

    Args:
        df: Housing records. Must contain the columns read in steps 1-3 and
            ``property_structure`` / ``property_category``. The date columns
            and ``sale_price`` are optional.

    Returns:
        The same ``df`` object, with the derived columns added and the
        unused columns removed.

    Raises:
        KeyError: If a required input column is missing.
    """

    # --- 1. Handle missing values ---
    # A floor height of 0 is treated as "not recorded". The results are
    # assigned back to the frame rather than using replace/fillna with
    # inplace=True on df[...]: that form is chained assignment, which acts on
    # a temporary under pandas Copy-on-Write and leaves df unchanged.
    floor_height = df['floor_height'].replace(0, np.nan)
    df['floor_height_missing'] = floor_height.isna().astype(int)
    df['floor_height'] = floor_height.fillna(floor_height.median())

    # Mains gas
    df['mains_gas_flag'] = df['mains_gas_flag'].fillna('Unknown')

    # --- 2. Feature Engineering ---
    # Household energy + environmental efficiency. Plain `+` is used so a
    # missing component propagates NaN into the total.
    df['household_energy_efficiency'] = (
        df['windows_energy_eff'] + df['walls_energy_eff'] + df['roof_energy_eff'] +
        df['mainheat_energy_eff'] + df['mainheatc_energy_eff'] + df['lighting_energy_eff']
    )

    df['household_environmental_efficiency'] = (
        df['windows_env_eff'] + df['walls_env_eff'] + df['roof_env_eff'] +
        df['mainheat_env_eff'] + df['mainheatc_env_eff'] + df['lighting_env_eff']
    )

    df['overall_household_efficiency'] = (
        df['household_energy_efficiency'] + df['household_environmental_efficiency']
    )
    df['efficiency_gap'] = (
        df['household_energy_efficiency'] - df['household_environmental_efficiency']
    )

    # Energy cost (savings = current - potential)
    df['current_energy_cost'] = (
        df['lighting_cost_current'] + df['heating_cost_current'] + df['hot_water_cost_current']
    )
    df['potential_energy_cost'] = (
        df['lighting_cost_potential'] + df['heating_cost_potential']
        + df['hot_water_cost_potential']
    )
    df['energy_cost_savings'] = df['current_energy_cost'] - df['potential_energy_cost']

    # Energy consumption savings
    # NOTE(review): computed as potential - current, the opposite sign
    # convention to energy_cost_savings above; confirm this is intended.
    df['energy_consumption_savings'] = (
        df['energy_consumption_potential'] - df['energy_consumption_current']
    )

    # CO2 reduction
    # NOTE(review): stored under the name 'co2_emissions' and also computed
    # as potential - current; confirm name and sign.
    df['co2_emissions'] = df['co2_emissions_potential'] - df['co2_emissions_current']

    # --- 3. Map categorical variables ---
    # Codes not listed in a mapping become NaN.
    duration_map = {'F': 0, 'L': 1}
    df['duration_clean'] = df['duration'].map(duration_map)

    is_new_map = {'N': 0, 'Y': 1}
    df['is_new_clean'] = df['is_new'].map(is_new_map)

    # Floor level: labels are compared case-insensitively; labels that carry
    # no storey number ('mid floor', 'top floor', the no-data markers) and
    # any label not listed map to NaN.
    floor_mapping = {
        'ground': 0, 'ground floor': 0, '0': 0, 'nodata!': np.nan, 'no data!': np.nan,
        '1st': 1, '1': 1, '2nd': 2, '2': 2, '3rd': 3, '3': 3, '4th': 4, '4': 4,
        '5th': 5, '5': 5, '6th': 6, '6': 6, '7th': 7, '7': 7, '8th': 8, '8': 8,
        '9th': 9, '9': 9, '10th': 10, '10': 10, '11': 11,
        'mid floor': np.nan, 'top floor': np.nan, 'basement': -1, '-1': -1,
    }
    df['floor_level_clean'] = df['floor_level'].astype(str).str.lower().map(floor_mapping)

    # --- 4. Dates ---
    datetime_cols = ['date_of_transfer', 'lodgement_date', 'inspection_date', 'lodgement_datetime']
    for col in datetime_cols:
        # Skip absent date columns so the presence checks below are
        # meaningful instead of being pre-empted by a KeyError here.
        if col not in df.columns:
            continue
        # Unparseable values become NaT rather than raising.
        df[col] = pd.to_datetime(df[col], errors='coerce')
        df[f'{col}_year'] = df[col].dt.year
        df[f'{col}_month'] = df[col].dt.month
        df[f'{col}_day'] = df[col].dt.day
        df[f'{col}_weekday'] = df[col].dt.weekday

    # Difference between inspection / lodgement and sale, in days
    if 'inspection_date' in df.columns and 'date_of_transfer' in df.columns:
        df['days_inspection_to_sale'] = (df['date_of_transfer'] - df['inspection_date']).dt.days

    if 'lodgement_date' in df.columns and 'date_of_transfer' in df.columns:
        df['days_lodgement_to_sale'] = (df['date_of_transfer'] - df['lodgement_date']).dt.days

    # --- 5. Pass-through columns and target ---
    # Property structure & category are copied unchanged under *_clean names.
    df['property_structure_clean'] = df['property_structure']
    df['property_category_clean'] = df['property_category']

    # Log transformation of the target variable. Only added when the target
    # is present, so frames without sale_price can be preprocessed too.
    if 'sale_price' in df.columns:
        df['log_sale_price'] = np.log1p(df['sale_price'])

    # --- 6. Drop columns we no longer need ---
    drop_cols = [
        'sheating_energy_eff', 'sheating_env_eff', 'floor_env_eff', 'flat_storey_count',
        'record_status', 'town_or_city', 'county', 'report_type', 'mechanical_ventilation',
        'wind_turbine_count', 'ppd_category_type', 'heat_loss_corridor', 'number_open_fireplaces',
        'unheated_corridor_length',
    ]
    # errors='ignore': not every listed column has to exist in df.
    df.drop(columns=drop_cols, inplace=True, errors='ignore')

    return df
