# Hyderabad House Price Pre-procesing

In [None]:
import pandas as pd

PRICE = "Price"
LOCATION = "Location"
REFRIGERATOR = "Refrigerator"
SECURITY = "24X7Security"
AREA = "Area"
BED_ROOMS = "No. of Bedrooms"

house_prices_ds = pd.read_csv('../../../datasets/raw/hyderabad_house_price_with_nullables.csv')
labels_ds = house_prices_ds[PRICE]
features_ds = house_prices_ds.drop(columns=[PRICE])
numeric_features_df = features_ds.select_dtypes(include=["int64", "float64"])
categorical_features_df = features_ds[LOCATION]
processed_ds = None

def convertArrayToDataset(dataset):
    return pd.DataFrame(
    dataset,
    columns=dataset.columns,
    index=dataset.index)

def exportDataset(dataset, step):
    dataset.to_csv(f"../../../datasets/processed/hyderabad_house_price_{step}.csv")

## 1.0 Missing Values

Replacing missing numerical values with the most frequent values (mode).

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class WindowImputer(BaseEstimator, TransformerMixin):
    """
    Imputa Column usando vecinos con Label column en [Label - window, Label + window].
    Si no hay vecinos, expande la ventana (x2) hasta max_expansions veces.
    Fallback: mediana global de AREA.
    """
    def __init__(self, column="Area", label_col="Price", window=1000.0,
                 agg="median", expand=True, max_expansions=2):
        self.column = column
        self.label_col = label_col
        self.window = float(window)
        self.agg = agg
        self.expand = expand
        self.max_expansions = int(max_expansions)

    def fit(self, X, y=None):
        df = X.copy()
        df[self.label_col] = pd.to_numeric(df[self.label_col], errors="coerce")
        df[self.column]  = pd.to_numeric(df[self.column],  errors="coerce")
        self._train_ = df[[self.label_col, self.column]].copy()

        if self.agg == "mean":
            self._aggfunc_ = np.nanmean
        else:
            self._aggfunc_ = np.nanmedian

        self.global_fallback_ = float(self._aggfunc_(self._train_[self.column].values))
        return self

    def transform(self, X):
        df = X.copy()
        df[self.label_col] = pd.to_numeric(df[self.label_col], errors="coerce")
        df[self.column]  = pd.to_numeric(df[self.column],  errors="coerce")

        na_idx = df[self.column].isna()
        if not na_idx.any():
            return df

        for idx, p in df.loc[na_idx, self.label_col].items():
            # Si no hay Price para esta fila, fallback global.
            if pd.isna(p):
                df.at[idx, self.column] = self.global_fallback_
                continue

            w = self.window
            imputed = None
            for _ in range(self.max_expansions + 1):
                low, high = p - w, p + w
                mask = (
                    self._train_[self.label_col].between(low, high)
                    & self._train_[self.column].notna()
                )
                candidates = self._train_.loc[mask, self.column]
                if len(candidates):
                    imputed = float(self._aggfunc_(candidates.values))
                    break
                if not self.expand:
                    break
                w *= 2  # expandimos la ventana

            if imputed is None:
                imputed = self.global_fallback_
            df.at[idx, self.column] = imputed

        return df
    


In [None]:
from sklearn.impute import SimpleImputer

df = house_prices_ds.copy()

# Impute the REFRIGERATOR and SECURITY with the most frequent value
mf_imputer = SimpleImputer(strategy="most_frequent")
mf_cols = [REFRIGERATOR, SECURITY]
df[mf_cols] = mf_imputer.fit_transform(df[mf_cols])

# Impute Area with the Custom Window Imputer
area_imputer = WindowImputer(column=AREA, label_col=PRICE, window=1000, agg="median")
df = area_imputer.fit_transform(df)

# Impute Bed rooms with the Custom Window Imputer
bed_imputer = WindowImputer(column=BED_ROOMS, label_col=PRICE, window=1000, agg="median")
imputed_features_df = bed_imputer.fit_transform(df)

# Write processed dataset
processed_ds = convertArrayToDataset(imputed_features_df)
exportDataset(processed_ds, "1_nullables")

print("Total Missing values:", processed_ds.isna().sum().sum())
processed_ds.isnull().sum()


## 2.0 Transform Categorical Data

Convert categorical features to Dummy variables

In [None]:
## Transform Categorical Data
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

df = house_prices_ds.copy()

# Create encoder
encoder = OneHotEncoder(sparse_output=True, drop=None, handle_unknown="ignore")

# Set and transform Location
location_encoded = encoder.fit_transform(df[["Location"]])


# Alternative: use dummy variables when there are few categories 
# house_prices_ds_encoded = pd.get_dummies(processed_ds, columns=["Location"])

categorical_features_df = pd.DataFrame(
    location_encoded.toarray(),
    columns=encoder.get_feature_names_out([LOCATION]),
    index=house_prices_ds.index
)


processed_ds = pd.concat(
    [processed_ds.drop(columns=[LOCATION]), categorical_features_df],
    axis=1
)

exportDataset(processed_ds, "2_categorical")

processed_ds.head()



## 3.0 Fix Outiers values


In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class BinaryRangeImputer(BaseEstimator, TransformerMixin):
    """
    Reemplaza valores fuera de [0,1] en columnas binarias por el valor más frecuente de esa columna.
    """
    def __init__(self, columns=None, valid_range=(0,1)):
        self.columns = columns
        self.valid_range = valid_range
        self.fill_values_ = {}

    def fit(self, X, y=None):
        df = X.copy()
        for col in self.columns:
            # valores válidos
            mask_valid = df[col].between(self.valid_range[0], self.valid_range[1])
            mode_val = df.loc[mask_valid, col].mode(dropna=True)
            # fallback: si no hay válidos, usa 0
            self.fill_values_[col] = mode_val.iloc[0] if not mode_val.empty else 0
        return self

    def transform(self, X):
        df = X.copy()
        for col in self.columns:
            mask_invalid = ~df[col].between(self.valid_range[0], self.valid_range[1])
            df.loc[mask_invalid, col] = self.fill_values_[col]
        return df


In [None]:
binary_cols = ["MaintenanceStaff", "Gymnasium","SwimmingPool","LandscapedGardens","JoggingTrack","RainWaterHarvesting",
        "IndoorGames","ShoppingMall","Intercom","SportsFacility","ATM","ClubHouse","School",
        "24X7Security","PowerBackup","CarParking","StaffQuarter","Cafeteria","MultipurposeRoom",
        "Hospital","WashingMachine","Gasconnection","AC","Wifi","Children'splayarea",
        "LiftAvailable","BED","VaastuCompliant","Microwave","GolfCourse","TV","DiningTable",
        "Sofa","Wardrobe","Refrigerator"]


mask_invalid_before = (processed_ds[binary_cols] < 0) | (processed_ds[binary_cols] > 1)
rows_invalid_before = processed_ds[mask_invalid_before.any(axis=1)]
print("Invalid binary features before:", len(rows_invalid_before[binary_cols]))

# 1. Transform invalid binary values by most frecuent
imputer = BinaryRangeImputer(columns=binary_cols, valid_range=(0,1))
binary_impute = imputer.fit_transform(processed_ds)
exportDataset(processed_ds, "3_outiers")

mask_invalid_after = (binary_impute[binary_cols] < 0) | (binary_impute[binary_cols] > 1)
rows_invalid_after = binary_impute[mask_invalid_after.any(axis=1)]
print("Invalid binary features after:", len(rows_invalid_after[binary_cols]))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


for col in binary_cols:
    sns.boxplot(x=binary_impute[col])
    plt.title(f"Boxplot de {col}")
    plt.show()

## 4.0 Checking multicollinearity

# 4.1 Variance Inflation Factor

# VIF = 1 → No colinealidad.
# VIF entre 1 y 5 → Aceptable.
# VIF > 10 → Problema serio de multicolinealidad.

In [None]:
# Variance Inflation Factor

# VIF = 1 → No colinealidad.
# VIF entre 1 y 5 → Aceptable.
# VIF > 10 → Problema serio de multicolinealidad.

from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

df = processed_ds.copy()    

# Selecciona solo las variables numéricas relevantes (sin la target)
X = df.select_dtypes(include=["int64", "float64"]).drop(columns=[LOCATION, PRICE], errors="ignore")

# Calcular VIF para cada columna
# VIF = 1: No multicollinearity.
# VIF between 1 and 5: Moderate multicollinearity.
# VIF > 5 or 10: High multicollinearity (depending on the analyst's criteria).
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]



print(vif_data)

## 4.2 Split training, and validation, and test datasets

## 4.2 Fixing Multicollinearity

# Eliminate redundant variables (such as X2 if it is derived from X1).
# Combine correlated variables (for example, using PCA).
# Apply regularization (Ridge or Lasso).
# Review the model structure and consider transformations.

In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
from enum import Enum

class ModelType(Enum):
    PCA = "PCA"
    SVD = "SVD"
    LDA = "LDA"

# Función de regresión lineal
def lineal_regresion_model_apply(x_train, y_train, x_val, y_val):
    model = LinearRegression()
    model.fit(x_train, y_train)
    predicts = model.predict(x_val)
    rmse = np.sqrt(mean_squared_error(y_val, predicts))
    r2 = r2_score(y_val, predicts)
    return rmse, r2
    
def evalueate_principal_components_model(model_type: ModelType, x_train, y_train, x_val, y_val, n):
    model = None
    
    if model_type == ModelType.PCA:
        model = PCA(n_components=n)
        x_train_pca = model.fit_transform(x_train)
        x_val_pca = model.transform(x_val)
    elif model_type == ModelType.SVD:
        model = TruncatedSVD(n_components=n)
        x_train_pca = model.fit_transform(x_train)
        x_val_pca = model.transform(x_val)
    elif model_type == ModelType.LDA:
        model = LinearDiscriminantAnalysis(n_components=n)
        x_train_pca = model.fit_transform(x_train, y_train)
        x_val_pca = model.transform(x_val)
    else:
        raise ValueError("Modelo no soportado")
    
    rmse, r2 = lineal_regresion_model_apply(x_train_pca, y_train, x_val_pca, y_val)

    return rmse, r2, model


# Selección del mejor modelo PCA usando score balanceado
def find_best_principal_components_model(model_type: ModelType, x_train, y_train, x_val, y_val, rmse_baseline, alpha=1.0, beta=1.0):
    best_score = float("inf")
    best_n = None
    best_pca_model = None
    best_rmse = None
    best_r2 = None
    max_components = x_train.shape[1]

    print("\nFinding the principal components for the model ", model_type.value)

    for n in range(1, max_components + 1):
        rmse, r2, pca_model = evalueate_principal_components_model(model_type, x_train, y_train, x_val, y_val, n)
        r2_distance = abs(1 - r2)
        score = alpha * r2_distance + beta * (rmse / rmse_baseline)
        print(f"{model_type.value} {n} - R²: {r2:.4f} - RMSE: {rmse:.4f} - Score: {score:.4f}")
        
        # - The ideal R² is 1 ⇒ r2_distance = abs(1 - r2) measures how far you are from 1 (the smaller, the better).
        # - The ideal RMSE is 0 ⇒ rmse / rmse_baseline measures the relative error with respect to a baseline (the smaller, the better).
        if score < best_score:
            best_score = score
            best_n = n
            best_pca_model = pca_model
            best_rmse = rmse
            best_r2 = r2

    print(f"Best {model_type.value} Model: n: {best_n} R²: {best_r2:.4f} RMSE: {best_rmse:.4f} Score: {best_score:.4f}")
    return best_rmse, best_r2, best_pca_model

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = processed_ds.copy()   
y = df[PRICE]
X = df.drop(columns=[PRICE]).select_dtypes(include=["int64", "float64"]).drop(columns=[LOCATION, PRICE], errors="ignore")


# 2. Dividir el dataset en train (60%), validation (20%) y test (20%)
# Primero, separamos train (60%) y temp (40%)
x_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Ahora separamos temp en val (20%) y test (20%)
x_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# StandardScaler standardizes features by removing the mean and scaling to unit variance. The formula for each feature is:
# X_scaled = (X - μ) / σ
# μ = mean of the feature (calculated from the training set)
# σ = standard deviation of the feature (calculated from the training set)
# This transformation ensures that each feature has:
# Mean = 0
# Standard deviation = 1
# It is important for algorithms like PCA because they are sensitive to the scale of the variables.
# StandardScaler standardizes the features by removing the mean and scaling them to unit variance. Specifically, for each feature (column):
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)  # fit SOLO con train
x_val_std   = scaler.transform(x_val)
x_test_std  = scaler.transform(X_test)

# Evaluación del modelo sin PCA
pre_rmse, pre_r2_distance = lineal_regresion_model_apply(x_train_std, y_train, x_val_std, y_val)
print(f"Pre Regresion Model:  R²: {pre_r2_distance} RMSE: {pre_rmse:.4f}")

find_best_pca_model(x_train_std, y_train, x_val, y_val, pre_rmse, 0.7, 1)

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_diabetes

# Cargar el dataset
diabetes = load_diabetes()
df = processed_ds.copy()
y = df[PRICE]
X = df.drop(columns=[PRICE])


# Dividir el dataset en train (60%), validation (20%) y test (20%)
x_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Escalar los datos
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)
x_val_std = scaler.transform(x_val)
x_test_std = scaler.transform(x_test)

# Evaluar modelo sin PCA
pre_rmse, pre_r2 = lineal_regresion_model_apply(x_train_std, y_train, x_val_std, y_val)
print(f"Regresion without Reduction - R²: {pre_r2:.4f} - RMSE: {pre_rmse:.4f}\n")

# Buscar mejor modelo PCA
best_pca_rmse, best_pca_r2, best_pca_model = find_best_principal_components_model(ModelType.PCA, x_train_std, y_train, x_val_std, y_val, rmse_baseline=pre_rmse)
best_svd_rmse, best_svd_r2, best_svd_model = find_best_principal_components_model(ModelType.SVD, x_train_std, y_train, x_val_std, y_val, rmse_baseline=pre_rmse)
best_lda_rmse, best_lda_r2, best_lda_model = find_best_principal_components_model(ModelType.LDA, x_train_std, y_train, x_val_std, y_val, rmse_baseline=pre_rmse)


Regresion without Reduction - R²: 0.8292 - RMSE: 3240671.3906


Finding the principal components for the model  PCA
PCA 1 - R²: -0.0067 - RMSE: 7867534.1017 - Score: 3.4344
PCA 2 - R²: 0.6591 - RMSE: 4578148.7678 - Score: 1.7536
PCA 3 - R²: 0.6548 - RMSE: 4607071.9221 - Score: 1.7668
PCA 4 - R²: 0.6502 - RMSE: 4637735.1586 - Score: 1.7809
PCA 5 - R²: 0.6540 - RMSE: 4612193.9501 - Score: 1.7692
PCA 6 - R²: 0.6664 - RMSE: 4529302.2138 - Score: 1.7313
PCA 7 - R²: 0.6565 - RMSE: 4595709.2089 - Score: 1.7616
PCA 8 - R²: 0.6563 - RMSE: 4597443.0946 - Score: 1.7624
PCA 9 - R²: 0.6688 - RMSE: 4512723.5048 - Score: 1.7237
PCA 10 - R²: 0.6712 - RMSE: 4496622.5753 - Score: 1.7164
PCA 11 - R²: 0.6705 - RMSE: 4501398.2036 - Score: 1.7186
PCA 12 - R²: 0.6780 - RMSE: 4449740.9950 - Score: 1.6951
PCA 13 - R²: 0.6803 - RMSE: 4433502.9986 - Score: 1.6878
PCA 14 - R²: 0.6708 - RMSE: 4499342.7249 - Score: 1.7176
PCA 15 - R²: 0.6778 - RMSE: 4451285.9163 - Score: 1.6958
PCA 16 - R²: 0.7075 - RMSE: 4240962.6

## Export Processed Dataset

Export the processed dataset

In [None]:
processed_ds.to_csv("../../../datasets/raw/hyderabad_house_price_processed.csv", index=False)