In [91]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [92]:
# Load the input CSV into `df`. The relative path uses forward slashes, which
# Python accepts on Windows as well as on Linux/macOS; the previous
# backslash-separated path only resolved on Windows.
df = pd.read_csv('../data/raw/realtor-data-clean.csv')

In [93]:
# Ratio of `bed` to `bath`. 1e-6 is added to the denominator so that a bath
# count of 0 does not produce a division by zero.
df['bed_bath_ratio'] = df['bed'].div(df['bath'].add(1e-6))

In [94]:
# `price` divided by `house_size`.
# NOTE(review): there is no zero guard on the denominator here; a house_size of
# 0 would yield inf -- confirm the input data never contains it.
df['price_per_sqft'] = df['price'].div(df['house_size'])

In [95]:
# Sum of the `bed` and `bath` columns.
df['total_rooms'] = df['bed'].add(df['bath'])

In [96]:
# Ordinal encoding of `price_category`: each label is replaced by its position
# in the ordered tuple below (Low=0 ... Luxury=3). Labels that are not in the
# mapping become NaN (pandas `Series.map` semantics).
price_map = {
    label: rank
    for rank, label in enumerate(('Low', 'Medium', 'High', 'Luxury'))
}
df['price_category'] = df['price_category'].map(price_map)

In [97]:
# Binary encoding of `property_type`: House -> 0, Mansion -> 1. Labels that are
# not in the mapping become NaN (pandas `Series.map` semantics).
property_map = dict(House=0, Mansion=1)
df['property_type'] = df['property_type'].map(property_map)

In [98]:
# Remove the `brokered_by`, `street` and `city` columns from `df` in place.
df.drop(['brokered_by', 'street', 'city'], axis='columns', inplace=True)

In [99]:
# The open-ended '10+' bucket is rewritten as the string '11' so that both
# group columns can then be cast to int.
for group_col in ('bed_group', 'bath_group'):
    df[group_col] = df[group_col].replace(to_replace='10+', value='11').astype(int)

In [100]:
# Binary flags: 1 where the group column equals 11, 0 otherwise. A vectorized
# comparison replaces the per-row Python lambda; the result is cast to int64
# so the columns keep the integer dtype that `apply` produced.
df['bed_group_10o+'] = df['bed_group'].eq(11).astype('int64')
df['bath_group_10o+'] = df['bath_group'].eq(11).astype('int64')

In [101]:
# Remove the `bed` and `bath` columns from `df` in place.
df.drop(['bed', 'bath'], axis='columns', inplace=True)

In [102]:
# Full state / territory name -> two-letter abbreviation.
state_map = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "Puerto Rico": "PR",
    "Virgin Islands": "VI",
    "Guam": "GU",
}

# 1. Map state names to abbreviations; names missing from `state_map` are
#    labelled "Other".
abbreviations = df["state"].map(state_map)
df["state_abbr"] = abbreviations.fillna("Other")

# 2. Build 'region_group' from the abbreviations
noreste = [
    "NY", "MA", "CT", "PA", "NJ", "RI", "NH", "VT", "ME",
]
medio_oeste = [
    "IL", "OH", "MI", "WI", "MN", "IA", "MO", "IN", "KS", "NE", "ND", "SD",
]
sur = [
    "TX", "FL", "GA", "NC", "SC", "AL", "TN", "KY", "MS",
    "LA", "VA", "AR", "WV", "OK", "DC", "DE", "MD",
]
oeste = [
    "CA", "WA", "OR", "NV", "CO", "AZ", "UT", "NM", "HI", "AK", "ID", "MT", "WY",
]


def assign_region(state):
    """Return the region label for a state abbreviation.

    The abbreviation is looked up in the `noreste`, `medio_oeste`, `sur` and
    `oeste` lists, in that order, giving "Northeast", "Midwest", "South" or
    "West" respectively. Any value found in none of the lists yields "Other".
    """
    region_members = (
        ("Northeast", noreste),
        ("Midwest", medio_oeste),
        ("South", sur),
        ("West", oeste),
    )
    for label, members in region_members:
        if state in members:
            return label
    return "Other"

# Label every row with the region returned by assign_region for its
# state abbreviation.
df["region_group"] = df["state_abbr"].map(assign_region)

In [103]:
# One-hot encode `status` and `region_group` as 0/1 integer columns.
# drop_first=True omits the first category of each variable; the original
# categorical columns are then replaced by the dummy columns.
encoder = ['status', 'region_group']
dummies = pd.get_dummies(df.loc[:, encoder], drop_first=True, dtype=int)

df = df.drop(encoder, axis='columns').join(dummies, how='left')

In [104]:
# Remove the `state` and `state_abbr` columns from `df` in place.
df.drop(['state', 'state_abbr'], axis='columns', inplace=True)

In [105]:
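# NOTE: discarded alternatives, kept for reference only (not executed): one-hot
# encoding `state` and `status` directly, or label-encoding `state`.
# df = pd.get_dummies(df, columns=['state', 'status'], prefix=['state', 'status'],dtype=int,drop_first=True)
# le = LabelEncoder()
# df['state'] = le.fit_transform(df['state'])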

In [107]:
# Write the resulting table to Parquet. Forward slashes keep the relative path
# valid on Windows, Linux and macOS; the previous backslash-separated path
# only resolved on Windows.
df.to_parquet('../data/processed/realtor-ml.parquet')

### Eliminaciones
Se eliminaron las columnas: brokered_by, street, city, bed, bath y state (esta última, junto con la columna auxiliar state_abbr, después de derivar region_group).

### Variables nuevas creadas
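**Columna: descripción**

- bed_bath_ratio: relación entre el número de dormitorios (bed) y de baños (bath); se suma 1e-6 al denominador para evitar la división por cero.
- price_per_sqft: precio (price) dividido entre el tamaño de la casa (house_size).
- total_rooms: suma total de bed + bath.
- price_category (columna existente): convertida a ordinal con el mapeo Low=0, Medium=1, High=2, Luxury=3.
- property_type (columna existente): convertida a binario: House=0, Mansion=1.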

### Agrupación y tratamiento de extremos
- bed_group y bath_group: se reemplazó '10+' por '11' y se convirtieron a int.
- Se crearon banderas binarias:
  - bed_group_10o+: 1 si bed_group == 11, 0 en caso contrario
  - bath_group_10o+: 1 si bath_group == 11, 0 en caso contrario

### Codificación categórica
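Se aplicó One-Hot Encoding con pd.get_dummies() en: status y region_group (región derivada de state: Northeast, Midwest, South, West u Other), con drop_first=True para descartar la primera categoría de cada variable y dtype=int para asegurar que las columnas resultantes sean 0 y 1.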