### Data Processing (Cleanup + Feature Engineering)

In [0]:
import pandas as pd

# Read the housing dataset that the prior notebook persisted as a parquet file.
df = pd.read_parquet(path='/dbfs/tmp/housing_data.parquet')

In [0]:
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Clean the raw frame (mean-impute numeric columns, one-hot encode object columns),
# rebuild it as a DataFrame, and derive per-household / per-room ratio features.

# define processing for columns based on type
numerical_features = df.select_dtypes(include=['int64','float64']).columns
numerical_transformer = SimpleImputer(strategy='mean')  # missing numeric values become the column mean
categorical_features = df.select_dtypes(include=['object']).columns
categorical_transformer = OneHotEncoder(handle_unknown='ignore')  # unseen categories encode as all zeros

# NOTE(review): every int64/float64 column is sent through the imputer, which presumably includes
# the target ('median_house_value'), and the imputer is fit on all rows before the train/val/test
# split later in this notebook. Confirm this is intended; fitting on the training split only
# would avoid leaking val/test statistics.

# combine processing steps
# sparse_threshold=0 forces a dense ndarray. OneHotEncoder emits a sparse matrix by default, and
# with ColumnTransformer's default threshold (0.3) the combined result is returned as a SciPy
# sparse matrix whenever its density falls below that value, which the DataFrame construction
# below cannot consume.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0
)

# apply processing steps
processed_values = preprocessor.fit_transform(df)

# convert to DataFrame; the output columns follow the transformer order above
# (numeric columns first, then the one-hot columns)
encoded_feature_names = (
    preprocessor.named_transformers_['cat']
    .get_feature_names_out(categorical_features)
    .tolist()
)
df_processed = pd.DataFrame(
    processed_values,
    columns=numerical_features.tolist() + encoded_feature_names
)

# create new features
df_processed['rooms_per_household'] = df_processed['total_rooms'] / df_processed['households']
df_processed['bedrooms_per_room'] = df_processed['total_bedrooms'] / df_processed['total_rooms']
df_processed['population_per_household'] = df_processed['population'] / df_processed['households']

# clean up column name for subsequent processing steps (drops the '<' character)
df_processed = df_processed.rename(columns={'ocean_proximity_<1H OCEAN':'ocean_proximity_1H OCEAN'})

### Split Data

In [0]:
from sklearn.model_selection import train_test_split

# Separate features from the target, then carve out train/val/test:
# 20% is held out first and that hold-out is halved into validation and test.
X = df_processed.drop('median_house_value', axis=1)
y = df_processed.loc[:, 'median_house_value']
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    random_state=42,
    test_size=0.5,
)

In [0]:
# Persist every split as parquet so that other tasks can load the same objects.
# The targets are Series, so each is wrapped with to_frame() first: to_parquet()
# is only available on DataFrames.
split_outputs = {
    '/dbfs/tmp/X_train.parquet': X_train,
    '/dbfs/tmp/X_val.parquet': X_val,
    '/dbfs/tmp/X_test.parquet': X_test,
    '/dbfs/tmp/y_train.parquet': y_train.to_frame(),
    '/dbfs/tmp/y_val.parquet': y_val.to_frame(),
    '/dbfs/tmp/y_test.parquet': y_test.to_frame(),
}
for out_path, split_frame in split_outputs.items():
    split_frame.to_parquet(out_path)