In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.impute import SimpleImputer


In [2]:
data = pd.read_csv('housing.csv')

In [3]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [5]:
data['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [None]:
# before we simply jump into the pytorch I suggest to do the following 
# 1) create OHE for ocean_proximity
# 2) create mean encoding for ocean proximity
# 3) use sklearn to 
#    a) split in train / test
#    b) normalize and standardize the data
# 4) jump into pytorch for some experiments
#    a) vanilla MLP used in Task 1
#    b) use AE to encode the features, then MLP on the features
#    c) maybe something more exotic

In [29]:
# as the dataset is rather small, create new variables to later use sklearn pipeline

target = 'median_house_value'
X = data.drop(target, axis=1)
y = data[target]

categorical_cols = ['ocean_proximity']
numeric_cols = [x for x in X.columns if x not in ['ocean_proximity'] ]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols),  # One-hot encode 'ocean_proximity'
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
            ('scaler', StandardScaler()),  # Standardize numeric columns
            ('normalizer', Normalizer())  # Normalize numeric columns
        ]), numeric_cols)
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocessing step
])

X_transformed = pipeline.fit_transform(X)

# Convert the transformed data back to a DataFrame
columns = (pipeline.named_steps['preprocessor']
           .named_transformers_['cat'].get_feature_names_out(categorical_cols).tolist() +
           numeric_cols)
X_transformed_df = pd.DataFrame(X_transformed, columns=columns)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [30]:
X_transformed_df

Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,0.0,0.0,0.0,1.0,0.0,-0.370614,0.293778,0.274127,-0.224634,-0.272197,-0.271974,-0.272701,0.654451
1,0.0,0.0,0.0,1.0,0.0,-0.310090,0.244535,-0.142292,0.479581,0.317649,0.201931,0.391459,0.546704
2,0.0,0.0,0.0,1.0,0.0,-0.387358,0.301819,0.539460,-0.155703,-0.241144,-0.238541,-0.245185,0.518104
3,0.0,0.0,0.0,1.0,0.0,-0.440567,0.341997,0.611274,-0.205565,-0.237899,-0.252267,-0.241647,0.307243
4,0.0,0.0,0.0,1.0,0.0,-0.476604,0.369971,0.661273,-0.164733,-0.219120,-0.270699,-0.224140,-0.004589
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,0.0,1.0,0.0,0.0,0.0,-0.304887,0.723878,-0.116191,-0.178789,-0.157042,-0.205953,-0.178172,-0.488625
20636,0.0,1.0,0.0,0.0,0.0,-0.277883,0.613086,-0.286935,-0.301635,-0.314002,-0.320541,-0.342268,-0.234734
20637,0.0,1.0,0.0,0.0,0.0,-0.330415,0.713302,-0.370984,-0.070196,-0.050585,-0.148232,-0.069813,-0.458327
20638,0.0,1.0,0.0,0.0,0.0,-0.342836,0.697831,-0.331757,-0.139548,-0.120624,-0.237195,-0.154520,-0.413849


In [37]:
data.groupby('ocean_proximity')['median_house_value'].mean()

ocean_proximity
<1H OCEAN     240084.285464
INLAND        124805.392001
ISLAND        380440.000000
NEAR BAY      259212.311790
NEAR OCEAN    249433.977427
Name: median_house_value, dtype: float64

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']