In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from google.colab import drive
import zipfile

# Mount Google Drive so the zipped dataset under MyDrive can be read.
drive.mount('/content/drive')


zip_path = "/content/drive/MyDrive/house2_data.zip"
extract_dir = "/content/house2_data_unzipped"

# Unpack the archive into extract_dir.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Raw dataset. The path is derived from extract_dir so the extraction target
# and the read location cannot drift apart.
df = pd.read_csv(f"{extract_dir}/housing.csv")


# Drop rows whose target is missing, then split into target (y) and features (X).
df = df.dropna(subset=["median_house_value"])
y = df["median_house_value"].values
X = df.drop(columns=["median_house_value"])

# Identify column types.
# "number" matches every numeric dtype (int32, float32, ...), not only
# int64/float64; a numeric column missing from both lists would never be
# handed to the preprocessing pipelines.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(include=["object"]).columns

# Custom Transformer: Outlier Handling (values beyond the IQR fences are clipped; no rows are removed)
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Cap extreme values column-wise using IQR fences.

    Despite the name, no rows are removed: ``transform`` clips every column
    to ``[Q1 - factor * IQR, Q3 + factor * IQR]``, where the quartiles are
    the ones computed in ``fit``.

    Parameters
    ----------
    factor : float, default=1.5
        Multiple of the inter-quartile range added beyond each quartile to
        place the lower and upper fences.

    Attributes
    ----------
    Q1_, Q3_, IQR_ : per-column quartiles and inter-quartile range.
    lower_, upper_ : per-column clipping bounds.
    n_features_in_ : number of columns seen in ``fit``.
    feature_names_in_ : column names seen in ``fit`` (only set when the
        input exposes a ``columns`` attribute).
    """

    def __init__(self, factor=1.5):
        self.factor = factor

    def fit(self, X, y=None):
        """Compute the per-column quartiles and clipping bounds from X.

        NaN entries are ignored when computing the quartiles, so one missing
        value does not turn a whole column's bounds into NaN.

        Parameters
        ----------
        X : array-like
            Numeric training data; statistics are taken along axis 0.
        y : ignored
            Present so the estimator can be used inside a Pipeline.

        Returns
        -------
        self
        """
        shape = np.shape(X)
        self.n_features_in_ = shape[1] if len(shape) > 1 else 1
        if hasattr(X, "columns"):
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)

        # nanpercentile gives the same result as percentile on NaN-free data.
        self.Q1_ = np.nanpercentile(X, 25, axis=0)
        self.Q3_ = np.nanpercentile(X, 75, axis=0)
        self.IQR_ = self.Q3_ - self.Q1_
        self.lower_ = self.Q1_ - self.factor * self.IQR_
        self.upper_ = self.Q3_ + self.factor * self.IQR_
        return self

    def transform(self, X):
        """Clip X to the bounds computed in ``fit``.

        Values below ``lower_`` become ``lower_`` and values above ``upper_``
        become ``upper_``; the shape of X is unchanged.
        """
        return np.clip(X, self.lower_, self.upper_)

    def get_feature_names_out(self, input_features=None):
        """Return output feature names (identical to the input names).

        Clipping maps each input column to exactly one output column, so the
        names pass through unchanged. Defining this method lets an enclosing
        Pipeline / ColumnTransformer resolve feature names through this step.

        Parameters
        ----------
        input_features : array-like of str or None
            Names of the input columns. When None, the names recorded in
            ``fit`` are used, falling back to ``x0, x1, ...``.

        Returns
        -------
        ndarray of str objects
        """
        if input_features is not None:
            return np.asarray(input_features, dtype=object)
        recorded = getattr(self, "feature_names_in_", None)
        if recorded is not None:
            return np.asarray(recorded, dtype=object)
        return np.asarray(
            [f"x{i}" for i in range(self.n_features_in_)], dtype=object
        )

# Preprocessing Pipelines
# Numeric branch: median imputation -> IQR clipping -> standardisation.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("outlier_remover", OutlierRemover(factor=1.5)),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation -> dense one-hot encoding.
# Categories not seen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each group of columns to its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ]
)

# NOTE(review): the preprocessor is fitted on every row of X. If a train/test
# split is made later, the imputation/clipping/scaling statistics will already
# include the test rows -- confirm this is intended.
X_preprocessed = preprocessor.fit_transform(X)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Numeric columns keep their original names after preprocessing.
num_features = num_cols

# Names of the columns generated by the one-hot encoder.
onehot_encoder = preprocessor.named_transformers_["cat"].named_steps["onehot"]
cat_features = onehot_encoder.get_feature_names_out(cat_cols)

# Numeric names first, then the one-hot names, matching the order in which
# the ColumnTransformer lists its "num" and "cat" branches.
feature_names = np.concatenate((num_features, cat_features))

# Rebuild a labelled frame from the preprocessed array.
X_df = pd.DataFrame(data=X_preprocessed, columns=feature_names)
y_series = pd.Series(data=y, index=X_df.index, name="median_house_value")
# Append the target as the final column.
X_df = X_df.assign(median_house_value=y_series)

# Save to MyDrive (row index is not written).
output_csv_path = "/content/drive/MyDrive/house2_prices_preprocessed.csv"
X_df.to_csv(output_csv_path, index=False)

In [9]:
# Plain-text preview of the preprocessed frame (feature columns plus the target).
print(X_df)

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0      -1.327835  1.052548            0.982143    -1.117285       -1.309916   
1      -1.322844  1.043185           -0.607019     2.329936        2.128690   
2      -1.332827  1.038503            1.856182    -0.697327       -1.095223   
3      -1.337818  1.038503            1.856182    -0.835405       -0.936843   
4      -1.337818  1.038503            1.856182    -0.582857       -0.778463   
...          ...       ...                 ...          ...             ...   
20635  -0.758826  1.801647           -0.289187    -0.555671       -0.447624   
20636  -0.818722  1.806329           -0.845393    -1.248209       -1.236005   
20637  -0.823713  1.778237           -0.924851    -0.134281       -0.056954   
20638  -0.873626  1.778237           -0.845393    -0.416162       -0.324440   
20639  -0.833696  1.750146           -1.004309     0.245613        0.404108   

       population  households  median_income  ocean