In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from google.colab import drive
import zipfile
─
# Attach Google Drive to the Colab runtime so the archive below is reachable.
drive.mount('/content/drive')

# Source archive on Drive and the local directory it is unpacked into.
zip_path = "/content/drive/MyDrive/house-prices-advanced-regression-techniques.zip"
extract_dir = "/content/house-prices-data"

# Unpack every member of the archive; the context manager closes the file handle.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

# Raw dataset: train.csv as extracted above, loaded without any cleaning.
df = pd.read_csv("/content/house-prices-data/train.csv")

# Drop the "Id" column (treated here as non-informative for modelling).
# Re-assignment is used instead of inplace=True so the statement stays
# chainable and does not mutate an object other names may still reference.
df = df.drop(columns=["Id"])

# Split features and target
y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])

# Identify column types.
# Select by "numeric vs. everything else" rather than by an explicit dtype
# whitelist: with a whitelist such as ["int64", "float64"] / ["object"], a
# column of any other dtype (int32, nullable Int64, a dedicated string dtype,
# bool, ...) lands in neither list, and a ColumnTransformer with the default
# remainder="drop" would then silently discard it.
# NOTE(review): integer-coded categoricals are still routed to the numeric
# branch, exactly as before — confirm that is intended for this dataset.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

# Custom transformer that caps outliers at the IQR fences (values are clipped; no rows are removed)
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Cap per-feature outliers at the inter-quartile-range (IQR) fences.

    For every feature (column) the fitted bounds are::

        lower = Q1 - factor * IQR
        upper = Q3 + factor * IQR        with IQR = Q3 - Q1

    and ``transform`` clips values to ``[lower, upper]``. Despite the class
    name, samples are never dropped, so the number of rows is unchanged.

    Features whose IQR is exactly zero (Q1 == Q3, e.g. a column that is zero
    for at least three quarters of the samples) are left unclipped: their
    fences would otherwise collapse onto a single value and clipping would
    turn the whole column into a constant.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the IQR when computing the fences. Must be
        non-negative.

    Attributes
    ----------
    Q1_, Q3_ : ndarray
        Per-feature 25th and 75th percentiles seen during ``fit``.
    IQR_ : ndarray
        ``Q3_ - Q1_``.
    lower_, upper_ : ndarray
        Per-feature clipping bounds (``-inf`` / ``+inf`` where ``IQR_ == 0``).
    """

    def __init__(self, factor=1.5):
        # Stored as given; validation is deferred to fit().
        self.factor = factor

    def fit(self, X, y=None):
        """Learn the per-feature clipping bounds from ``X``.

        Parameters
        ----------
        X : array-like
            Numeric training data; percentiles are taken along axis 0.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self : OutlierRemover

        Raises
        ------
        ValueError
            If ``factor`` is negative.
        """
        if self.factor < 0:
            raise ValueError(f"factor must be non-negative, got {self.factor!r}")

        # NaN-aware quartiles: a single missing value must not turn the
        # bounds of its column into NaN (np.clip with NaN bounds yields NaN).
        self.Q1_, self.Q3_ = np.nanpercentile(X, [25, 75], axis=0)
        self.IQR_ = self.Q3_ - self.Q1_

        # A zero IQR gives lower == upper, which would flatten the feature to
        # a constant; disable clipping for those features instead.
        degenerate = self.IQR_ == 0
        self.lower_ = np.where(degenerate, -np.inf, self.Q1_ - self.factor * self.IQR_)
        self.upper_ = np.where(degenerate, np.inf, self.Q3_ + self.factor * self.IQR_)
        return self

    def transform(self, X):
        """Return ``X`` with every feature clipped to its fitted bounds."""
        return np.clip(X, self.lower_, self.upper_)

    def get_feature_names_out(self, input_features=None):
        """Return output feature names (identical to the input names).

        Clipping maps each input column to exactly one output column, so the
        names pass through unchanged. When ``input_features`` is ``None`` the
        generic names ``x0, x1, ...`` are generated from the fitted bounds.
        """
        if input_features is None:
            n_features = np.atleast_1d(self.lower_).shape[0]
            return np.asarray([f"x{i}" for i in range(n_features)], dtype=object)
        return np.asarray(input_features, dtype=object)

# Preprocessing pipelines

# Numeric branch: fill gaps with the column median, clip values to the bounds
# fitted by OutlierRemover, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("outlier_remover", OutlierRemover(factor=1.5)),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode into a dense array; levels not seen during fit are ignored.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each group of columns to its branch; outputs are concatenated in the
# order listed here (numeric block first, categorical block second).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ]
)

# Apply preprocessing
# NOTE(review): the preprocessor is fitted on the complete feature frame; if a
# train/test split is introduced later, fit on the training part only so the
# learned statistics do not see held-out rows.
X_preprocessed = preprocessor.fit_transform(X)


Mounted at /content/drive


In [2]:

# Numeric columns keep their original names after preprocessing.
num_features = num_cols

# Ask the fitted one-hot encoder for the names of the columns it generated
onehot_step = preprocessor.named_transformers_["cat"].named_steps["onehot"]
cat_features = onehot_step.get_feature_names_out(cat_cols)

# Numeric names first, one-hot names second (same order as the transformer list)
feature_names = np.concatenate([num_features, cat_features])

# Rebuild a labelled frame from the preprocessed array and append the target
# as the final column.
X_df = pd.DataFrame(X_preprocessed, columns=feature_names).assign(
    SalePrice=y.values
)

# Save to MyDrive
X_df.to_csv("/content/drive/MyDrive/house_prices_preprocessed.csv", index=False)

In [4]:
# Summary statistics of the target, printed under a heading line.
saleprice_summary = y.describe()
print("Stats of Saleprice:", saleprice_summary, sep="\n")

Stats of Saleprice:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64
