## **Cleaning**

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

#### Load dataset

In [9]:
# Read the training data from the CSV one directory above the notebook.
train_csv_path = "../train.csv"
df = pd.read_csv(train_csv_path)

#### Step 1: Drop columns whose values are mostly missing or uncommon in the data

In [10]:
# Columns removed up front (per the step heading: missing or uncommon values).
drop_cols = ["Alley", "PoolQC", "Fence", "MiscFeature"]
# `df` is reassigned here, so running this cell a second time would otherwise
# raise KeyError because the columns are already gone. errors="ignore" skips
# labels that are no longer present, making the cell safe to re-run.
df = df.drop(columns=drop_cols, errors="ignore")

#### Step 2: Separate the target from the features and group the feature columns into numerical and categorical

In [11]:
# Target vector: the column the model will predict.
y = df["SalePrice"]
# Feature matrix: every remaining column except the target and "Id".
X = df.drop(columns=["SalePrice", "Id"])

# Partition the feature columns into two groups that together cover ALL of X.
# Selecting "number" (rather than only int64/float64) and its complement
# (rather than only object) guarantees that no column of another dtype ends up
# in neither group. A column listed in neither group would be left out by a
# ColumnTransformer built from these lists, because its default remainder is
# "drop". For int64/float64/object data the two groups are unchanged.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

#### Step 3: Preprocessing steps

In [12]:
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])
# Numerical columns: fill gaps with the column median.
num_transformer = SimpleImputer(strategy="median")

# Route each column group to its transformer ("num" first, then "cat").
preprocess = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols),
])

#### Step 4: Transform the cleaned dataset

In [13]:
# Fit the preprocessing steps on every row of X and transform X in one pass.
# NOTE(review): no train/test split is visible before this point, so the
# imputation values and encoder categories are learned from all rows. If a
# split happens later, consider fitting on the training portion only — confirm.
X_clean = preprocess.fit_transform(X)
# Display the (rows, columns) shape of the transformed matrix.
X_clean.shape

(1460, 274)