In [16]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [17]:
# Locations of the raw train/test CSV files, relative to the notebook.
train_file_path = "../data/raw/train.csv"
test_file_path = "../data/raw/test.csv"

# Read both files with the same reader call.
train, test = (pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path))

Outliers

In [18]:

# Disabled: hand-picked thresholds on GrLivArea and SalePrice. None of these
# lines run; the IQR-based cells that follow are what actually filter `train`.
# train.drop(train[train['GrLivArea'] > 4000].index, inplace=True)
# train.drop(train[train['SalePrice'] < 40000].index, inplace=True)
# train.drop(train[train['SalePrice'] > 500000].index, inplace=True)

In [19]:
# Scan every non-object column of `train` and record the values that fall
# outside the 1.5 * IQR fences, keyed by column name.
outlier_dict = {}
non_object_columns = [name for name in train.columns if train[name].dtype != 'object']
for name in non_object_columns:
    values = train[name]
    quartiles = values.quantile([0.25, 0.75])
    first_quartile, third_quartile = quartiles.iloc[0], quartiles.iloc[1]
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread
    # Comparisons with NaN are False, so missing values are never recorded.
    beyond_fences = (values < fence_low) | (values > fence_high)
    outlier_dict[name] = values[beyond_fences].tolist()
print(outlier_dict)

{'Id': [], 'MSSubClass': [190, 190, 160, 180, 160, 190, 160, 190, 160, 190, 160, 160, 160, 160, 160, 160, 160, 160, 160, 190, 160, 190, 190, 190, 190, 160, 160, 160, 190, 160, 160, 180, 180, 190, 180, 160, 160, 160, 190, 190, 160, 160, 160, 180, 160, 190, 190, 180, 160, 160, 160, 160, 190, 190, 190, 160, 160, 160, 160, 160, 190, 160, 160, 160, 160, 160, 190, 160, 160, 190, 160, 160, 190, 160, 180, 190, 160, 160, 160, 160, 160, 190, 160, 160, 190, 190, 160, 160, 160, 160, 190, 160, 180, 160, 160, 160, 160, 160, 160, 190, 190, 180, 180], 'LotFrontage': [112.0, 115.0, 24.0, 21.0, 121.0, 122.0, 24.0, 120.0, 134.0, 141.0, 24.0, 24.0, 174.0, 21.0, 21.0, 174.0, 21.0, 21.0, 120.0, 129.0, 140.0, 120.0, 118.0, 116.0, 150.0, 21.0, 130.0, 21.0, 24.0, 21.0, 137.0, 21.0, 21.0, 24.0, 130.0, 24.0, 21.0, 21.0, 21.0, 120.0, 24.0, 24.0, 144.0, 114.0, 24.0, 21.0, 128.0, 116.0, 149.0, 21.0, 313.0, 24.0, 24.0, 24.0, 122.0, 130.0, 121.0, 21.0, 115.0, 21.0, 21.0, 21.0, 120.0, 24.0, 24.0, 24.0, 114.0, 168.0, 1

In [20]:
# Remove outliers defined in outlier_dict.
# A row is removed when, for any scanned column, its value equals one of the
# values recorded for that column. One vectorised membership test per column
# replaces the original filter-and-drop per outlier value (the recorded lists
# contain many repeated values), and the frame is modified only once.
# NOTE(review): outlier_dict is built from every non-object column, which
# includes the SalePrice target; confirm that filtering on it is intended.
rows_to_drop = pd.Series(False, index=train.index)
for col, outliers in outlier_dict.items():
    if outliers:
        rows_to_drop = rows_to_drop | train[col].isin(outliers)
train.drop(index=train.index[rows_to_drop.to_numpy()], inplace=True)



Convert MSSubClass to string (treat it as categorical)

In [21]:
# Cast MSSubClass to string in both frames so later dtype-based selection
# picks it up as an object column instead of a number.
for frame in (train, test):
    frame['MSSubClass'] = frame['MSSubClass'].astype(str)

Missing Values

In [22]:
def create_summary_table(df):
    """Summarise the share of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (indexed by column name) with the columns
        ``'Column'`` and ``'Missing %'``, sorted by ``'Missing %'`` descending.
    """
    missing_pct = df.isnull().mean() * 100
    summary = pd.DataFrame({'Column': df.columns, 'Missing %': missing_pct})
    return summary.sort_values(by='Missing %', ascending=False)
# Columns with more than 10% missing values in `train` are removed from both
# frames; the list is derived from `train` only.
summary_table = create_summary_table(train)
exceeds_threshold = summary_table['Missing %'] > 10
high_missing = summary_table.loc[exceeds_threshold, 'Column'].tolist()
print(high_missing)
for frame in (train, test):
    frame.drop(columns=high_missing, inplace=True)

['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu', 'LotFrontage']


Separate the SalePrice target from the features

In [23]:
# Fail fast if the target is missing, then split `train` into features and
# target. All three results are copies, so later edits leave `train` and
# `test` untouched.
if 'SalePrice' not in train.columns:
    raise KeyError("SalePrice not found in train dataset.")
y_train = train['SalePrice'].copy()
X_train = train.drop(columns=['SalePrice']).copy()
X_test = test.copy()

Feature Engineering

In [24]:
# Build the same derived columns on both frames so their schemas stay aligned.
for frame in (X_train, X_test):
    # Sum of the four bathroom counters (half baths are added unweighted).
    above_grade_baths = frame['FullBath'] + frame['HalfBath']
    frame['TotalBath'] = above_grade_baths + frame['BsmtFullBath'] + frame['BsmtHalfBath']

    # Basement plus first and second floor area.
    frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']

    # Years between construction / last remodel and the sale.
    sale_year = frame['YrSold']
    frame['Age'] = sale_year - frame['YearBuilt']
    frame['AgeSinceRemodel'] = sale_year - frame['YearRemodAdd']

    # Interaction between overall quality and above-ground living area.
    frame['OverallQual*GrLivArea'] = frame['OverallQual'] * frame['GrLivArea']

# The source columns are removed once the derived features exist.
drop_cols = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'YearBuilt', 'YrSold', 'YearRemodAdd', 'OverallQual']
for frame in (X_train, X_test):
    frame.drop(columns=drop_cols, inplace=True)

In [25]:
def encode_interior_features(X_train, X_test):
    """Replace ``HeatingQC`` and ``KitchenQual`` with integer quality scores.

    Both columns use the same scale (``NA``=0, ``Po``=1, ``Fa``=2, ``TA``=3,
    ``Gd``=4, ``Ex``=5). Missing values and labels outside the scale become 0.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames containing both columns; they are modified in place.

    Returns
    -------
    tuple
        The same ``(X_train, X_test)`` objects that were passed in.
    """
    quality_scale = {"NA": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
    for column in ("HeatingQC", "KitchenQual"):
        for frame in (X_train, X_test):
            labels = frame[column].fillna("NA")
            # Labels missing from the scale map to NaN first, then to 0.
            frame[column] = labels.map(quality_scale).fillna(0).astype(int)
    return X_train, X_test

def encode_basement_features(X_train, X_test):
    """Replace the five basement label columns with integer codes.

    ``BsmtQual``/``BsmtCond`` use the quality scale, ``BsmtExposure`` the
    exposure scale and ``BsmtFinType1``/``BsmtFinType2`` the finish scale
    defined below. Missing values and labels outside a scale become 0.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames containing all five columns; they are modified in place.

    Returns
    -------
    tuple
        The same ``(X_train, X_test)`` objects that were passed in.
    """
    quality_scale = {"NA": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
    exposure_scale = {"NA": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}
    finish_scale = {"NA": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
    column_scales = (
        ("BsmtQual", quality_scale),
        ("BsmtCond", quality_scale),
        ("BsmtExposure", exposure_scale),
        ("BsmtFinType1", finish_scale),
        ("BsmtFinType2", finish_scale),
    )
    for column, scale in column_scales:
        for frame in (X_train, X_test):
            labels = frame[column].fillna("NA")
            frame[column] = labels.map(scale).fillna(0).astype(int)
    return X_train, X_test

def encode_exterior_features(X_train, X_test):
    """Encode exterior-related columns and add ``TotalPorchSF``.

    Steps, applied to both frames in place:

    * ``ExterQual``, ``PavedDrive``, ``RoofMatl`` and ``Foundation`` are
      replaced by the integer codes of the lookup tables below; missing values
      and labels outside a table become 0.
    * ``Exterior1st`` and ``Exterior2nd`` are replaced by pandas category
      codes, using the categories found in ``X_train`` (after filling missing
      values with ``"NA"``). Test labels not present in train get code -1.
    * ``TotalPorchSF`` is added as the sum of the four porch area columns.
    * Missing ``MasVnrArea`` values are filled with 0.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames containing the columns listed above.

    Returns
    -------
    tuple
        The same ``(X_train, X_test)`` objects that were passed in.
    """
    frames = (X_train, X_test)

    # Lookup-table encodings.
    lookup_tables = (
        ("ExterQual", {"NA": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}),
        ("PavedDrive", {"NA": 0, "N": 1, "P": 2, "Y": 3}),
        ("RoofMatl", {"NA": 0, "Roll": 1, "Tar&Grv": 2, "Metal": 3, "Membran": 4, "ClyTile": 5, "WdShngl": 6, "WdShake": 7, "CompShg": 8}),
        ("Foundation", {"NA": 0, "Wood": 1, "Slab": 2, "BrkTil": 3, "CBlock": 4, "PConc": 5, "Stone": 6}),
    )
    for column, table in lookup_tables:
        for frame in frames:
            frame[column] = frame[column].fillna("NA").map(table).fillna(0).astype(int)

    # Category codes, with the category set fixed by the training frame.
    for column in ("Exterior1st", "Exterior2nd"):
        train_labels = X_train[column].fillna("NA").astype("category")
        test_labels = X_test[column].fillna("NA").astype("category")
        test_labels = test_labels.cat.set_categories(train_labels.cat.categories)
        X_train[column] = train_labels.cat.codes
        X_test[column] = test_labels.cat.codes

    # Combined porch area and masonry veneer area default.
    for frame in frames:
        frame["TotalPorchSF"] = frame["OpenPorchSF"] + frame["EnclosedPorch"] + frame["3SsnPorch"] + frame["ScreenPorch"]
    for frame in frames:
        frame["MasVnrArea"] = frame["MasVnrArea"].fillna(0)
    return X_train, X_test

def encode_garage_features(X_train, X_test):
    """Replace ``GarageFinish``, ``GarageQual`` and ``GarageType`` with integer codes.

    Each column is translated through its own lookup table (defined below).
    Missing values and labels outside the table become 0.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames containing the three columns; they are modified in place.

    Returns
    -------
    tuple
        The same ``(X_train, X_test)`` objects that were passed in.
    """
    finish_codes = {"NA": 0, "Unf": 1, "RFn": 2, "Fin": 3}
    quality_codes = {"NA": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
    type_codes = {"NA": 0, "CarPort": 1, "Detchd": 2, "Attchd": 3, "Basment": 4, "BuiltIn": 5, "2Types": 6}
    column_codes = (
        ("GarageFinish", finish_codes),
        ("GarageQual", quality_codes),
        ("GarageType", type_codes),
    )
    for column, codes in column_codes:
        for frame in (X_train, X_test):
            labels = frame[column].fillna("NA")
            frame[column] = labels.map(codes).fillna(0).astype(int)
    return X_train, X_test

def encode_all_features(X_train, X_test):
    """Run the interior, basement, exterior and garage encoders in that order.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames handed to each encoder in turn.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` as returned by the last encoder.
    """
    encoding_stages = (
        encode_interior_features,
        encode_basement_features,
        encode_exterior_features,
        encode_garage_features,
    )
    for stage in encoding_stages:
        X_train, X_test = stage(X_train, X_test)
    return X_train, X_test


In [26]:
# Apply the interior, basement, exterior and garage encoders. The helpers
# modify the frames they receive and return them, so the names are rebound to
# the same (now encoded) objects.
X_train, X_test = encode_all_features(X_train, X_test)

One Hot Encoding

In [27]:
# Column groups are chosen by dtype on the training frame.
# NOTE(review): only float64/int64 columns are treated as numeric; the recorded
# dtype summary further down shows int32 and int8 columns that this selection
# left out, so the result depends on the platform's default integer width.
# Confirm whether those encoded columns should be imputed as well.
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

# Mean imputation: the means are learned from train and reused for test.
num_imputer = SimpleImputer(strategy='mean')

X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

# One-hot encode the object columns; categories seen only in test are ignored
# (all-zero indicator row) instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
train_encoded = encoder.fit_transform(X_train[cat_cols])
test_encoded = encoder.transform(X_test[cat_cols])

# Both frames take their column names from the same fitted encoder, so they
# are aligned by construction and no extra column reconciliation is needed.
# The original index is passed through so the concat below aligns row by row.
encoded_cols = encoder.get_feature_names_out(cat_cols)
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_cols, index=X_train.index)
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_cols, index=X_test.index)

# Swap the raw object columns for their indicator columns.
X_train.drop(columns=cat_cols, inplace=True)
X_test.drop(columns=cat_cols, inplace=True)
X_train = pd.concat([X_train, train_encoded_df], axis=1)
X_test = pd.concat([X_test, test_encoded_df], axis=1)


In [28]:
# Sanity check: list any object-dtype columns still present in X_test.
remaining_object_cols = X_test.select_dtypes(include=['object']).columns
print(remaining_object_cols)


Index([], dtype='object')


In [29]:
# Count how many X_test columns have each dtype (only X_test is inspected here)
print(X_test.dtypes.value_counts())



float64    143
int32       14
int8         2
Name: count, dtype: int64


In [30]:
# Write the engineered features and the target to the processed-data folder,
# without the index column.
processed_outputs = (
    (X_train, '../data/processed/X_train_eng.csv'),
    (y_train, '../data/processed/y_train_eng.csv'),
    (X_test, '../data/processed/X_test_eng.csv'),
)
for data, csv_path in processed_outputs:
    data.to_csv(csv_path, index=False)

In [31]:
# Combine the engineered training features with the target and save them.
# Note that this rebinds `train` to the engineered frame.
train = pd.concat([X_train, y_train], axis=1)
train.to_csv('../data/processed/train_eng.csv', index=False)
# Save the engineered test features. `test` only received the MSSubClass cast
# and the high-missing column drop; every later transformation was applied to
# X_test, so X_test is the frame whose columns match train_eng.csv.
X_test.to_csv('../data/processed/test_eng.csv', index=False)