In [1]:
from pathlib import Path

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Locations of the raw train/test CSV files, relative to the notebook.
train_file_path = "../data/raw/train.csv"
test_file_path = "../data/raw/test.csv"

# Read both raw files into DataFrames in one step.
train, test = map(pd.read_csv, (train_file_path, test_file_path))

Outliers

In [None]:
# Index labels of training rows whose GrLivArea exceeds 4000; these rows are
# treated as outliers and removed from the training frame in place.
large_area_mask = train['GrLivArea'] > 4000
outliers = train.index[large_area_mask]
train.drop(index=outliers, inplace=True)

Missing Values:

In [4]:
def create_summary_table(df):
    """Summarise the share of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (indexed by column label) with two
        columns: ``'Column'`` (the column label) and ``'Missing %'`` (the
        percentage of null entries), sorted by ``'Missing %'`` descending.
    """
    # Fraction of nulls per column, expressed as a percentage.
    missing_pct = df.isnull().mean() * 100
    table = pd.DataFrame({'Column': df.columns, 'Missing %': missing_pct})
    return table.sort_values(by='Missing %', ascending=False)

# Identify the training columns in which more than 10% of values are missing.
summary_table = create_summary_table(train)
high_missing_mask = summary_table['Missing %'] > 10
columns_highmissing = summary_table.loc[high_missing_mask, 'Column'].tolist()
print(columns_highmissing)

# Remove those columns from both frames so train and test keep the same features.
train.drop(columns=columns_highmissing, inplace=True)
test.drop(columns=columns_highmissing, inplace=True)

['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu', 'LotFrontage']


Separate the target (SalePrice) from the features

In [5]:
# Fail early if the target column is absent from the training data.
if 'SalePrice' not in train.columns:
    raise KeyError("SalePrice not found in train dataset.")

# Target vector and feature matrix are independent copies of the training data;
# the test features are likewise copied so later steps leave `test` untouched.
y_train = train['SalePrice'].copy()
X_train = train.drop(columns=['SalePrice']).copy()
X_test = test.copy()

Separate numerical and categorical columns, then impute missing numerical values

In [6]:
# Column groups are derived from the training features only.
cat_cols = X_train.select_dtypes(include=['object']).columns
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

# Learn the per-column means on the training set, then fill missing numeric
# values in both sets with those training means.
num_imputer = SimpleImputer(strategy='mean')
num_imputer.fit(X_train[num_cols])
X_train[num_cols] = num_imputer.transform(X_train[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

One Hot Encoding

In [7]:
# Fit the encoder on the training categories only. With handle_unknown='ignore',
# a category that was not seen during fitting is encoded as all zeros for that
# feature instead of raising an error.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
train_encoded = encoder.fit_transform(X_train[cat_cols])
test_encoded = encoder.transform(X_test[cat_cols])

# Both encoded matrices come from the same fitted encoder, so they share one
# set of output columns in the same order; no extra alignment step is needed.
encoded_feature_names = encoder.get_feature_names_out(cat_cols)
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_feature_names, index=X_train.index)
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_feature_names, index=X_test.index)

# Replace the raw categorical columns with their one-hot encoded counterparts.
# The encoded frames reuse the original row index, so concat aligns row by row.
X_train = pd.concat([X_train.drop(columns=cat_cols), train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop(columns=cat_cols), test_encoded_df], axis=1)

In [8]:
# Persist the processed features and target as CSV files.
# The output directory is created first so the export also works when
# ../data/processed does not exist yet (to_csv does not create directories).
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

X_train.to_csv(processed_dir / 'X_train.csv', index=False)
y_train.to_csv(processed_dir / 'y_train.csv', index=False)
X_test.to_csv(processed_dir / 'X_test.csv', index=False)