In [None]:
import pandas as pd
import numpy as np

# Load the train and test CSVs and stack them (train rows first) so that every
# preprocessing step below is applied to both splits in the same way.
# NOTE(review): concat is called without ignore_index, so each frame keeps its
# own row labels; full_data's index presumably repeats labels across the two
# splits -- rely on positional access rather than label lookups.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
full_data = pd.concat(objs=[train, test], sort=False)

print(train.shape, test.shape, sep="\n")
train.head()


In [None]:
# Per-column count of missing values in the combined frame; the last line
# displays only the columns that have any, largest count first.
missing = full_data.isna().sum()
missing.loc[missing.gt(0)].sort_values(ascending=False)


In [None]:
# Categorical columns whose missing entries are replaced by the literal label
# "None" (presumably NaN means the house lacks that feature -- confirm against
# the data description).
none_fill = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
]
# Fill all listed columns in one vectorised step.
full_data[none_fill] = full_data[none_fill].fillna("None")


In [None]:
# Numeric garage / basement / veneer columns whose missing entries are
# replaced by 0.
zero_fill = ['GarageYrBlt', 'GarageArea', 'GarageCars',
             'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
             'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']
for col in zero_fill:
    full_data[col] = full_data[col].fillna(0)

# LotFrontage: impute with the median of the row's Neighborhood.
# The built-in "median" transform replaces a per-group Python lambda and
# yields one value per row, in row order.
lot_frontage = full_data['LotFrontage']
neighborhood_median = full_data.groupby("Neighborhood")["LotFrontage"].transform("median")
# Computed before imputation so the fallback reflects observed values only.
overall_median = lot_frontage.median()

# np.where works positionally, so no index alignment is involved; observed
# values are kept as they are and only missing ones take the group median.
full_data['LotFrontage'] = np.where(lot_frontage.isna(), neighborhood_median, lot_frontage)
# A neighbourhood with no observed LotFrontage has a NaN median; fall back to
# the overall median so no missing values survive this step.
full_data['LotFrontage'] = full_data['LotFrontage'].fillna(overall_median)


In [None]:
# Combined area feature: basement plus first-floor plus second-floor square feet.
full_data['TotalSF'] = (
    full_data['TotalBsmtSF']
    .add(full_data['1stFlrSF'])
    .add(full_data['2ndFlrSF'])
)


In [None]:
# Age features measured at the year of sale, plus a 0/1 flag that is 1 when
# the remodel year differs from the build year.
full_data['HouseAge'] = full_data['YrSold'].sub(full_data['YearBuilt'])
full_data['RemodAge'] = full_data['YrSold'].sub(full_data['YearRemodAdd'])
full_data['IsRemod'] = full_data['YearBuilt'].ne(full_data['YearRemodAdd']).astype(int)


In [None]:
# Bathroom count across basement and above-ground levels; each half bath
# contributes 0.5.
full_data['TotalBath'] = (
    full_data['BsmtFullBath']
    .add(full_data['FullBath'])
    .add(full_data['BsmtHalfBath'].add(full_data['HalfBath']).mul(0.5))
)


In [None]:
# Total porch area: the four porch columns added together.
full_data['TotalPorchSF'] = (
    full_data['OpenPorchSF']
    .add(full_data['EnclosedPorch'])
    .add(full_data['3SsnPorch'])
    .add(full_data['ScreenPorch'])
)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Quality / condition columns that are encoded as ordered integers.
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual',
                'GarageCond', 'PoolQC']

# Explicit rank for each label. LabelEncoder numbers labels in sorted
# (alphabetical) order -- Ex=0, Fa=1, Gd=2, None=3, Po=4, TA=5 -- which does
# not follow the quality ordering, so the codes carried no ordinal meaning.
# NOTE(review): assumes the usual scale Po < Fa < TA < Gd < Ex with "None"
# (feature absent) lowest -- confirm against the data description.
quality_rank = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

for col in ordinal_cols:
    # Any value still missing at this point is treated as "None" instead of
    # becoming a separate 'nan' class, as astype(str) would have made it.
    levels = full_data[col].fillna('None')
    unknown = set(levels.unique()) - set(quality_rank)
    if unknown:
        # Fail loudly rather than silently mapping unexpected labels to NaN
        # (this also catches an accidental second run of this cell).
        raise ValueError(
            f"Unexpected quality levels in {col!r}: {sorted(map(str, unknown))}"
        )
    full_data[col] = levels.map(quality_rank).astype(int)


In [None]:
# One-hot encode every remaining non-numeric (object / string / category)
# column; drop_first omits the first level of each to avoid a redundant
# indicator.
full_data = pd.get_dummies(data=full_data, drop_first=True)


In [None]:
from scipy.stats import skew

# Choose the columns that are eligible for a skew-reducing log transform.
# This cell runs after one-hot encoding, so a plain "dtype != object" filter
# would also pick up indicator columns; those are excluded here, as is the
# target, which is transformed separately from the features.
numeric_frame = (
    full_data
    .select_dtypes(include=[np.number])          # bool indicator columns are not np.number
    .drop(columns=['SalePrice'], errors='ignore')
)
has_many_levels = numeric_frame.nunique() > 2    # drops integer-typed 0/1 indicators
# log1p returns -inf / NaN for values <= -1, so only non-negative columns
# qualify (an all-NaN column has a NaN minimum and is excluded too).
is_non_negative = numeric_frame.min() >= 0
num_feats = numeric_frame.columns[(has_many_levels & is_non_negative).to_numpy()]

# Sample skewness per eligible column, ignoring missing values.
skewed_feats = full_data[num_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
high_skew = skewed_feats[skewed_feats > 0.75]

# Apply log1p transformation to the strongly right-skewed features
for feat in high_skew.index:
    full_data[feat] = np.log1p(full_data[feat])


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the main size features.
scaler = StandardScaler()
scaled_features = ['GrLivArea', 'TotalSF', 'LotArea', 'TotalBath']

# Learn the mean / variance from the training rows only (the first len(train)
# rows of full_data, since train was concatenated first) so that test-set
# statistics do not leak into the transform; then apply it to every row.
scaler.fit(full_data.iloc[:len(train)][scaled_features])
full_data[scaled_features] = scaler.transform(full_data[scaled_features])


In [None]:
# Split the combined frame back into its two parts by position: train rows
# were concatenated first, so the first len(train) rows are the training set.
# Independent copies are taken so that later edits to either frame neither
# write through to full_data nor raise SettingWithCopyWarning.
n_train = len(train)
train_prepared = full_data.iloc[:n_train].copy()
test_prepared = full_data.iloc[n_train:].copy()
# NOTE(review): full_data was built from train, so the SalePrice column is
# still present in both frames (presumably NaN for the test rows) -- make sure
# it is dropped before these frames are used as model features.

# Target: log1p of the raw SalePrice from the untouched train frame
# (reduces skew).
y = np.log1p(train['SalePrice'])
