## Data Loading and Exploration

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
train_df = pd.read_csv('train.csv')   #Load the data
test_df = pd.read_csv('test.csv')
y = train_df['SalePrice']
train_df = train_df.drop('SalePrice', axis=1) #the target variable

all_data = pd.concat([train_df, test_df], axis=0, ignore_index=True)# train and test for preprocessing

## Exploration Missing Value Analysis

In [7]:
train.shape, test.shape
train.info()
train.describe()
# Analyze missing values
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    return mis_val_table_ren_columns

missing_df = missing_values_table(all_data)
print(missing_df.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 77 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   int32  
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

## Handling Missing Values


In [8]:
# Features where NA means "None"
none_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
                 'BsmtFinType2', 'MasVnrType']

for feature in none_features:
    all_data[feature] = all_data[feature].fillna('None')

# Features to fill with 0
zero_features = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1',
                 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
                 'BsmtHalfBath', 'MasVnrArea']

for feature in zero_features:
    all_data[feature] = all_data[feature].fillna(0)

# Features to fill with mode
mode_features = ['Electrical', 'MSZoning', 'Functional', 'Utilities',
                 'SaleType', 'KitchenQual', 'Exterior1st', 'Exterior2nd']

for feature in mode_features:
    all_data[feature] = all_data[feature].fillna(all_data[feature].mode()[0])

# LotFrontage: Fill with median of the neighborhood
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))

## Feature Engineering

In [9]:
# Create new features
# Total square footage
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# Total bathrooms
all_data['TotalBathrooms'] = (all_data['FullBath'] + 
                               (0.5 * all_data['HalfBath']) + 
                               all_data['BsmtFullBath'] + 
                               (0.5 * all_data['BsmtHalfBath']))

# Total porch square footage
porch_features = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
all_data['TotalPorchSF'] = all_data[porch_features].sum(axis=1)

# Has features
all_data['HasPool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['HasGarage'] = all_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['HasBsmt'] = all_data['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['HasFireplace'] = all_data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

# Age features
all_data['Age'] = all_data['YrSold'] - all_data['YearBuilt']
all_data['AgeRemod'] = all_data['YrSold'] - all_data['YearRemodAdd']

# Quality features
all_data['OverallScore'] = all_data['OverallQual'] * all_data['OverallCond']
all_data['GarageScore'] = all_data['GarageQual'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 
                                                       'Fa': 2, 'Po': 1, 'None': 0}) * \
                         all_data['GarageCond'].map({'Ex': 5, 'Gd': 4, 'TA': 3, 
                                                     'Fa': 2, 'Po': 1, 'None': 0})

## Encoding Categorical Variables

In [10]:
# Ordinal encoding for features with clear ordering
ordinal_features = {
    'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'FireplaceQu': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'PoolQC': {'None': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtFinType2': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'Functional': {'Typ': 0, 'Min1': 1, 'Min2': 2, 'Mod': 3, 'Maj1': 4, 'Maj2': 5, 'Sev': 6, 'Sal': 7},
    'Fence': {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4},
}

for feature, mapping in ordinal_features.items():
    all_data[feature] = all_data[feature].map(mapping)

# One-hot encoding for nominal features
nominal_features = all_data.select_dtypes(include=['object']).columns
all_data = pd.get_dummies(all_data, columns=nominal_features, drop_first=True)

## Outlier Detection and Removal

In [11]:
# Remove outliers from training data (only for specific features)
def remove_outliers(df, features, n_std=3):
    outlier_indices = []
    for col in features:
        mean = df[col].mean()
        sd = df[col].std()
        outlier_list_col = df[(df[col] < mean - n_std * sd) | 
                              (df[col] > mean + n_std * sd)].index
        outlier_indices.extend(outlier_list_col)
    
    outlier_indices = list(set(outlier_indices))
    return outlier_indices

# Apply to training data only
train_indices = list(range(len(train_df)))
outlier_features = ['GrLivArea', 'TotalBsmtSF', '1stFlrSF', 'TotalSF']
outlier_idx = remove_outliers(all_data.iloc[train_indices], outlier_features)

# Remove outliers from training data
clean_train_indices = [idx for idx in train_indices if idx not in outlier_idx]

## Feature Scaling and Transformation

In [12]:
# Log transform skewed features
from scipy import stats
from scipy.stats import skew

# Identify numeric features
numeric_features = all_data.select_dtypes(include=[np.number]).columns

# Calculate skewness
skewed_features = all_data[numeric_features].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew': skewed_features})

# Transform features with high skewness
skewed_features = skewness[abs(skewness['Skew']) > 0.75].index
for feat in skewed_features:
    all_data[feat] = np.log1p(all_data[feat])

# Also transform target variable
y_log = np.log1p(y)

In [13]:
# Remove features with low variance
from sklearn.feature_selection import VarianceThreshold

# Apply variance threshold
variance_selector = VarianceThreshold(threshold=0.01)
all_data_variance = variance_selector.fit_transform(all_data)

# Get selected feature names
selected_features = all_data.columns[variance_selector.get_support()]
all_data_selected = all_data[selected_features]

## Final Data Preparation

In [14]:
# Split back to train and test
train_processed = all_data_selected.iloc[:len(train_df)]
test_processed = all_data_selected.iloc[len(train_df):]

# Apply clean indices (remove outliers) to training data
train_processed_clean = train_processed.iloc[clean_train_indices]
y_clean = y_log.iloc[clean_train_indices]

# Scale the features
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_processed_clean)
test_scaled = scaler.transform(test_processed)

# Create train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    train_scaled, y_clean, test_size=0.2, random_state=42
)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Test set shape: {test_scaled.shape}")

Training set shape: (1148, 155)
Validation set shape: (288, 155)
Test set shape: (1459, 155)
