In [125]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.impute import SimpleImputer

In [127]:
# Load the dataset
housing_data = pd.read_csv("housing_data.csv")

In [129]:
# Display initial dataset information
print("Initial Data Shape:", housing_data.shape)
print(housing_data.head())

Initial Data Shape: (1460, 80)
  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape LandContour  \
0       SC60       RL           65     8450   Pave   NaN      Reg         Lvl   
1       SC20       RL           80     9600   Pave   NaN      Reg         Lvl   
2       SC60       RL           68    11250   Pave   NaN      IR1         Lvl   
3       SC70       RL           60     9550   Pave   NaN      IR1         Lvl   
4       SC60       RL           84    14260   Pave   NaN      IR1         Lvl   

  Utilities LotConfig  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0    AllPub    Inside  ...        0     No    No          No       0    Feb   
1    AllPub       FR2  ...        0     No    No          No       0    May   
2    AllPub    Inside  ...        0     No    No          No       0    Sep   
3    AllPub    Corner  ...        0     No    No          No       0    Feb   
4    AllPub       FR2  ...        0     No    No          No       0    Dec   

   YrSo

In [155]:
# Identify numerical and categorical columns
numeric_columns = housing_data.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = housing_data.select_dtypes(include=['object']).columns

In [160]:
#Impute missing values in numerical columns with the median
num_imputer = SimpleImputer(strategy='median')
housing_data[numeric_columns] = num_imputer.fit_transform(housing_data[numeric_columns])

#Impute missing values in categorical columns with the mode
cat_imputer = SimpleImputer(strategy='most_frequent')
housing_data[categorical_columns] = cat_imputer.fit_transform(housing_data[categorical_columns])

# Create New Features

In [131]:
# 1. Total Square Footage Feature
housing_data['TotalSF'] = housing_data[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']].sum(axis=1)

In [133]:
# 2. House Age Feature
current_year = 2024
housing_data['HouseAge'] = current_year - housing_data['YearBuilt']

In [139]:
# 3. Binning House Age into Categories
bins = [0, 10, 20, 30, 40, np.inf]
labels = ['0-10', '11-20', '21-30', '31-40', '40+']
housing_data['HouseAgeGroup'] = pd.cut(housing_data['HouseAge'], bins=bins, labels=labels)

In [145]:
# 4. One-Hot Encoding for 'Neighborhood' Feature
encoder = OneHotEncoder(sparse_output=False, drop='first')  # Avoiding dummy variable trap
encoded_neighborhood = encoder.fit_transform(housing_data[['Neighborhood']])
neighborhood_df = pd.DataFrame(encoded_neighborhood, columns=encoder.get_feature_names_out(['Neighborhood']))
housing_data = pd.concat([housing_data, neighborhood_df], axis=1)

In [147]:
# 5. Interaction Feature: Overall Quality × Living Area
housing_data['OverallQual_GrLivArea'] = housing_data['OverallQual'] * housing_data['GrLivArea']

In [149]:
#Log Transformation of 'SalePrice'
housing_data['LogSalePrice'] = np.log1p(housing_data['SalePrice'])

# **Feature Aggregation**

In [151]:
 # Aggregating Numerical Features
housing_data['TotalSF'] = housing_data[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']].sum(axis=1)

#  Aggregating Categorical Features
housing_data['AllNeighborhoods'] = housing_data[['Neighborhood', 'Condition1', 'Condition2']].astype(str).agg('_'.join, axis=1)
# Aggregating by 'MasVnrArea'
data_aggregated = housing_data.groupby('MasVnrArea').agg({'LotFrontage': 'mean', 'LotArea': 'count'}).reset_index()

# Handling Missing Values

In [164]:
#Impute missing values in numerical columns with the median
num_imputer = SimpleImputer(strategy='median')
housing_data[numeric_columns] = num_imputer.fit_transform(housing_data[numeric_columns])

#Impute missing values in categorical columns with the mode
cat_imputer = SimpleImputer(strategy='most_frequent')
housing_data[categorical_columns] = cat_imputer.fit_transform(housing_data[categorical_columns])

In [166]:
# Print confirmation
print("Missing values handled successfully!")
print(housing_data.isnull().sum())  # Verify no missing values remain

Missing values handled successfully!
MSSubClass              0
MSZoning                0
LotFrontage             0
LotArea                 0
Street                  0
                       ..
Neighborhood_SawyerW    0
Neighborhood_Somerst    0
Neighborhood_StoneBr    0
Neighborhood_Timber     0
Neighborhood_Veenker    0
Length: 182, dtype: int64


In [170]:
# Drop any remaining missing values
housing_data.dropna(inplace=True)


In [174]:
#Feature Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(housing_data.select_dtypes(include=['number']))

In [176]:
#Dimensionality Reduction using PCA
pca = PCA(n_components=5)  # Reduce to 5 principal components
data_pca = pca.fit_transform(scaled_features)

In [178]:
#Polynomial Feature Interactions
poly = PolynomialFeatures(degree=2, interaction_only=True)
data_interactions = poly.fit_transform(housing_data[['YearRemodAdd']])

In [186]:
#Final Data 
print("Final Data Shape:", housing_data.shape)
print("Final Data Sample:")
print(housing_data.head())

Final Data Shape: (1460, 182)
Final Data Sample:
  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape LandContour  \
0       SC60       RL         65.0   8450.0   Pave  Grvl      Reg         Lvl   
1       SC20       RL         80.0   9600.0   Pave  Grvl      Reg         Lvl   
2       SC60       RL         68.0  11250.0   Pave  Grvl      IR1         Lvl   
3       SC70       RL         60.0   9550.0   Pave  Grvl      IR1         Lvl   
4       SC60       RL         84.0  14260.0   Pave  Grvl      IR1         Lvl   

  Utilities LotConfig  ... Neighborhood_NoRidge Neighborhood_NridgHt  \
0    AllPub    Inside  ...                  0.0                  0.0   
1    AllPub       FR2  ...                  0.0                  0.0   
2    AllPub    Inside  ...                  0.0                  0.0   
3    AllPub    Corner  ...                  0.0                  0.0   
4    AllPub       FR2  ...                  1.0                  0.0   

  Neighborhood_OldTown Neighbor

In [184]:
housing_data.to_csv("housing_data1.csv", index=False)