Importing Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Load datasets

In [2]:
# Read the training data into a DataFrame; the relative path means train.csv
# must sit in the notebook's working directory.
train = pd.read_csv('train.csv')

# Basic description of dataset

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Per-column dtype and non-null count; columns whose count is below the row
# total contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


Check missing values

In [6]:
# Tally nulls per column, order from most to least affected, then display
# only the columns that actually have gaps.
missing = train.isna().sum().sort_values(ascending=False)
missing.loc[missing > 0]

Unnamed: 0,0
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
MasVnrType,872
FireplaceQu,690
LotFrontage,259
GarageQual,81
GarageFinish,81
GarageType,81


Handling missing values in Garage Features

In [7]:
# Garage columns are split by type because each kind needs a different filler.
# A null here is assumed to mean "no garage" rather than an unrecorded value.
garage_cat = ['GarageCond', 'GarageFinish', 'GarageType']  # categorical descriptors
garage_num = ['GarageYrBlt']                               # numeric build year

# Categorical descriptors receive the sentinel label 'None'.
train[garage_cat] = train[garage_cat].fillna('None')

# The build year receives 0 as the "no garage" placeholder.
# NOTE(review): 0 lies far outside the range of real build years — confirm
# downstream steps are meant to treat it as an ordinary numeric value.
train[garage_num] = train[garage_num].fillna(0)

Handling missing values in other Features

In [8]:
# Basement descriptors: a null is replaced by the sentinel label 'None',
# applied to all listed columns in a single vectorised assignment.
basement_columns = ['BsmtFinType2', 'BsmtExposure', 'BsmtCond', 'BsmtFinType1']
train[basement_columns] = train[basement_columns].fillna('None')

In [9]:
# Drop the columns that are almost entirely null (see the missing-value
# counts above). The frame is rebound instead of mutated in place, and
# errors='ignore' keeps the cell re-runnable: once the columns are gone, a
# second execution is a no-op rather than a KeyError.
train = train.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence'], errors='ignore')

In [10]:
# A missing veneer type or fireplace quality is recorded as the label 'None'.
for cat_col in ('MasVnrType', 'FireplaceQu'):
    train[cat_col] = train[cat_col].fillna('None')

In [11]:
# Numeric gaps are imputed with each column's own median.
for num_col in ('LotFrontage', 'MasVnrArea'):
    col_median = train[num_col].median()
    train[num_col] = train[num_col].fillna(col_median)


In [12]:
# Remaining quality columns: nulls become the sentinel label 'None'.
quality_cols = ['GarageQual', 'BsmtQual']
train[quality_cols] = train[quality_cols].fillna('None')


In [13]:
# Impute the electrical system with its most frequent category
# (the first entry of the mode result).
most_common_electrical = train['Electrical'].mode().iloc[0]
train['Electrical'] = train['Electrical'].fillna(most_common_electrical)


Checking again after handling missing values

In [14]:
# Recount nulls after imputation; an empty Series means nothing is left to fill.
missing = train.isna().sum().sort_values(ascending=False)
still_missing = missing[missing > 0]
print(still_missing)


Series([], dtype: int64)


# Feature Engineering

## Convert categorical to numerical (Label Encoding / One-Hot Encoding)

We first encode ordinal quality features such as ExterQual, BsmtQual, and KitchenQual as integers, and then one-hot encode the remaining categorical variables.

In [15]:
from sklearn.preprocessing import LabelEncoder

# Encode the ordinal quality ratings with an explicit rank so the integers
# follow the actual ordering of the scale. A plain alphabetical label encoding
# would assign Ex=0, Fa=1, Gd=2, TA=3, which scrambles the order and makes the
# numbers meaningless as an ordinal feature.
# NOTE(review): the rank below assumes the usual None < Po < Fa < TA < Gd < Ex
# scale from the dataset's data description — confirm against it.
quality_rank = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

cols = ['ExterQual', 'BsmtQual', 'KitchenQual']
for col in cols:
    # Already numeric means this cell has been run before; keep it re-runnable.
    if pd.api.types.is_numeric_dtype(train[col]):
        continue

    ranked = train[col].astype(str).map(quality_rank)

    # Fail loudly on labels outside the scale instead of silently writing NaN.
    unknown = train.loc[ranked.isna(), col].unique().tolist()
    if unknown:
        raise ValueError(f"Unexpected quality labels in {col!r}: {unknown}")

    train[col] = ranked.astype(int)


## Create New Features
Now, let's create a few useful features like total square footage, house age, and whether the house was remodeled.

In [16]:
# Combined living area: basement plus first and second floors.
train['TotalSF'] = train['TotalBsmtSF'].add(train['1stFlrSF']).add(train['2ndFlrSF'])

# Years between construction and sale.
train['HouseAge'] = train['YrSold'].sub(train['YearBuilt'])

# 1 when the remodel year differs from the build year, otherwise 0.
train['Remodeled'] = train['YearBuilt'].ne(train['YearRemodAdd']).astype(int)


## Log Transform Skewed Data
We'll check for any skewed numerical features and apply a log transformation where necessary (using np.log1p).

In [17]:
from scipy.stats import skew

# Skewness above this magnitude marks a column for log transformation.
SKEW_THRESHOLD = 0.75

# Columns that must not be treated as features here: 'Id' is an identifier,
# and 'SalePrice' is the target, which receives its own log transform in the
# "Target Variable Transformation" section. Including it here would transform
# it twice.
non_features = ['Id', 'SalePrice']

# Select by numeric dtype (rather than "not object") so string/category/bool
# columns can never reach skew().
numeric_feats = train.select_dtypes(include=[np.number]).columns.drop(non_features, errors='ignore')
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)

# log1p is only defined for values > -1, so restrict to non-negative columns
# to avoid introducing NaN/-inf.
# NOTE(review): a log transform reduces right skew only; consider dropping
# abs() so left-skewed columns are not transformed.
is_skewed = skewed_feats.abs() > SKEW_THRESHOLD
is_non_negative = train[skewed_feats.index].min() >= 0
skewed = skewed_feats[is_skewed & is_non_negative].index

# Apply log1p (log(1 + x))
train[skewed] = np.log1p(train[skewed])


## One-Hot Encoding for other categorical variables

In [18]:
# One-hot encode the remaining non-numeric columns; numeric columns pass
# through unchanged.
train = pd.get_dummies(train)


## Feature Scaling
We apply Standard Scaling to all numerical features to bring them to the same scale.

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Standardise the numeric feature columns only. 'SalePrice' (the target) and
# 'Id' (an identifier) are excluded. Standardising the target centres it at
# zero, so the `SalePrice > 0` filter in the target-transformation cell would
# discard roughly half the rows, and log1p would be applied to z-scores
# instead of prices.
numeric = train.select_dtypes(include=[np.number]).columns.drop(['Id', 'SalePrice'], errors='ignore')
train[numeric] = scaler.fit_transform(train[numeric])


## Target Variable Transformation
For the target variable (SalePrice), we apply a log transformation to make the distribution more normal.

In [20]:
# Keep only rows with a usable target: non-null and strictly positive
# (shouldn't be needed for this dataset, but cheap to check). Build one mask
# and take an explicit copy so the assignment below writes to an independent
# frame rather than a filtered slice, which avoids SettingWithCopyWarning.
valid_target = train['SalePrice'].notnull() & (train['SalePrice'] > 0)
train = train.loc[valid_target].copy()

# NOTE(review): this assumes SalePrice is still in its original units at this
# point. If an earlier cell has already log-transformed or standardised it,
# the > 0 filter and this log1p would distort the target — verify.
train['SalePrice'] = np.log1p(train['SalePrice'])
