In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
# Let pandas print up to 100 rows before truncating displayed frames/series.
pd.set_option('display.max_rows', 100)

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')

The initial step after reading the data into a dataframe is to gain some understanding of what the data is and what it is not. In this case, the included data description file does a fantastic job of breaking down each feature and what the values of each feature mean. We know the dataset includes all the typical information someone might look at initially when reviewing a listing. With 80 features available, there is plenty of information to use when determining what helps us predict sale price.

After reviewing the data description file, it is time to look at the information in the dataframe. 

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [5]:
# Per-column dtype and non-null count; a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [None]:
# Fraction of missing values per column (the mean of a boolean mask is the share of True).
df.isna().mean()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Keep only the numeric columns; correlation is undefined for the string-valued ones.
num_df = df.select_dtypes(include='number')

# Pairwise correlation between every pair of numeric features
# (DataFrame.corr defaults to the Pearson coefficient).
corr_matrix = num_df.corr()

In [None]:
# Annotated heatmap of the full numeric correlation matrix. The large canvas
# keeps the per-cell annotations readable with this many features.
plt.figure(figsize=(30, 30))
# Pin the colour scale to [-1, 1] so colours are comparable across heatmaps.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Heatmap of Numeric Features')
# show() has to be *called*; without the parentheses the cell merely echoes
# the function object as its output.
plt.show()

In [None]:
# Absolute correlation above which a feature pair is considered "highly correlated"
# (an example cut-off, adjust as needed).
threshold = 0.6

# Boolean mask selecting the strict upper triangle (k=1 skips the diagonal), so each
# pair appears once and self-correlations are excluded.
upper_triangle_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
high_corr_pairs = corr_matrix.where(upper_triangle_mask)

# Reshape the masked matrix into long form: one row per (feature, feature) pair.
filtered_pairs = high_corr_pairs.stack().reset_index()
filtered_pairs.columns = ['Feature1', 'Feature2', 'Correlation']

# Keep strong relationships in either direction (positive or negative).
is_strong = filtered_pairs['Correlation'].abs() > threshold
filtered_pairs = filtered_pairs[is_strong]

In [None]:
# Reshape the long list of pairs back into a Feature1 x Feature2 grid so it can be
# drawn as a heatmap; combinations that were filtered out become NaN.
pivot_table = filtered_pairs.pivot(
    index='Feature1',
    columns='Feature2',
    values='Correlation',
)

In [None]:
# Heatmap restricted to the feature pairs that passed the correlation threshold.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    pivot_table,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Heatmap of High Correlation Pairs')
plt.show()


In [None]:
# One histogram per numeric column, drawn on a shared grid of subplots.
# Small tick labels keep the dense grid legible.
num_df.hist(
    figsize=(16, 20),
    bins=50,
    xlabelsize=8,
    ylabelsize=8,
)

In [None]:
# Scatter-matrix of a hand-picked subset of numeric features together with the target.
pairplot_cols = [
    '1stFlrSF',
    'GarageArea',
    'GarageCars',
    'GrLivArea',
    'OverallQual',
    'TotalBsmtSF',
    'SalePrice',
]
sns.pairplot(df[pairplot_cols])

In [3]:
# Categorical view of the data: every column pandas stored with the object dtype.
cat_df = df.select_dtypes(include='object')

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


In [4]:
# Non-null counts for the object-dtype columns; shows which categoricals have missing values.
cat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 43 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSZoning       1460 non-null   object
 1   Street         1460 non-null   object
 2   Alley          91 non-null     object
 3   LotShape       1460 non-null   object
 4   LandContour    1460 non-null   object
 5   Utilities      1460 non-null   object
 6   LotConfig      1460 non-null   object
 7   LandSlope      1460 non-null   object
 8   Neighborhood   1460 non-null   object
 9   Condition1     1460 non-null   object
 10  Condition2     1460 non-null   object
 11  BldgType       1460 non-null   object
 12  HouseStyle     1460 non-null   object
 13  RoofStyle      1460 non-null   object
 14  RoofMatl       1460 non-null   object
 15  Exterior1st    1460 non-null   object
 16  Exterior2nd    1460 non-null   object
 17  MasVnrType     588 non-null    object
 18  ExterQual      1460 non-null

Some of these columns are ordinal and should be treated as such: ExterQual, ExterCond, BsmtQual, BsmtCond, HeatingQC, KitchenQual, FireplaceQu, GarageQual, and GarageCond. We can separate these columns and encode them with the OrdinalEncoder. The other categorical columns can be encoded using a target encoder.

In [8]:
# Quality/condition columns that carry a natural ordering, plus 'Id' as a row key.
ordcat_cols = [
    'Id',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu',
    'GarageQual',
    'GarageCond',
]
ordcat_df = df.loc[:, ordcat_cols]
ordcat_df

Unnamed: 0,Id,ExterQual,ExterCond,BsmtQual,BsmtCond,HeatingQC,KitchenQual,FireplaceQu,GarageQual,GarageCond
0,1,Gd,TA,Gd,TA,Ex,Gd,,TA,TA
1,2,TA,TA,Gd,TA,Ex,TA,TA,TA,TA
2,3,Gd,TA,Gd,TA,Ex,Gd,TA,TA,TA
3,4,TA,TA,TA,Gd,Gd,Gd,Gd,TA,TA
4,5,Gd,TA,Gd,TA,Ex,Gd,TA,TA,TA
...,...,...,...,...,...,...,...,...,...,...
1455,1456,TA,TA,Gd,TA,Ex,TA,TA,TA,TA
1456,1457,TA,TA,Gd,TA,TA,TA,TA,TA,TA
1457,1458,Ex,Gd,TA,Gd,Ex,Gd,Gd,TA,TA
1458,1459,TA,TA,TA,TA,Gd,Gd,,TA,TA


In [11]:
# Grades ordered worst -> best so the encoded integers preserve the ranking.
# With the default categories='auto' the encoder sorts labels alphabetically
# (Ex, Fa, Gd, Po, TA), which scrambles the quality order.
# NOTE(review): scale assumed to be Po < Fa < TA < Gd < Ex per the data
# description file -- confirm. Any other label makes fit() raise, which is a
# useful sanity check on that assumption.
# np.nan is listed last (scikit-learn requires NaN to be the final manual
# category) so missing grades pass through as NaN instead of failing the fit.
quality_levels = ['Po', 'Fa', 'TA', 'Gd', 'Ex', np.nan]

# 'Id' is a row identifier, not an ordinal feature, so it is excluded from encoding.
# Call transform() on the same quality columns the encoder was fitted on.
quality_cols = [col for col in ordcat_df.columns if col != 'Id']

ordenc = OrdinalEncoder(categories=[quality_levels] * len(quality_cols))
ordenc.fit(ordcat_df[quality_cols])