In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression

In [None]:
# Read the training data from train.csv in the working directory; the cell
# output is its (rows, columns) shape.
df = pd.read_csv('train.csv')
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

*Separating Categorical from Continuous Variables*

In [None]:
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')
print (df_cat.shape, df_num.shape)

In [None]:
df_cat.columns

In [None]:
#Checking correlation between numerical variables and SalePrice
df_num.corr()['SalePrice'].sort_values(ascending=False)

In [None]:
sns.scatterplot(data=df_num, x='OverallQual', y='SalePrice')

In [None]:
# Distribution of the target. sns.distplot is deprecated in seaborn, so use
# histplot instead; kde=True with stat='density' keeps the same
# histogram-plus-density-curve view that distplot produced.
sns.histplot(df['SalePrice'], kde=True, stat='density')

In [None]:
# (rows, columns) before the SalePrice outlier filter in the following cells.
df.shape

In [None]:
# Find outliers in SalePrice using IQR fences. The goal stated for this step is
# that outliers should not exceed 2% of the total dataset, hence a multiplier
# wider than the conventional 1.5.
IQR_MULTIPLIER = 2.5
Q1 = df['SalePrice'].quantile(0.25)
Q3 = df['SalePrice'].quantile(0.75)
IQR = Q3 - Q1
Out_Upper = Q3 + IQR_MULTIPLIER * IQR
Out_Lower = Q1 - IQR_MULTIPLIER * IQR
# Misspelled legacy name, kept as an alias because the filtering cell that
# follows still reads it.
Qut_Lower = Out_Lower
# The filtering cell keeps rows strictly inside the fences, so rows sitting
# exactly on a fence are dropped too; count with >= / <= so the reported number
# matches what is actually removed.
n_outliers = ((df['SalePrice'] >= Out_Upper) | (df['SalePrice'] <= Out_Lower)).sum()
print (f"We are dropping {n_outliers} number of records out of {df.shape[0]}")

In [None]:
# Keep only the rows whose SalePrice lies strictly between the two fences.
above_lower = df['SalePrice'] > Qut_Lower
below_upper = df['SalePrice'] < Out_Upper
df = df[below_upper & above_lower]
df.shape

In [None]:
sns.scatterplot(data=df, x='OverallQual', y='SalePrice')

In [None]:
drop_index = df[(df['SalePrice'] < 200000) & (df['OverallQual'] == 10)].index
df.drop(drop_index, inplace=True)
df.shape

In [None]:
sns.scatterplot(data=df, x='GrLivArea', y='SalePrice')

In [None]:
sns.scatterplot(x = df['GarageArea'], y = df['SalePrice'])

In [None]:
drop_index = (df[(df['GarageArea'] > 1200) & (df['SalePrice'] < 200000)]).index

In [None]:
df = df.drop(drop_index, axis=0)

In [None]:
df.shape

In [None]:
# FullBath against the target.
sns.scatterplot(data=df, x='FullBath', y='SalePrice')

### Categorical features that appear to be related to SalePrice, based on boxplots

In [None]:
# SalePrice distribution within each MSZoning category.
sns.boxplot(data=df, x='MSZoning', y='SalePrice')

***Dealing with Missing Data***

In [None]:
#Finding missing data
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Check missing data
df.isnull().sum().sort_values(ascending=False).head(19)

In [None]:
df['PoolQC'].value_counts()

In [None]:
sns.boxplot(x = df['PoolQC'], y = df['SalePrice'])

In [None]:
# Checked documentation to see that NaN in PoolQC means no pool.
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['PoolQC']: that is chained assignment through an inplace method, which
# pandas warns about and which leaves df unchanged under Copy-on-Write.
df['PoolQC'] = df['PoolQC'].fillna('No Pool')

In [None]:
df.drop('MiscFeature', axis='columns', inplace=True)

In [None]:
sns.boxplot(x = df['Alley'], y = df['SalePrice'])

In [None]:
df['Alley'].value_counts()

***Choosing to replace NA in Alley, since this feature seems to have a positive linear correlation with SalePrice***

In [None]:
# Recode missing Alley values as an explicit 'No Alley' category.
# Assign the result back rather than using fillna(inplace=True) on the selected
# column: the chained inplace form triggers a pandas warning and does not
# modify df under Copy-on-Write.
df['Alley'] = df['Alley'].fillna('No Alley')
df['Alley'].value_counts()

In [None]:
df['Fence'].value_counts()

In [None]:
sns.boxplot(x = df['Fence'], y = df['SalePrice'])

In [None]:
# Dropping the ID column since it does not contain any useful information
df.drop('Id', inplace=True, axis = 'columns')

In [None]:
sorted(df.columns.values)

In [None]:
sns.boxplot(x = df['Utilities'], y = df['SalePrice'])  

In [None]:
# Print the category counts explicitly: a notebook cell only displays its last
# expression, so a bare value_counts() ahead of the drop was never shown.
print(df['Utilities'].value_counts())
# Utilities does not seem to be a predictor for SalePrice. Hence, we can drop it
df.drop('Utilities', axis='columns', inplace=True)

In [None]:
sns.boxplot(x = df['Condition2'], y = df['SalePrice'])  

In [None]:
df['Condition2'].value_counts()
# Taking a call to drop Condition2 as it does not seem to be a predictor for SalePrice

In [None]:
df.drop('Condition2', axis='columns', inplace=True)

In [None]:
df['RoofMatl'].value_counts()

In [None]:
# Dropping RoofMatl as it does not seem to be a predictor for SalePrice and is centred around one value
df.drop('RoofMatl', axis='columns', inplace=True)

In [None]:
#Boxplot for Exterior1st, make categories on x axis readable
plt.figure(figsize=(20, 10))
sns.boxplot(x = df['Exterior1st'], y = df['SalePrice'])

In [None]:
df['Exterior1st'].value_counts()

In [None]:
# Replace BrkComm, Stone, AsphShn, CBlock and ImStucc with a single 'Other' level.
# The result is assigned back: replace(inplace=True) on df['Exterior1st'] is
# chained assignment, which pandas warns about and which does not update df
# under Copy-on-Write.
df['Exterior1st'] = df['Exterior1st'].replace(
    ['BrkComm', 'Stone', 'AsphShn', 'CBlock', 'ImStucc'], 'Other'
)
df['Exterior1st'].value_counts()

In [None]:
plt.figure(figsize=(20, 10))
sns.boxplot(x = df['Exterior2nd'], y = df['SalePrice'])

In [None]:
df['Exterior2nd'].value_counts()

In [None]:
# Replace ImStucc, Brk Cmn, Stone, AsphShn and CBlock with 'Other' (values that
# are already 'Other' need no replacement).
# The result is assigned back: replace(inplace=True) on df['Exterior2nd'] is
# chained assignment, which pandas warns about and which does not update df
# under Copy-on-Write.
df['Exterior2nd'] = df['Exterior2nd'].replace(
    ['ImStucc', 'Brk Cmn', 'Stone', 'AsphShn', 'CBlock'], 'Other'
)
df['Exterior2nd'].value_counts()

In [None]:
sns.boxplot(x = df['MasVnrType'], y = df['SalePrice'])

In [None]:
df['MasVnrType'].value_counts()

In [None]:
# Replace missing values in MasVnrType with BrkFace as it is the most common value.
# NOTE(review): a later cell fills missing MasVnrArea with 0, which sits oddly
# with imputing an actual veneer type here - confirm the intended category.
# Assign the result back: fillna(inplace=True) on df['MasVnrType'] is chained
# assignment, which pandas warns about and which does not update df under
# Copy-on-Write.
df['MasVnrType'] = df['MasVnrType'].fillna('BrkFace')

In [None]:
sns.boxplot(x = df['ExterCond'], y = df['SalePrice'])

In [None]:
df['ExterCond'].value_counts()

In [None]:
# Fold the 'Po' and 'Ex' levels of ExterCond into 'Gd'.
# NOTE(review): 'Po' presumably means "poor"; merging it into 'Gd' rather than
# a lower level looks unintended - confirm against the data dictionary.
# Assign the result back: replace(inplace=True) on df['ExterCond'] is chained
# assignment, which pandas warns about and which does not update df under
# Copy-on-Write.
df['ExterCond'] = df['ExterCond'].replace(['Po', 'Ex'], 'Gd')
df['ExterCond'].value_counts()

In [None]:
sns.boxplot(x = df['BsmtQual'], y = df['SalePrice'], order=['Fa', 'TA', 'Gd', 'Ex'])

In [None]:
df['BsmtQual'].value_counts()

In [None]:
sns.boxplot(x = df['Street'], y = df['SalePrice'])

In [None]:
df.drop('Street', axis='columns', inplace=True)

In [None]:
df['MasVnrType'].value_counts()

In [None]:
df.isnull().sum().sort_values(ascending=False).head(5) * 100/ df.shape[0]

In [None]:
sns.boxplot(x = df['FireplaceQu'], y = df['SalePrice'])  

In [None]:
df['Fence'].fillna('No Fence', inplace=True)

In [None]:
sns.boxplot(x=df['Fence'], y = df['SalePrice'])

In [None]:
df['FireplaceQu'].fillna('No Fireplace', inplace=True)
sns.boxplot(x = df['FireplaceQu'], y = df['SalePrice'])  

In [None]:
sns.scatterplot(x = df['LotFrontage'], y = df['SalePrice'])

In [None]:
df[df['LotFrontage'].isnull()]
#Explore

In [None]:
sns.displot(df['LotFrontage'])

In [None]:
# Impute missing LotFrontage with the column mean.
# Assign the result back: fillna(inplace=True) on df['LotFrontage'] is chained
# assignment, which pandas warns about and which does not update df under
# Copy-on-Write.
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())

In [None]:
# Recode missing values in the garage descriptor columns as an explicit
# 'No Garage' category. The original listed GarageCond and GarageType twice and
# used fillna(inplace=True) on each selected column, which is chained
# assignment (pandas warns, and df is not updated under Copy-on-Write); fill
# the columns once and assign the result back.
garage_category_cols = ['GarageFinish', 'GarageCond', 'GarageType', 'GarageQual']
df[garage_category_cols] = df[garage_category_cols].fillna('No Garage')
# GarageYrBlt is filled with 0 instead of the string 'No Garage' so the column
# is not turned into a mix of numbers and strings.
# NOTE(review): presumably this is a numeric year column - confirm, and confirm
# that 0 is an acceptable sentinel for the downstream model.
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)

In [None]:
sns.boxplot(x=df['Electrical'], y = df['SalePrice'])

In [None]:
df.dropna(subset=['Electrical'], inplace=True, axis = 'rows')

In [None]:
sns.scatterplot(x=df['MasVnrArea'], y = df['SalePrice'])

In [None]:
sns.histplot(x = df['MasVnrArea'], bins = 5)

In [None]:
# Treat a missing MasVnrArea as zero area.
# Assign the result back: fillna(inplace=True) on df['MasVnrArea'] is chained
# assignment, which pandas warns about and which does not update df under
# Copy-on-Write.
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

In [None]:
df[df['BsmtFinType1'].isnull()]

In [None]:
df.isnull().sum().sort_values(ascending=False).head(4) * 100/ df.shape[0]

In [None]:
#Set value for BsmtFinType2 to Unf for index 332
df.loc[332, 'BsmtFinType2'] = 'Unf'

In [None]:
df['BsmtFinType2'].value_counts()

In [None]:
df['BsmtFinType2'].fillna('No Basement', inplace=True)
df['BsmtExposure'].fillna('No Basement', inplace=True)
df['BsmtQual'].fillna('No Basement', inplace=True)
df['BsmtCond'].fillna('No Basement', inplace=True)
df['BsmtFinType1'].fillna('No Basement', inplace=True)

In [None]:
df.shape

***Dropping features based on predictive power***

In [None]:
sns.regplot(x = df['YrSold'], y = df['SalePrice'])
df.drop('YrSold', axis='columns', inplace=True)

In [None]:
sns.regplot(x = df['MoSold'], y = df['SalePrice'])
df.drop('MoSold', inplace=True, axis = 'columns')

In [None]:
sns.regplot(x = df['MiscVal'], y = df['SalePrice'])
#Dropping MiscVal since it does not seem to have strong predictive power
df.drop('MiscVal', inplace=True, axis='columns')

In [None]:
sns.regplot(x=df['BsmtFinSF2'], y = df['SalePrice'])
#Since BsmtFinSF2 does not seem to have any predictive value, dropping the column
df.drop('BsmtFinSF2', inplace=True, axis = 'columns')
df.shape

***Standardisation***

In [None]:
# df_num = df.select_dtypes(exclude='object')
# df_num.columns.values

In [None]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# df[df_num.columns.values] = scaler.fit_transform(df[df_num.columns.values])
# df