In [None]:
# Loading necessary packages:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

#

from scipy import stats
from scipy.stats import skew, boxcox_normmax, norm
from scipy.special import boxcox1p

#

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
# experiment class
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


#

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator

#

import warnings
# Let pandas render up to 250 columns / 250 rows before truncating a displayed frame.
pd.options.display.max_columns = 250
pd.options.display.max_rows = 250
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas/seaborn/scikit-learn.
warnings.filterwarnings('ignore')
# Use matplotlib's 'fivethirtyeight' style sheet for the plots that follow.
plt.style.use('fivethirtyeight')

# Meeting the data
We're going to start by loading the data and taking a first look at it, as usual. For the column names, there is a great data dictionary file in the dataset location, so we can get familiar with them in no time.

In [None]:
# Read the competition's train and test CSV files into DataFrames.

train_csv = '../input/house-prices-advanced-regression-techniques/train.csv'
test_csv = '../input/house-prices-advanced-regression-techniques/test.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Show the training frame as loaded (rendering is truncated per the display options).
train

In [None]:
# Show the test frame as loaded (rendering is truncated per the display options).
test

In [None]:
# Summary statistics for the training frame.
train.describe()

* The Id column looks useless, so we can safely drop it from both datasets. I'm going to save our target (SalePrice) in a separate variable so we can use it later.

In [None]:
# Dropping unnecessary Id column.
#
# The frames are rebound instead of mutated with inplace=True, and
# errors='ignore' is passed, so that re-running this cell is a no-op rather
# than a KeyError once 'Id' is already gone.

train = train.drop(columns='Id', errors='ignore')
test = test.drop(columns='Id', errors='ignore')

In [None]:
# Keep the target (SalePrice) aside and build the feature-only frames.
# `train` itself is left untouched, so it still carries SalePrice.

y = train['SalePrice'].reset_index(drop=True)
train_features = train.drop(columns=['SalePrice'])
test_features = test

# Analysis Time!
OK, the short inspection at the beginning gives us some hints about how we should move on from here. I'm going to work on the data while analysing it at the same time. This way I hope we can get the data into better shape while digging deeper into it.

We're going to start with a basic correlation table here. I masked the upper triangle since it's just a mirror of the lower part. With this table we can understand some of the linear relations between different features.

### Observations:
* There's a strong relation between the overall quality of the houses and their sale prices.
* Again, above-grade living area seems to be a strong indicator of sale price.
* Garage features, number of baths and rooms, how old the building is, etc. also have an effect on the price at various levels.
* There are some obvious relations we're going to skip, like total square feet affecting how many rooms there are, or how many cars can fit into a garage vs. garage area, etc.
* The overall condition of the house seems less important for pricing; that's interesting and worth digging into.

In [None]:
# Display numerical correlations (pearson) between features on heatmap.

sns.set(font_scale=1.1)

# Correlate numeric columns only: recent pandas versions raise when
# DataFrame.corr() meets object (string) columns.
correlation_train = train.select_dtypes(include='number').corr()

# Boolean mask that hides the upper triangle (diagonal included), since the
# matrix is symmetric. Built from the matrix *shape* rather than from its
# values, so a zero correlation can never accidentally unmask a cell.
mask = np.triu(np.ones_like(correlation_train, dtype=bool))

plt.figure(figsize=(20, 20))
sns.heatmap(correlation_train,
            annot=True,
            fmt='.1f',
            cmap='coolwarm',
            square=True,
            mask=mask,
            linewidths=1,
            cbar=False)

plt.show()

# Delete variables that are no longer needed.
del correlation_train, mask


* **I'm going to merge the datasets here before we start editing it so we don't have to do these operations twice. Let's call it features since it has features only. So our data has 2919 observations and 79 features to begin with...**

In [None]:
# Stack train and test features into one frame (fresh 0..n-1 index) so each
# engineering step only has to be written once.

features = pd.concat([train_features, test_features], ignore_index=True)
print(features.shape)

# Missing Data
Alright, first of all we need to detect missing values, then we need to get rid of them for the next steps of our work. So let's list our missing values and visualize them:

In [None]:
def missing_percentage(df):
    """Summarise the missing values of ``df`` per column.

    Returns a DataFrame indexed by column name with two columns, 'Total'
    (count of nulls) and 'Percent' (share of rows that are null, 0-100),
    sorted by count descending. Columns without any null are left out.
    """
    null_counts = df.isnull().sum().sort_values(ascending=False)
    null_share = null_counts / len(df) * 100

    summary = pd.concat([null_counts, null_share],
                        axis=1,
                        keys=['Total', 'Percent'])

    # Keep only the columns that actually contain missing values.
    return summary[null_counts != 0]

* **That's quite a lot! No need to panic though, we've got this. If you look at the data description given to us, we can see that most of these missing values are not actually missing; they just mean the house doesn't have that specific feature. We can fix that easily...**

In [None]:
# Bar chart of the per-column missing percentage, followed by the same
# numbers as a colour-graded table.

missing_summary = missing_percentage(features)

fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(data=missing_summary,
            x=missing_summary.index,
            y='Percent',
            palette='Reds_r',
            ax=ax)
plt.xticks(rotation=90)

display(missing_summary.T.style.background_gradient(cmap='Reds', axis=1))

# The summary is only needed for this cell.
del missing_summary

# OK, this is how we're going to fix most of the missing data:
1. First, we fill the NaNs in the columns where they mean 'None', replacing them with that value.
2. Then we fill the numerical columns where a missing value indicates there is no parent feature to measure, so we replace them with 0.
3. Even after these, there is some actually missing data; by checking the general trends of these features we can fill them with the most frequent value (the mode).
4. MSZoning is a little bit tricky. I chose to fill it with the most common type for the related MSSubClass. It's not perfect, but at least it decreases randomness a little bit. (Note: in the code below, MSZoning is currently filled with its overall mode together with the other columns from step 3.)
5. Finally, we fill LotFrontage with a similar approach. (Note: the code below currently uses scikit-learn's `IterativeImputer` for this column.)

In [None]:

# Columns stored as numbers that are really category codes: cast to string.

code_like_cols = ['MSSubClass', 'YrSold', 'MoSold']
features[code_like_cols] = features[code_like_cols].astype(str)


# Columns in which NaN is filled with the constant 'none'.

none_cols = [
    'Alley', 'PoolQC', 'MiscFeature', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
]

# Columns in which NaN is filled with 0.

zero_cols = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'GarageYrBlt', 'GarageArea', 'GarageCars',
    'MasVnrArea',
]

# Columns whose NaNs are genuinely missing and get the most frequent value.

most_cols = [
    'Electrical', 'Exterior1st', 'Exterior2nd', 'Functional',
    'KitchenQual', 'SaleType', 'Utilities', 'MSZoning',
]

# Columns handed to the multivariate (iterative) imputer.
# Reference: https://scikit-learn.org/stable/modules/impute.html#univariate-feature-imputation
# 'MSZoning' was considered for this group but is handled by most_cols above.

regg_cols = ['LotFrontage']


In [None]:
# Imputation pipeline: one imputer per column group; every column that is
# not listed in a group is passed through unchanged.
# NOTE(review): regg_cols holds a single column, so IterativeImputer has no
# other features to regress on here — confirm this is the intended behaviour.

imputation_steps = [
    # constant fills
    ('none_imputer', SimpleImputer(strategy='constant', fill_value='none'), none_cols),
    ('zero_imputer', SimpleImputer(strategy='constant', fill_value=0), zero_cols),
    # mode fill
    ('most_imputer', SimpleImputer(strategy='most_frequent'), most_cols),
    # experimental multivariate imputation
    ('regg_features', IterativeImputer(max_iter=10, random_state=0), regg_cols),
]

missing_value_preprocessor = ColumnTransformer(
    transformers=imputation_steps,
    remainder='passthrough',
)

In [None]:
# ColumnTransformer does NOT keep the input column order: its output holds the
# columns of each transformer in the order the transformers are listed,
# followed by the passthrough remainder (in input order). Labelling the result
# with `features.columns` directly would attach column names to the wrong data,
# so the output order is rebuilt explicitly and then restored to the original.
original_columns = list(features.columns)
imputed_cols = none_cols + zero_cols + most_cols + regg_cols
passthrough_cols = [col for col in original_columns if col not in imputed_cols]

features = pd.DataFrame(
    missing_value_preprocessor.fit_transform(features),
    columns=imputed_cols + passthrough_cols,
)[original_columns]

# Feature Engineering
Ok this is the part where we dig deeper into our completed dataset. There are no missing values so we're good to go!


In [None]:
def show_box(y, df):
    '''Draw one boxplot of ``y`` for every object-dtype column of ``df``.

    The plots are laid out on a three-column grid with as many rows as
    needed, so no categorical column is left out. Within each plot the
    categories are ordered by their median ``y`` value, highest first.

    Parameters
    ----------
    y : str
        Name of the column of ``df`` plotted on the y-axis.
    df : pandas.DataFrame
        Frame containing ``y`` and the categorical columns.

    Returns
    -------
    None
    '''
    cat_cols = df.select_dtypes(include=['object']).columns
    if len(cat_cols) == 0:
        # Nothing to draw; a grid with zero rows cannot be created.
        return

    n_cols = 3
    # Ceiling division: enough rows to give every column its own axes.
    n_rows = (len(cat_cols) + n_cols - 1) // n_cols

    # Same per-row height as the previous fixed 14-row, 25x80 inch layout.
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows,
                             n_cols,
                             figsize=(25, n_rows * 80 / 14),
                             squeeze=False)
    axes = axes.flatten()

    for col, ax in zip(cat_cols, axes):

        sortd = df.groupby(col)[y].median().sort_values(ascending=False)
        sns.boxplot(x=col,
                    y=y,
                    data=df,
                    palette='plasma',
                    order=sortd.index,
                    ax=ax)
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=18))

    # Hide the unused cells at the end of the grid.
    for ax in axes[len(cat_cols):]:
        ax.set_visible(False)

    # Layout only needs to be computed once, after all plots are drawn.
    plt.tight_layout()



# Categorical Data
We already checked some of the numerical features with the correlation heatmap, but what about categorical values? We want to see the relations between categorical data and sale price. Boxplots seem like a decent way to inspect this type of relation. We're also going to sort them by the median value of each group so we can see their importance in descending order.

In [None]:
# Displaying sale prices vs. categorical values:
# `train` is passed (not `features`) because it still carries the SalePrice column.

show_box('SalePrice', train)

# Numeric Data
There are many numeric features to inspect; one of the best ways to see how they affect sale prices is scatter plots. We're also plotting polynomial regression lines to see the general trend. This way we can understand the numerical values and their importance for sale price, and it's also really helpful for spotting outliers.

### Observations:
* OverallQual: It's clearly visible that the sale price of a house increases with overall quality. This confirms the correlation in the first table we made at the beginning. (Pearson corr was 0.8)

* OverallCondition: It looks like overall condition is left skewed, with most of the houses around 5/10 condition. But it doesn't affect the price like the quality indicator does...

* YearBuilt: Again, new buildings are generally more expensive than the old ones.

* Basement: The general picture shows that bigger basements increase the price, but I see some outliers there...

* GrLivArea: This feature is pretty linear, but we can spot two outliers affecting this trend. There are some houses with a huge area and pretty cheap prices; there might be some reason behind it, but we'd better drop them.

* SaleDates: They seem pretty unimportant for sale prices, so we can drop them...

In [None]:
# Plotting numerical features with polynomial order to detect outliers by eye.

def show_reg(y, df):
    """Scatter every numeric column of ``df`` against ``y`` with a cubic fit.

    The plots are laid out on a three-column grid with as many rows as
    needed, so no numeric column is left out. The target column ``y`` itself
    is skipped, since plotting it against itself carries no information.

    Parameters
    ----------
    y : str
        Name of the column of ``df`` plotted on the y-axis.
    df : pandas.DataFrame
        Frame containing ``y`` and the numeric columns.

    Returns
    -------
    None
    """
    num_cols = [col
                for col in df.select_dtypes(include=['number']).columns
                if col != y]
    if not num_cols:
        # Nothing to draw; a grid with zero rows cannot be created.
        return

    n_cols = 3
    # Ceiling division: enough rows to give every column its own axes.
    n_rows = (len(num_cols) + n_cols - 1) // n_cols

    # Same per-row height as the previous fixed 12-row, 25x80 inch layout.
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows,
                             n_cols,
                             figsize=(25, n_rows * 80 / 12),
                             squeeze=False)
    axes = axes.flatten()

    for col, ax in zip(num_cols, axes):

        sns.regplot(x=col,
                    y=y,
                    data=df,
                    ax=ax,
                    order=3,
                    ci=None,
                    color='#e74c3c',
                    line_kws={'color': 'black'},
                    scatter_kws={'alpha':0.4})
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=10))

    # Hide the unused cells at the end of the grid.
    for ax in axes[len(num_cols):]:
        ax.set_visible(False)

    # Layout only needs to be computed once, after all plots are drawn.
    plt.tight_layout()

# Plot the numeric columns of `train` against SalePrice.
show_reg('SalePrice', train)

In [None]:
# Let pandas pick the best-fitting dtype for each column of `features`.
features = features.convert_dtypes()
# NOTE(review): the result of this expression is discarded — a cell only
# displays its last expression. Wrap it in display(...) if it was meant to be shown.
features.select_dtypes(exclude=['string'])
# NOTE(review): this displays `train`, not `features` — confirm which was intended.
train

In [None]:
# NOTE(review): the two commented-out lines below look like leftover scratch
# inspection code; consider removing them.
#features.select_dtypes(include=['object'])
#features.dtypes
# NOTE(review): imports normally live in the first cell of the notebook.
from sklearn import set_config
# Switch estimator display to the diagram representation.
set_config(display='diagram')   
# displays HTML representation in a jupyter context
missing_value_preprocessor
