In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Read the csv file

In [0]:
# Load the training data; the relative path is resolved against the current working directory.
train_df = pd.read_csv("Housing_Train.csv")

## Overview of the Data

In [0]:
# Peek at the first rows (head() returns five rows by default).
train_df.head()

### Get Size of the dataset

In [0]:
# Print the frame's dimensions as a (rows, columns) tuple.
print(train_df.shape)

## Missing Data

In [0]:
# Percentage of missing entries (NaN or None) in every column.
missing_share = train_df.isnull().sum() / len(train_df) * 100
# Keep only the columns that actually have gaps, worst offenders first.
has_missing = missing_share != 0
train_df_na = pd.DataFrame(missing_share[has_missing].sort_values(ascending=False))

In [0]:
# Label the frame's single column so it can be referenced by name.
train_df_na.columns=['Missing_Percent']

In [0]:
# Display the per-column missing-data percentages (columns without gaps were filtered out when the table was built).
train_df_na

## Drop any feature with more than 10% missing data; also drop the Id column

In [10]:
# Names of the columns whose missing-data percentage exceeds 10.
over_threshold = train_df_na["Missing_Percent"] > 10
feature_drop = train_df_na.loc[over_threshold].index
feature_drop

Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
       'LotFrontage'],
      dtype='object')

In [0]:
# Drop the sparse features together with the Id column in a single call.
# errors='ignore' keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the columns are already gone.
train_df = train_df.drop(columns=[*feature_drop, 'Id'], errors='ignore')
train_df.shape

In [0]:
# Preview the frame again now that columns have been removed (five rows by default).
train_df.head()

In [0]:
# Dimensions of the frame after the column drops.
train_df.shape

## Impute the rest of the missing data ('None' for categorical features, 0 for numeric features, the mode for Electrical)

In [16]:
# Inspect the distinct values present in MasVnrType.
train_df["MasVnrType"].unique()

array(['BrkFace', 'None', 'Stone', 'BrkCmn'], dtype=object)

In [0]:
# Missing MasVnrType entries are replaced with the literal string "None".
masvnr_type = train_df["MasVnrType"]
train_df["MasVnrType"] = masvnr_type.fillna("None")


In [0]:
# Garage descriptors: a missing entry is recorded as the category 'None'.
garage_cat_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
train_df[garage_cat_cols] = train_df[garage_cat_cols].fillna('None')

In [0]:
# Basement descriptors: a missing entry is recorded as the category 'None'.
bsmt_cat_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
train_df[bsmt_cat_cols] = train_df[bsmt_cat_cols].fillna('None')

In [0]:
# Rows without a MasVnrArea value are given an area of 0.
area_missing = train_df["MasVnrArea"].isna()
train_df.loc[area_missing, "MasVnrArea"] = 0

In [0]:
# Fill gaps in Electrical with the column's most frequent value
# (mode() can return several values on a tie; the first one is used).
most_common_electrical = train_df['Electrical'].mode()[0]
train_df['Electrical'] = train_df['Electrical'].fillna(most_common_electrical)


In [0]:
# Numeric garage features: a missing entry is replaced with 0.
garage_num_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars']
train_df[garage_num_cols] = train_df[garage_num_cols].fillna(0)

In [0]:
# Numeric basement features: a missing entry is replaced with 0.
bsmt_num_cols = [
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
]
train_df[bsmt_num_cols] = train_df[bsmt_num_cols].fillna(0)

## Check for any missing data

In [0]:
# Largest per-column count of missing values; 0 means nothing is left to impute.
train_df.isna().sum().max()

## Adjust Skew

In [0]:
# Distribution of SalePrice before any transformation.
# seaborn is already imported in the notebook's first cell, so the duplicate
# import is dropped. distplot() is deprecated (slated for removal in seaborn
# 0.14); histplot(kde=True, stat="density") is its documented replacement
# and requires seaborn >= 0.11. The trailing ';' hides the Axes repr.
sns.histplot(train_df['SalePrice'], kde=True, stat="density", kde_kws=dict(cut=3));

In [0]:
# Log-transform SalePrice to adjust its skew, then re-plot the distribution.
# NOTE: SalePrice is overwritten in place, so executing this cell more than
# once applies the logarithm repeatedly; re-run from the data-loading cell instead.
train_df['SalePrice'] = np.log(train_df['SalePrice'])
# histplot replaces the deprecated distplot (requires seaborn >= 0.11);
# the trailing ';' hides the Axes repr.
sns.histplot(train_df['SalePrice'], kde=True, stat="density", kde_kws=dict(cut=3));

## Label Encoding

In [0]:
from sklearn.preprocessing import LabelEncoder


In [29]:
# Distinct values of BsmtQual ahead of the label-encoding cell.
train_df['BsmtQual'].unique()

array(['Gd', 'TA', 'Ex', 'None', 'Fa'], dtype=object)

In [0]:
# Replace each BsmtQual category with an integer code.
# NOTE(review): LabelEncoder numbers the classes in sorted label order, which
# is not the quality ranking — confirm that is acceptable for this feature.
bsmt_qual_labels = list(train_df['BsmtQual'].values)
lbl = LabelEncoder().fit(bsmt_qual_labels)
train_df['BsmtQual'] = lbl.transform(bsmt_qual_labels)

In [31]:
# Distinct values of BsmtQual once label-encoded (integer codes).
train_df['BsmtQual'].unique()

array([2, 4, 0, 3, 1])

In [0]:
# Columns to label-encode; each one gets its own freshly fitted encoder.
cols = (
    'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive',
    'Street', 'CentralAir', 'MSSubClass', 'OverallCond',
    'YrSold', 'MoSold',
)
for c in cols:
    # Fit and transform on the same list of values so every label is known.
    column_labels = list(train_df[c].values)
    lbl = LabelEncoder().fit(column_labels)
    train_df[c] = lbl.transform(column_labels)

## Creating the dummy variables

In [0]:
# Preview the one-hot expansion of a single column; the result is only displayed, not stored.
pd.get_dummies(train_df['BsmtQual'])


In [0]:
# One-hot encode the frame and report the resulting dimensions.
train_df = pd.get_dummies(data=train_df)
print(train_df.shape)

## Create Training and Validation Sets

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Keep the SalePrice column aside as the prediction target and display it.
Target_Variables = train_df['SalePrice']
Target_Variables

In [0]:
# Remove the target so that only feature columns remain in the frame.
train_df = train_df.drop(columns=['SalePrice'])

In [0]:
# Hold out 25% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_Valid, Y_train, Y_Valid = train_test_split(
    train_df,
    Target_Variables,
    test_size=0.25,
    random_state=4,
)