### More about handling null values: see [this Kaggle guide](https://www.kaggle.com/code/parulpandey/a-guide-to-handling-missing-values-in-python/notebook)

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Lift pandas' display truncation: a value of None removes the limit on the
# number of columns shown, the number of rows shown, and the width of a cell.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [4]:
# Read the training set (indexed by 'Id') and discard the rows whose target
# 'SalePrice' is missing.
X = pd.read_csv('data/train.csv', index_col='Id').dropna(axis=0, subset=['SalePrice'])
# Target variable y
y = X['SalePrice']
# Input variables: everything except the target
X = X.drop(columns=['SalePrice'])

In [5]:
# total count and percentage of missing values per column
def missingValuesInfo(df):
    """Summarise the missing values of ``df``, one row per affected column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with a 'Total' column (number of null
        entries) and a 'Percent' column (nulls as a percentage of the number
        of rows, rounded to 2 decimals), sorted by 'Total' in descending
        order. Columns that contain no nulls are left out.
    """
    # Count and sort once; the percentage is derived from the same Series, so
    # both columns share one index and line up in pd.concat.
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (total / len(df) * 100).round(2)
    summary = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return summary.loc[summary['Total'] > 0]

# Show the count and percentage of missing values for the columns of X that have any
missingValuesInfo(X)

Unnamed: 0,Total,Percent
PoolQC,1453,99.52
MiscFeature,1406,96.3
Alley,1369,93.77
Fence,1179,80.75
FireplaceQu,690,47.26
LotFrontage,259,17.74
GarageType,81,5.55
GarageYrBlt,81,5.55
GarageQual,81,5.55
GarageCond,81,5.55


### Dropping columns

In [None]:
# Drop the 'Utilities' column.
# NOTE(review): the reason is not stated in this notebook - presumably the
# column carries too little information to be useful; confirm.
# X is rebound instead of mutated in place, and errors='ignore' makes the cell
# safe to re-run: an in-place drop raises KeyError on a second run because the
# column is already gone.
X = X.drop(columns=['Utilities'], errors='ignore')

### Simple imputing

In [None]:
# PoolQC and Alley: the data description says NA means "No Pool" and
# "no alley access" respectively, so a gap is a category of its own.
for absent_feature in ("PoolQC", "Alley"):
    X[absent_feature] = X[absent_feature].fillna("None")

# MSZoning (the general zoning classification): fill gaps with the most
# frequent value ('RL' is by far the most common one).
most_common_zoning = X['MSZoning'].mode()[0]
X['MSZoning'] = X['MSZoning'].fillna(most_common_zoning)


In [None]:
# GarageYrBlt, GarageArea and GarageCars: missing values become 0
# (no garage = no cars in such garage).
garage_numeric_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars']
X[garage_numeric_cols] = X[garage_numeric_cols].fillna(0)

### More advanced imputing using `groupby`

In [None]:
# Fill each missing LotFrontage with the median LotFrontage of the house's
# neighborhood. The built-in 'median' transform broadcasts the group median to
# every row, and fillna uses it only where LotFrontage is null, so existing
# values are never overwritten - including on rows whose Neighborhood is
# itself missing, for which a group transform has no result.
neighborhood_median = X.groupby("Neighborhood")["LotFrontage"].transform("median")
X["LotFrontage"] = X["LotFrontage"].fillna(neighborhood_median)

### Feature engineering

- Transforming some numerical variables that are really categorical into categorical ones
- Label-encoding some categorical variables whose ordering may carry information
- Adding new features


In [None]:
# Numeric columns that are really categories are cast to strings:
#   MSSubClass      - the building class
#   OverallCond     - changed into a categorical variable
#   YrSold, MoSold  - year and month sold
# A single astype(str) is used for all of them so every column is converted
# the same way.
for col in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold'):
    X[col] = X[col].astype(str)