In [None]:
# importing baseline packages
import os

# importing data processing packages
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# importing machine learning packages
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# importing visualisation libraries
import matplotlib.pyplot as plt
# Apply the project's custom matplotlib style sheet. The path is relative, so it
# is resolved against the kernel's current working directory.
plt.style.use('../conf/custom_mpl_stylesheet.mplstyle')

# import custom classes
from custom_utils import DataPrepUtil

In [None]:
# Columns that are not model features: the row identifier and the target.
drop_cols = ['Id', 'SalePrice']

# Load the raw competition files (paths are relative to the kernel's working directory).
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Hold out 20% of the training rows for validation. The feature matrix is built
# from `drop_cols` (rather than a repeated literal list) so the columns excluded
# here always match the ones excluded from the column-type lists further down.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=drop_cols),
    train['SalePrice'],
    test_size=0.2,
    random_state=43,  # fixed seed so the split is reproducible
)

In [None]:
# Instantiate the helper class imported from the local `custom_utils` module;
# its plotting methods are called in later cells.
datautil = DataPrepUtil()

In [None]:
# Names of the columns that contain at least one missing value, in frame order.
has_missing = train.isna().any()
missing_val_cols = train.columns[has_missing].tolist()

# Show only those columns.
train[missing_val_cols]

In [None]:
# Compare the mean and the median SalePrice per month of sale, side by side.
# The groupby is built once and each aggregate is computed once (the earlier
# version recomputed every aggregate twice: once for x, once for the heights).
price_by_month = train.groupby('MoSold')['SalePrice']
panels = [
    ('Mean', price_by_month.mean()),
    ('Median', price_by_month.median()),
]

fig, axes = plt.subplots(1, 2, figsize=(8, 4), dpi=100)
for ax, (stat_name, monthly_price) in zip(axes, panels):
    ax.bar(monthly_price.index, monthly_price, color='turquoise', width=0.7, alpha=0.75)
    ax.set_xlabel('MoSold')
    ax.set_ylabel('SalePrice')
    ax.set_title(f'{stat_name} sale price per month sold')

plt.tight_layout()
plt.show()

In [None]:
# Plot for the columns that have missing values, rendered at 130 dpi.
# NOTE(review): judging by its name the helper shows the percentage of filled
# (non-missing) values per column — confirm against custom_utils.
datautil.plot_filled_values_percent(train[missing_val_cols], dpi=130)

In [None]:
# Exclusive upper bound on the number of distinct values for a numeric column
# to be treated as discrete.
# NOTE(review): the previous comment said "<20" while the code compared against
# 25; the code's value is kept here — confirm which threshold was intended.
DISCRETE_MAX_UNIQUE = 25

# categorical variables: columns stored with the object dtype
cat_cols = [var for var in train.columns if train[var].dtype == 'O']

# non-categorical variables: every remaining column. This is computed on
# `train`, so non-object columns named in `drop_cols` are still included here.
non_cat_cols = [var for var in train.columns if train[var].dtype != 'O']

# variables holding date information, recognised by 'Yr' or 'Year' in the name
dt_cols = [var for var in train.columns if 'Yr' in var or 'Year' in var]

# discrete numerical variables: non-categorical, fewer than DISCRETE_MAX_UNIQUE
# distinct values, and not a date column
disc_cols = [
    var for var in non_cat_cols
    if train[var].nunique() < DISCRETE_MAX_UNIQUE and var not in dt_cols
]

# remaining numerical variables: neither discrete nor listed in `drop_cols`
# (non-object date columns are not filtered out and therefore stay in this list)
num_cols = [var for var in non_cat_cols if var not in disc_cols and var not in drop_cols]

In [None]:
# Sort the discrete-column names in place, in ascending order.
disc_cols.sort()

In [None]:
# Number of distinct values in each categorical column.
cardinality = train.loc[:, cat_cols].nunique()

# Count of non-null entries in the first categorical column.
train.loc[:, cardinality.index[0]].count()

In [None]:
# Plot for the categorical columns, rendered at 300 dpi.
# NOTE(review): judging by its name the helper visualises the number of distinct
# categories per column — confirm against custom_utils.
datautil.plot_cat_col_cardinality(train[cat_cols], dpi=300)

## Data Imputation

In [None]:
# Treat the discrete numeric columns as categorical by casting them to the
# object dtype. `astype` with a column -> dtype mapping returns a new frame, so
# this avoids assigning into a column subset of the train_test_split outputs
# (which can raise SettingWithCopyWarning on some pandas/scikit-learn versions)
# and makes the cell safe to re-run.
disc_as_object = {col: 'O' for col in disc_cols}
X_train = X_train.astype(disc_as_object)
X_val = X_val.astype(disc_as_object)

In [ ]:
# Pipeline whose only step (so far) adds missing-value indicator variables for
# the non-categorical features.
# NOTE(review): `mdi` is not imported in any visible cell — presumably it is
# feature_engine's imputation module; confirm and add it to the imports cell,
# otherwise this cell raises NameError on a fresh kernel.
ml_pipeline = Pipeline([
    ('missing_indicator', mdi.AddMissingIndicator(
        # `non_cat_cols` was derived from `train`, so it can still contain the
        # columns in `drop_cols`, which were removed from X_train / X_val before
        # the split; exclude them so the step only references columns that are
        # present in the feature frames.
        variables=[var for var in non_cat_cols if var not in drop_cols]
    ))
])