# Initial Preprocessing

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

## Read in Training Data

In [None]:
# Locations of the gzipped train/test CSV inputs.
train_path = '/data/housing_prices/input/train.csv.gz'
test_path  = '/data/housing_prices/input/test.csv.gz'

In [None]:
# y_col is split off as the target below; id_col becomes the row index.
y_col, id_col = 'SalePrice', 'Id'

In [None]:
# Load the training CSV, discard rows that have no target value, then
# index the remaining rows by their id column.
x_train = pd.read_csv(train_path)
x_train = x_train.dropna(subset=[y_col])
x_train = x_train.set_index(id_col)

In [None]:
# Separate the target series from the feature frame in one step.
y_train, x_train = x_train[y_col], x_train.drop(columns=y_col)

In [None]:
# Column labels of the feature frame; reused below to split columns by dtype.
all_cols = x_train.columns

In [None]:
# Copy of the features taken before any transformation; it is later run
# through the assembled pipeline.
x_train_copy = x_train.copy()

## Split Features by dtype

In [None]:
# Number of feature columns per dtype.
x_train.dtypes.value_counts()

In [None]:
# Names of the columns stored as float64.
flt_cols = [col for col, dtype in zip(all_cols, x_train.dtypes) if str(dtype) == 'float64']

In [None]:
# Names of the columns stored as int64.
int_cols = [col for col, dtype in zip(all_cols, x_train.dtypes) if str(dtype) == 'int64']

In [None]:
# Names of the columns stored with the generic object dtype.
cat_cols = [col for col, dtype in zip(all_cols, x_train.dtypes) if str(dtype) == 'object']

## Normalize Category Labels

In [None]:
def normalize_levels(X: pd.DataFrame, cat_cols: list) -> pd.DataFrame:
    """Lower-case the labels of the listed columns and strip all whitespace.

    Args:
        X: Frame to normalize; it is visited one column at a time.
        cat_cols: Names of the string-valued columns to normalize. Columns
            not listed here are passed through unchanged.

    Returns:
        A new frame with the same columns as ``X``.
    """
    targets = set(cat_cols)  # constant-time membership test per column

    def normalize(x):
        if x.name in targets:
            # Raw string: '\s' in a plain literal is an invalid escape
            # sequence and emits a warning on current Python versions.
            return x.str.lower().replace(r'\s+', '', regex=True)
        return x

    return X.apply(normalize)

In [None]:
# Pipeline-compatible wrapper that applies normalize_levels to cat_cols.
txtnorm_step1 = FunctionTransformer(normalize_levels, kw_args={'cat_cols': cat_cols})

Replace labels that appear in the train data but don't match the data dictionary. These mismatches were discovered [below](#Encode-Categorical-Data) when attempting to encode the categorical fields with the information provided in the data dictionary.

In [None]:
# Per-column {label found in train data: spelling to use instead}.
# Keys are written in the already-normalized form (lower case, no spaces)
# because this replacement runs after normalize_levels in the pipeline.
replace_lvls = {
    'Exterior2nd': {'wdshng': 'wdshing', 'cmentbd': 'cemntbd', 'brkcmn': 'brkcomm'},
    'MSZoning': {'c(all)': 'c'},
    'BldgType': {'duplex': 'duplx', 'twnhs': 'twnhsi'}
}

In [None]:
def replace_levels(X: pd.DataFrame, replace_map: dict) -> pd.DataFrame:
    """Rewrite labels column by column according to ``replace_map``.

    ``replace_map`` maps a column name to a ``{old label: new label}`` dict.
    Columns without an entry are returned unchanged.
    """
    def _relabel(column):
        name = column.name
        return column.replace(replace_map[name]) if name in replace_map else column

    return X.apply(_relabel)

In [None]:
# Pipeline-compatible wrapper that applies replace_levels with replace_lvls.
txtnorm_step2 = FunctionTransformer(replace_levels, kw_args={'replace_map': replace_lvls})

In [None]:
# Text normalization runs first so that the label replacements in step2
# see lower-cased, whitespace-free values.
txtnorm_trans = Pipeline([
    ('step1', txtnorm_step1),
    ('step2', txtnorm_step2),
])

In [None]:
# Apply both text-normalization steps to the training features.
x_train = txtnorm_trans.fit_transform(x_train)

## Missing Values Stage I

From the data dictionary, NA is often used to represent a "missing" category rather than unknown information.  There are also instances where if one column is NA then other columns should simultaneously be NA.  Therefore, data imputation will require a two-staged strategy:

 - replace NA-as-a-level values with a label
 - fill actual missing NA values
 

Here we focus on the first stage.  Each set among the following should have consistent missing categorization. 

```
("MasVnrType", "MasVnrArea")
("MiscFeature", "MiscVal")
("PoolQC", "PoolArea")
("Fireplaces", "FireplaceQu") 
("GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond")
("BsmtCond", "BsmtQual", "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtFullBath", "BsmtHalfBath", "BsmtUnfSF")
```

We create the below data structure to hold the relationship:

```
["column that determines NA Status", {"column that depends on status": "NA label"}]
```
This setup allows for multiple columns to depend on one column and for each dependent column to have its own replacement value.  It also allows for columns that are only dependent on their own values (for example see `Alley`).

In [None]:
# Ordered list of [status column, {dependent column: fill value}] pairs.
# A dependent cell is filled only where both it and the status column are NA.
# Order matters: dependents of a status column are listed BEFORE the entry
# that fills the status column itself, otherwise the NA signal would be gone.
impute_map = [
    ['Alley', {
        'Alley': 'na',
    }],
    ['BsmtCond', {
        'BsmtQual': 'na',
        'BsmtExposure': 'na',
        'BsmtFinType1': 'na',
        'BsmtFinSF1': 0.0,
        'BsmtFinType2': 'na',
        'BsmtFinSF2': 0.0,
        'BsmtFullBath': 0.0,
        'BsmtHalfBath': 0.0,
        'BsmtUnfSF': 0.0,
    }],
    ['BsmtCond', {
        'BsmtCond': 'na',
    }],
    ['GarageType', {
        # -1.0 marks "no garage, hence no build year".
        # (This key was previously listed twice in this dict literal.)
        'GarageYrBlt': -1.0,
        'GarageFinish': 'na',
        'GarageCars': 0.0,
        'GarageArea': 0.0,
        'GarageQual': 'na',
        'GarageCond': 'na',
    }],
    ['GarageType', {
        'GarageType': 'na',
    }],
    ['Fence', {
        'Fence': 'na',
    }],
    ['FireplaceQu', {
        # Fireplaces is a numeric count, so its "none" value is 0.0 like the
        # other count/area columns, not the string label 'na'.
        'Fireplaces': 0.0,
    }],
    ['FireplaceQu', {
        'FireplaceQu': 'na',
    }],
    ['MasVnrType', {
        'MasVnrArea': 0.0,
    }],
    ['MasVnrType', {
        'MasVnrType': 'none',
    }],
    ['MiscFeature', {
        'MiscVal': 0.0,
    }],
    ['MiscFeature', {
        'MiscFeature': 'na',
    }],
    ['PoolQC', {
        'PoolQC': 'na',
    }],
]

The following function is compatible with sklearn pipeline and was written to accommodate the above data structure.

In [None]:
def conditional_fill(X: pd.DataFrame, impute_map: list) -> pd.DataFrame:
    """Fill NA cells whose "missingness" is implied by a status column.

    Args:
        X: Input frame; it is not modified.
        impute_map: Ordered sequence of ``[status_col, {target_col: fill}]``
            pairs. For each pair, a cell of ``target_col`` is set to ``fill``
            only on rows where both ``status_col`` and ``target_col`` are NA.
            Pairs are applied in order, so a pair that fills the status
            column itself must come after the pairs that depend on it.
            Target columns that are absent from ``X`` are skipped.

    Returns:
        A copy of ``X`` with the conditional fills applied. Columns in which
        nothing needed filling keep their original dtype.

    Raises:
        KeyError: If a status column is not present in ``X``.
    """
    Xcopy = X.copy()

    for status_col, fills in impute_map:
        # Snapshot the status mask once per entry, before any target in
        # this entry (possibly the status column itself) is filled.
        status_is_na = Xcopy[status_col].isna()
        for target_col, fill_val in fills.items():
            if target_col not in Xcopy.columns:
                continue
            target = Xcopy[target_col]
            to_fill = status_is_na & target.isna()
            # Skip untouched columns so their dtype is never promoted
            # (e.g. an integer column paired with a string fill value).
            if to_fill.any():
                Xcopy[target_col] = target.mask(to_fill, fill_val)

    return Xcopy

In [None]:
# Pipeline-compatible stage-one imputer driven by impute_map.
impute_step1 = FunctionTransformer(conditional_fill, kw_args={'impute_map': impute_map})

In [None]:
# Apply the stage-one (NA-as-a-level) fills to the training features.
x_train = impute_step1.fit_transform(x_train)

In [None]:
# Columns that still contain NA values, with their NA counts.
x_train.isna().sum()[x_train.isna().sum() > 0]

## Get Levels from Data Dictionary

There are many ordinal fields in the data.  The data dictionary contains the ordinal values in a reasonable order that was derived from some process outside of the data. Let's make that something we can apply directly in data preprocessing in a pipeline.

Some data is probably better off one-hot encoded so we won't need the ordinal information ... but just grabbing it anyway.

In [None]:
# Plain-text data dictionary that is parsed below for field levels.
dictionary_path = '/data/housing_prices/input/data_description.txt'

In [None]:
def record_feed(filepath: str):
    """Yield the data dictionary one field record at a time.

    The file is read line by line. Each line is stripped, blank lines are
    dropped, and a new record starts whenever the first space-separated
    token of a line ends with ':' (a field definition header).

    Args:
        filepath: Path of the text file to read.

    Yields:
        A non-empty list of stripped lines per record. For a well-formed
        file the first element is the ``Field: description`` header and the
        remaining elements are that field's level lines.
    """
    def detect_field(line):
        tokens = line.split(' ')
        return tokens[0].endswith(':')

    group = []
    # Context manager so the handle is closed even if the consumer stops
    # iterating early or an error occurs.
    with open(filepath) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            # Only flush when something has been collected; this avoids
            # emitting an empty record before the very first header.
            if group and detect_field(line):
                yield group
                group = []
            if line:
                group.append(line)
    if group:
        yield group

In [None]:
def make_levels(record: list) -> tuple:
    """Turn one data-dictionary record into a ``(field, levels)`` pair.

    Args:
        record: Lines of a single record; the first line is the
            ``Field: description`` header and any further lines are level
            lines of the form ``<level>\\t<description>``.

    Returns:
        ``(field, levels)`` where ``field`` is the header text before the
        first ':' and ``levels`` is the list of level labels, each
        lower-cased with spaces removed. ``levels`` is ``None`` when the
        record consists of the header line only.

    Raises:
        IndexError: If ``record`` is empty.
    """
    def clean_lvl(line):
        # The label is everything before the first tab on the line.
        lvl = line.split('\t')[0]
        return lvl.strip().lower().replace(' ', '')

    if len(record) > 1:
        cat, *lvls = record
        return cat.split(':')[0], [clean_lvl(lvl) for lvl in lvls]
    return record[0].split(':')[0], None

In [None]:
# {field: ordered level labels} for every categorical column that has
# levels listed in the data dictionary. The generator is consumed directly;
# there is no need to materialize it into a list first.
levels = {
    field: lvls
    for field, lvls in map(make_levels, record_feed(dictionary_path))
    if lvls and field in cat_cols
}

## Encode Categorical Data

In [None]:
def ordinal_encoder(X:pd.DataFrame, encodings: dict) -> pd.DataFrame:
    """Replace labels with their position in the given level lists.

    ``encodings`` maps a column name to an ordered list of labels; each
    label becomes its index in that list. Labels that are not in the list
    come out as NA. The input frame is left untouched.
    """
    encoded = X.copy()
    for column in encodings:
        codes = dict(
            (label, position)
            for position, label in enumerate(encodings[column])
        )
        encoded[column] = encoded[column].map(codes)
    return encoded

In [None]:
# Pipeline-compatible wrapper that applies ordinal_encoder with the parsed levels.
ordinal_trans = FunctionTransformer(ordinal_encoder, kw_args={'encodings': levels})

In [None]:
# Encode the categorical training columns as level positions.
x_train = ordinal_trans.fit_transform(x_train)

Make sure we didn't introduce any new NA values.

In [None]:
# Columns that contain NA values after encoding, with their NA counts.
# Labels absent from the level lists show up here because the encoder maps them to NA.
x_train.isna().sum()[x_train.isna().sum() > 0]

## Missing Values Stage II

In [None]:
# All 'Bsmt*' columns for the rows where BsmtExposure is NA.
x_train.loc[x_train['BsmtExposure'].isna(), x_train.columns[x_train.columns.str.startswith('Bsmt')]]

In [None]:
# All 'Bsmt*' columns for the rows where BsmtFinType2 is NA.
x_train.loc[x_train['BsmtFinType2'].isna(), x_train.columns[x_train.columns.str.startswith('Bsmt')]]

In [None]:
# Full rows for which Electrical is NA.
x_train.loc[x_train['Electrical'].isna(), :]

`BsmtExposure`, `BsmtFinType2` and `Electrical` seem like data entry errors.  `LotFrontage` is missing.  

We can keep things simple and use the `median` strategy for `LotFrontage` and `mode` for the others.

In [None]:
from sklearn.compose import make_column_transformer

In [None]:
# Stage-two imputation: median for LotFrontage, most frequent value for the
# three columns inspected above; every other column is passed through.
impute_stage2 = ColumnTransformer(
    transformers=[
        ('median', SimpleImputer(strategy='median'), ['LotFrontage']),
        (
            'mode',
            SimpleImputer(strategy='most_frequent'),
            ['BsmtExposure', 'BsmtFinType2', 'Electrical'],
        ),
    ],
    remainder='passthrough',
)

## Operationalize


Now that we've identified a baseline set of operations that are required before baseline modeling, let's organize into a single pipeline, test, and save artifacts necessary for replicating the process outside of this notebook.

In [None]:
# Full preprocessing chain, in the order the steps were developed above:
# text normalization -> conditional NA fill -> ordinal encoding -> imputation.
preprocessing_trans = Pipeline([
    ('txtnorm_trans', txtnorm_trans),
    ('impute_step1', impute_step1),
    ('ordinal_trans', ordinal_trans),
    ('impute_stage2', impute_stage2),
])

In [None]:
# Run the assembled pipeline end-to-end on the untransformed copy of the features.
x_train_copy = preprocessing_trans.fit_transform(x_train_copy)

In [None]:
# Shape of the pipeline output.
x_train_copy.shape

## Save Artifacts

In [None]:
# Everything needed to rebuild the preprocessing steps outside this notebook.
feature_meta = dict(
    flt_cols=flt_cols,
    int_cols=int_cols,
    cat_cols=cat_cols,
    replace_lvls=replace_lvls,
    impute_map=impute_map,
    levels=levels,
)

In [None]:
import json

In [None]:
# Serialize the feature metadata to JSON on disk.
with open('/data/housing_prices/input/feature_meta.json', 'w') as f:
    f.write(json.dumps(feature_meta))

In [None]:
# Read the saved metadata back to confirm the file is valid JSON.
# The context manager closes the handle, which the bare open() call leaked.
with open('/data/housing_prices/input/feature_meta.json') as f:
    test = json.load(f)