# Imports & Config

In [1]:
import os
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from plotly.subplots import make_subplots
from dotenv import dotenv_values
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor

# Load key/value settings from the local ".env" file.
config = dotenv_values(".env")

# Register the Mapbox token read from ".env" (raises KeyError if MAPBOX_TOKEN is absent)
# and make "plotly_white" the default template for every figure.
px.set_mapbox_access_token(config['MAPBOX_TOKEN'])
pio.templates.default = "plotly_white"

# Seed NumPy's global random generator with a fixed value.
np.random.seed(711)

In [2]:
# Read the training data from "train.csv" in the working directory.
df = pd.read_csv('train.csv')

# Let pandas display enough rows to show one row per column (plus 50 spare).
pd.options.display.max_rows = len(df.columns) + 50

# Modeling

**Pipeline**:

1. Set Pipeline Options:
- What Data Fixing Operations should be used (bool flags)
- What new features (via methods) should be added
- Scaling options to use (bool flags)
- Model hyperparams

2. Data Fixing
- Remove inessential columns
- Fix NA/missing values
- Fix erroneous values (e.g. "None" vs None)
- Remove outliers *with care*

3. Feature Engineering
- Add new features

4. Feature Scaling
- Convert categorical features
- (optional) Scale/Normalize features
- Standardize features

5. Model preparation
- split train/test (use k-folds)
- set hyper-params (or use grid/random search)
- set metrics and visual analysis

## Feature Gen & Helpers

In [3]:
class NewFeatures:
    """Namespace for feature-engineering functions that take and return a DataFrame."""

    # Adds the month number, e.g. second month of the second year = (1 * 12) + 2 = 14
    @staticmethod
    def add_month_number(df):
        '''
        Returns a new DataFrame with an added column "month_number"; the input frame is left untouched.

        month_number counts months from January of the earliest YrSold, which is month 1.
        For instance, if the earliest year is 2005, February 2005 = 2 and February 2006 = 14.
        YrSold and MoSold are converted to numeric in the returned frame.
        '''
        # Work on a copy so the caller's frame (possibly a slice of a larger one) is not modified.
        df = df.copy()

        years = pd.to_numeric(df['YrSold'])
        months = pd.to_numeric(df['MoSold'])
        df['YrSold'] = years
        df['MoSold'] = months

        base_year = years.min()
        df['month_number'] = (years - base_year) * 12 + months

        return df

In [188]:
class Helpers:
    """Stateless DataFrame utilities for splitting data and encoding categorical columns."""

    # Returns two DataFrames: a training set with every month before the last one,
    # and a test set with just the last month
    @staticmethod
    def separate_final_month(df):
        '''
        Returns two new DataFrames. The first includes all records & columns except records
        from the final month (as determined by col month_number).
        The second includes all records & columns from the final month.
        If month_number is missing it is added via NewFeatures.add_month_number.
        '''
        df = df.copy()
        if 'month_number' not in df:
            df = NewFeatures.add_month_number(df)

        final_month = df.month_number.max()
        train_set = df.loc[df.month_number < final_month]
        test_set = df.loc[df.month_number == final_month]

        return train_set, test_set

    @staticmethod
    def split_x_y(df):
        '''
        Splits a frame into a feature matrix (every column except SalePrice) and
        a target column vector of shape (len(df), 1).
        '''
        x = df.drop(columns=['SalePrice']).values
        y = df['SalePrice'].values.reshape((len(df), 1))
        return x, y

    @staticmethod
    def convert_to_ordinal(df, columns, ignore_na=False):
        '''
        Returns a copy of df in which each listed column is replaced by integer codes
        1..n_categories, assigned in order of first appearance.

        Missing values are never treated as a category: they become 0, or stay
        NaN when ignore_na is True.
        '''
        df = df.copy()
        for col in columns:
            # factorize numbers distinct values from 0 in order of appearance and
            # marks missing entries with -1, so "+ 1" yields 1..n with 0 for missing.
            codes, _ = pd.factorize(df[col])
            if ignore_na:
                df[col] = np.where(codes >= 0, codes + 1, np.nan)
            else:
                df[col] = codes + 1

        return df

    @staticmethod
    def convert_to_truncated_ordinal(df, columns, ignore_na=False, threshold=0.01):
        '''
        Similar to convert_to_ordinal. Values whose share of the rows is below
        `threshold` (default 1%) are first grouped into a single "other" value.
        The input frame is not modified.
        '''
        df = df.copy()
        for col in columns:
            value_pcts = df[col].value_counts() / len(df)
            vals_to_group = value_pcts.index[value_pcts < threshold]
            if len(vals_to_group) > 0:
                df[col] = df[col].mask(df[col].isin(vals_to_group), 'other')

        return Helpers.convert_to_ordinal(df, columns, ignore_na=ignore_na)

    @staticmethod
    def convert_to_one_hot(df, columns, ignore_na=False, drop_original_cols=True):
        '''
        Returns a copy of df with indicator columns appended for each listed column
        (all columns when `columns` is None). Unless ignore_na is True, an extra
        indicator column for missing values is generated. The source columns are
        dropped when drop_original_cols is True.
        '''
        df = df.copy()
        if columns is None:
            columns = df.columns
        onehot_cols = pd.get_dummies(df.loc[:, columns], columns=columns, dummy_na=(not ignore_na))
        df = df.join(onehot_cols)
        if drop_original_cols:
            df = df.drop(columns=columns)

        return df


## Pipeline

In [301]:
# Serves purely as a namespace
class PerColAdj:
    """Empty holder class; instances carry no state of their own."""

    def __init__(self):
        """Nothing to initialise."""
        return None

In [547]:
class DataFixing:
    """
    Applies a fixed sequence of cleaning operations to a DataFrame.

    Options are passed as keyword arguments and stored as instance attributes.
    Recognised options (all optional):
      remove_columns_list  list of columns to drop up front
      fix_na               {column: "drop" | "zero" | "median" | "bottom" | "top"}
      convert_categorical  {column: "ordinal" | "truncated_ordinal" | anything else -> one-hot}
      fix_error_values     bool (currently a no-op)
      cap_outliers         {column: "top_cap" | "bottom_cap" | "drop" | anything else -> cap both ends}
      remove_non_numeric   bool, keep only numeric columns
      standardize_cols     list of columns to z-score
    """

    def __init__(self, **kwargs):
        self.df = None
        self.__dict__.update(kwargs)

    def fix(self, df):
        """
        Run every cleaning operation on `df` in a fixed order and return the result.

        A SalePrice column, if present, is removed first. After each operation the
        share of rows it removed (if any) is printed.
        """
        self.df = df

        if 'SalePrice' in self.df:
            print("Removing SalePrice target before operations")
            self.df = self.df.drop(columns=["SalePrice"])

        # The order is explicit because later steps depend on earlier ones
        # (e.g. NA filling happens before categorical conversion).
        ops = [
            self._remove_columns,
            self._fix_na,
            self._convert_categorical,
            self._fix_error_values,
            self._fix_outliers,
            self._remove_non_numeric_columns,
            self._standardize_values
        ]

        for op in ops:
            starting_record_count = len(self.df)
            op()
            if starting_record_count == 0:
                # Nothing to measure against; also avoids dividing by zero.
                continue
            pct_record_reduction = 1 - (len(self.df) / starting_record_count)
            if pct_record_reduction > 0.0:
                print(f'{op.__name__} cut {pct_record_reduction*100:.2f}% of records')

        return self.df

    def _remove_columns(self):
        """Drop the columns named in the `remove_columns_list` option."""
        columns = getattr(self, 'remove_columns_list', [])
        self.df = self.df.drop(columns=columns)

    def _fix_na(self):
        """
        Handle missing values per the `fix_na` option.

        "drop" (and any unrecognised op) removes the whole column; "zero" fills with 0;
        "median" fills with the column median; "bottom"/"top" fill with the 2nd/98th
        percentile. Any NaN left in any column afterwards is filled with 0.
        """
        na_map = getattr(self, "fix_na", {})

        drop_na_cols = []
        zero_na_cols = []

        for col, op in na_map.items():
            if op == "zero":
                zero_na_cols.append(col)
            elif op == "median":
                self.df[col] = self.df[col].fillna(self.df[col].median())
            elif op == "bottom":
                self.df[col] = self.df[col].fillna(self.df[col].quantile(.02))
            elif op == "top":
                self.df[col] = self.df[col].fillna(self.df[col].quantile(.98))
            else:
                # "drop" and anything unrecognised: remove the column entirely.
                drop_na_cols.append(col)

        if drop_na_cols:
            self.df = self.df.drop(columns=drop_na_cols)
        if zero_na_cols:
            self.df[zero_na_cols] = self.df[zero_na_cols].fillna(0)

        # TODO REMOVE THIS ONCE ALL COLS ADDRESSED
        self.df = self.df.fillna(0)

    def _fix_error_values(self):
        """Placeholder for correcting erroneous values; currently does nothing."""
        should_fix_error_values = getattr(self, 'fix_error_values', False)
        if should_fix_error_values:
            pass

    def _remove_non_numeric_columns(self):
        """Keep only numeric columns when the `remove_non_numeric` option is truthy."""
        should_remove_non_numeric = getattr(self, 'remove_non_numeric', False)
        if should_remove_non_numeric:
            self.df = self.df._get_numeric_data()

    def _convert_categorical(self):
        """
        Encode columns per the `convert_categorical` option using the Helpers
        converters; any op other than "ordinal"/"truncated_ordinal" means one-hot.
        """
        conversion_map = getattr(self, "convert_categorical", {})
        ordinal_cols = []
        truncated_ordinal_cols = []
        onehot_cols = []
        for col, op in conversion_map.items():
            if op == "ordinal":
                ordinal_cols.append(col)
            elif op == "truncated_ordinal":
                truncated_ordinal_cols.append(col)
            else:
                onehot_cols.append(col)

        if ordinal_cols:
            self.df = Helpers.convert_to_ordinal(self.df, ordinal_cols)

        if truncated_ordinal_cols:
            self.df = Helpers.convert_to_truncated_ordinal(self.df, truncated_ordinal_cols)

        if onehot_cols:
            self.df = Helpers.convert_to_one_hot(self.df, onehot_cols)

    def _standardize_values(self):
        """Z-score the columns listed in `standardize_cols` (mean 0, sample std 1)."""
        columns = getattr(self, 'standardize_cols', [])
        if len(columns) > 0:
            subset_df = self.df[columns]
            # A constant column has std 0; divide by 1 instead so it becomes
            # all zeros rather than NaN.
            spread = subset_df.std().replace(0, 1)
            # Whole-column assignment lets integer columns become float.
            self.df[columns] = (subset_df - subset_df.mean()) / spread

    def _fix_outliers(self, thresholds=(.02, .98)):
        """
        Treat outliers per the `cap_outliers` option.

        `thresholds` are the (lower, upper) quantiles that define the accepted range.
        "bottom_cap"/"top_cap" clamp one side, "drop" removes the rows that fall
        outside the range, and any other op clamps both sides.
        """
        outlier_map = getattr(self, "cap_outliers", {})

        for col, op in outlier_map.items():
            bottom_cap_value = self.df[col].quantile(thresholds[0])
            top_cap_value = self.df[col].quantile(thresholds[1])

            if op == "bottom_cap":
                self.df[col] = self.df[col].clip(lower=bottom_cap_value)
            elif op == "top_cap":
                self.df[col] = self.df[col].clip(upper=top_cap_value)
            elif op == "drop":
                # Remove the offending records instead of blanking the value to NaN.
                in_range = (self.df[col] >= bottom_cap_value) & (self.df[col] <= top_cap_value)
                self.df = self.df.loc[in_range].copy()
            else:
                self.df[col] = self.df[col].clip(lower=bottom_cap_value, upper=top_cap_value)


In [6]:
class FeatureGeneration:
    """Runs a list of feature-building callables over a DataFrame, one after another."""

    def __init__(self, list_of_methods):
        # Each entry is expected to accept a DataFrame and return a DataFrame.
        self.feature_methods = list_of_methods

    def generate(self, df):
        """
        Return `df` after passing it through every callable in feature_methods.

        A SalePrice column, if present, is removed first. Entries that are not
        callable are skipped.
        """
        frame = df
        if 'SalePrice' in frame:
            print("Removing SalePrice target before operations")
            frame = frame.drop(columns=["SalePrice"])

        for transform in filter(callable, self.feature_methods):
            frame = transform(frame)

        return frame

In [7]:
class FeatureScaler:
    """
    Applies whole-frame scaling steps to an already-numeric DataFrame.

    Options are passed as keyword arguments and stored as instance attributes.
    Recognised option: `standardize_data` (bool) to z-score every column.
    """

    def __init__(self, **kwargs):
        self.df = None
        self.__dict__.update(kwargs)

    def scale(self, df):
        """
        Scale `df` and return the result.

        Returns False when `df` has no rows. A SalePrice column, if present, is
        removed first. Raises Exception when any column has object dtype.
        Every callable attribute whose name starts with a single underscore is
        then invoked as a scaling step.
        """
        self.df = df
        if len(self.df) == 0:
            return False

        if 'SalePrice' in self.df:
            print("Removing SalePrice target before operations")
            self.df = self.df.drop(columns=["SalePrice"])

        if not self.columns_are_valid():
            raise Exception("Ensure all columns are numeric")

        for attr_name in dir(self):
            # Single leading underscore only; dunder attributes are skipped.
            if attr_name.startswith('_') and not attr_name.startswith('__'):
                attr = getattr(self, attr_name)
                if callable(attr):
                    attr()

        return self.df

    def columns_are_valid(self):
        """Return False (and print a notice) if any column has object dtype, else True."""
        # NOTE(review): only object dtype is rejected; other non-numeric dtypes
        # (e.g. datetime) are not detected here.
        if any(dtype == object for dtype in self.df.dtypes):
            print('found an o dtype')
            return False
        return True

    def _standardize_data(self):
        """Z-score every column (mean 0, sample std 1) when `standardize_data` is truthy."""
        should_standardize_features = getattr(self, 'standardize_data', False)
        if should_standardize_features:
            # A constant column has std 0; divide by 1 instead so it becomes
            # all zeros rather than NaN.
            spread = self.df.std().replace(0, 1)
            self.df = (self.df - self.df.mean()) / spread


In [512]:
class Modeler:
    """Thin wrapper around a scikit-learn tree regressor with an RMSE metric."""

    def __init__(self, use_random_forest=True):
        """Create a RandomForestRegressor, or a single DecisionTreeRegressor when use_random_forest is falsy."""
        self.is_random_forest = use_random_forest
        if use_random_forest:
            self.clf = RandomForestRegressor(random_state=711)
        else:
            self.clf = tree.DecisionTreeRegressor(random_state=711)

    def fit(self, x, y):
        """Fit the wrapped regressor on features `x` and targets `y`."""
        self.clf.fit(x, y)

    def predict(self, x):
        """Return the wrapped regressor's predictions for `x`."""
        return self.clf.predict(x)

    def calc_error(self, predicted, actual):
        """
        Return the root-mean-squared error between `predicted` and `actual`.

        When both hold the same number of values but in different shapes
        (e.g. (n,) vs (n, 1)), they are flattened first so values are compared
        pairwise instead of being broadcast into an n-by-n matrix.
        """
        predicted = np.asarray(predicted)
        actual = np.asarray(actual)
        if predicted.shape != actual.shape and predicted.size == actual.size:
            predicted = predicted.ravel()
            actual = actual.ravel()
        return np.sqrt(np.mean((predicted - actual) ** 2))

    def train(self, df, rounds=1):
        """
        Fit on every month except the last and score on the last month.

        Returns a list with one RMSE per round.

        NOTE(review): evaluate() called train(), but no such method existed in this
        class. This version mirrors the split used by Pipeline.run; the split and
        the regressor seed are fixed, so extra rounds repeat the same result.
        Confirm the intended resampling scheme.
        """
        rmses = []
        for _ in range(rounds):
            train_set, test_set = Helpers.separate_final_month(df)
            x_train, y_train = Helpers.split_x_y(train_set)
            x_test, y_test = Helpers.split_x_y(test_set)
            self.fit(x_train, y_train.ravel())
            rmses.append(self.calc_error(self.predict(x_test), y_test))
        return rmses

    def evaluate(self, df):
        """Return the mean RMSE of a single train() round on `df`."""
        rmses = self.train(df, rounds=1)
        return np.mean(rmses)


In [526]:
class Pipeline:
    """
    Chains DataFixing, FeatureGeneration, FeatureScaler and Modeler over one DataFrame
    and keeps the fitted model and predictions for plotting.
    """

    def __init__(self, df, fixing_options, gen_options, scaling_options, modeling_options):
        """
        df               raw data including the SalePrice target
        fixing_options   kwargs for DataFixing
        gen_options      list of feature callables for FeatureGeneration
        scaling_options  kwargs for FeatureScaler
        modeling_options kwargs for Modeler
        """
        self.df = df
        self.options = {'fix': fixing_options, 'gen': gen_options, 'scale': scaling_options, 'model': modeling_options}
        # Used for Graphing
        self.clf = None
        self.y_pred = []
        self.y_actual = []
        # Column names of the feature matrix the model was fit on (set by run()).
        self.feature_names = []

    def validate(self):
        """Placeholder validation hook; always returns True."""
        return True

    def _model_feature_names(self):
        """Names of the fitted features; before run(), every column except SalePrice."""
        if self.feature_names:
            return list(self.feature_names)
        return [col for col in self.df.columns if col != 'SalePrice']

    def graph_predictions(self):
        """Return a figure overlaying predictions and actuals, or None if run() has not produced any."""
        if len(self.y_pred) > 0 and len(self.y_pred) == len(self.y_actual):
            fig = go.Figure()
            fig.add_trace(
                go.Scatter(y=self.y_pred, name="prediction")
            )
            fig.add_trace(
                go.Scatter(y=self.y_actual, name="actual")
            )
            return fig

    def plot_tree(self):
        """
        Draw the fitted decision tree (first 6 levels), save it to "decision_tree.png"
        and return the figure. Returns None for a random forest or before run().
        """
        # Same default as Modeler: a missing option means random forest.
        if self.options['model'].get('use_random_forest', True):
            return None
        if len(self.y_pred) == 0:  # Another way of checking that the CLF has been fit
            return None

        fig = plt.figure(figsize=(96, 12))
        _ = tree.plot_tree(
            self.clf,
            max_depth=6,
            # Must match the fitted feature matrix, so the target is excluded.
            feature_names=self._model_feature_names(),
            filled=True,
            fontsize=10,
        )
        fig.savefig("decision_tree.png")
        return fig

    def plot_feature_importances(self):
        """Return a bar chart of feature importances, largest first, or None before run()."""
        if self.clf is None:
            return None

        ranked = sorted(
            zip(self._model_feature_names(), self.clf.feature_importances_),
            key=lambda kv: kv[1],
            reverse=True,
        )
        cols = [name for name, _ in ranked]
        vals = [value for _, value in ranked]

        return px.bar(x=cols, y=vals)

    def run(self):
        """
        Clean, engineer and scale the features, fit the model on every month except the
        last, predict the last month, print and return the RMSE.

        NOTE: self.df is replaced by the processed frame, so calling run() a second
        time on the same instance starts from already-processed data.
        """
        fixer = DataFixing(**self.options['fix'])
        feature_generator = FeatureGeneration(self.options['gen'])
        scaler = FeatureScaler(**self.options['scale'])
        modeler = Modeler(**self.options['model'])

        # Perform Data Operations and Feature Generation
        data = self.df.drop(columns=['SalePrice'])
        data = fixer.fix(data)
        data = feature_generator.generate(data)
        data = scaler.scale(data)

        # Re-attach the target by index; rows removed during fixing stay removed.
        self.df = data.join(self.df.SalePrice)

        # Split into train and test sets
        train, test = Helpers.separate_final_month(self.df)
        x_train, y_train = Helpers.split_x_y(train)
        x_test, y_test = Helpers.split_x_y(test)
        # Remember exactly which columns the model sees, for the plotting helpers.
        self.feature_names = list(train.drop(columns=['SalePrice']).columns)

        # Model
        modeler.fit(x_train, y_train.flatten())
        y_pred = modeler.predict(x_test)
        error = modeler.calc_error(y_pred, y_test.flatten())

        self.y_pred = y_pred
        self.y_actual = y_test.flatten()
        self.clf = modeler.clf

        print(f'RMSE: {error}')
        return error

# Run

In [555]:
# Best (lowest) RMSE recorded so far; the run cell compares each new result against it.
high_score = 24056.51147393528
print(f'High Score: RMSE {high_score}')

High Score: RMSE 24056.51147393528


In [554]:
# Options consumed by DataFixing (passed through as keyword arguments).
# Dict order matters: columns are processed in the order they are listed.
data_fixing_options = {
    'remove_columns_list': [
        'Id',
        'LowQualFinSF',
    ],
    'fix_na': {
        # drop, zero, median, top, bottom
        'LotFrontage': 'bottom',
        'GarageYrBlt': 'median',
        'GarageType': 'zero',
        'Fence': 'zero',
    },
    'standardize_cols': [
        'LotArea',
        '1stFlrSF',
        '2ndFlrSF',
        'GrLivArea',
    ],
    'fix_error_values': False,
    'remove_non_numeric': True,
    'convert_categorical':{
        # ordinal, truncated_ordinal, one_hot
        'Alley'        : 'truncated_ordinal',
        'BldgType'     : 'ordinal',
        'HouseStyle'   : 'ordinal', # consider one hot
        'Street'       : 'truncated_ordinal',
        'Heating'      : 'truncated_ordinal',
        'HeatingQC'    : 'ordinal',
        'CentralAir'   : 'ordinal',
        'Electrical'   : 'ordinal',
        'Utilities'    : 'truncated_ordinal',
        'PoolQC'       : 'truncated_ordinal',
        'Condition1'   : 'truncated_ordinal',
        'MiscFeature'  : 'ordinal',
        'RoofMatl'     : 'truncated_ordinal',
        'RoofStyle'    : 'truncated_ordinal',
        'Exterior1st'  : 'truncated_ordinal',
        'Exterior2nd'  : 'truncated_ordinal',
        'ExterQual'    : 'ordinal',
        'ExterCond'    : 'ordinal',
        'Foundation'   : 'ordinal',
        'BsmtQual'     : 'ordinal',
        'BsmtCond'     : 'ordinal',
        'BsmtExposure' : 'ordinal',
        'BsmtFinType1' : 'ordinal',
        'MSSubClass'   : 'ordinal',
        'MSZoning'     : 'truncated_ordinal',
        'LandContour'  : 'truncated_ordinal',
        'LotShape'     : 'truncated_ordinal',
        'LotConfig'    : 'truncated_ordinal',
        'LandSlope'    : 'truncated_ordinal',
        'Neighborhood' : 'ordinal', # consider one hot
        'KitchenQual'  : 'ordinal',
        'Functional'   : 'ordinal', # consider truncated
        'FireplaceQu'  : 'ordinal',
        'GarageType'   : 'ordinal', # consider truncated
        'GarageFinish' : 'ordinal',
        'GarageQual'   : 'ordinal', # consider truncated
        'GarageCond'   : 'ordinal', # consider truncated
        'PavedDrive'   : 'ordinal',
        'Fence'        : 'ordinal',
        'SaleType'     : 'truncated_ordinal',
        'SaleCondition': 'truncated_ordinal',
    },
    'cap_outliers':{
        # top_cap, bottom_cap, cap, drop
        'LotFrontage': 'top_cap',
        'LotArea': 'top_cap',
        'TotalBsmtSF': 'top_cap',
        '1stFlrSF': 'top_cap',
        '2ndFlrSF': 'top_cap',
        'GrLivArea': 'top_cap',
    }
}

# NOTE: feature_engineering_methods, data_scaling_options and modeling_options are
# defined in a cell that appears further down in this file; that cell has to be
# executed before this one.
# Only sales with YrSold before 2010 are passed to the pipeline.
pipe = Pipeline(df.loc[(df.YrSold < 2010)], data_fixing_options, feature_engineering_methods, data_scaling_options, modeling_options)
rmse = pipe.run()
# A lower RMSE than the recorded high score counts as a new best.
if rmse < high_score:
    print(f"Wo0t! New High Score: {rmse}")

RMSE: 24056.51147393528


In [537]:
# Show the fitted model's feature importances as a bar chart (requires pipe.run() first).
pipe.plot_feature_importances()

In [553]:
# Peek at the first five processed rows, transposed so each feature is a row.
pipe.df.head().T

Unnamed: 0,0,1,2,3,4
MSSubClass,1.0,2.0,1.0,3.0,1.0
LotFrontage,65.0,80.0,68.0,60.0,84.0
LotArea,-0.340204,-0.069832,0.318093,-0.081587,1.025761
Street,1.0,1.0,1.0,1.0,1.0
Alley,1.0,1.0,1.0,1.0,1.0
LandContour,1.0,1.0,1.0,1.0,1.0
Utilities,1.0,1.0,1.0,1.0,1.0
Condition2,1.0,1.0,1.0,1.0,1.0
OverallQual,7.0,6.0,7.0,7.0,8.0
OverallCond,5.0,8.0,5.0,5.0,5.0


In [533]:
# Overlay predicted and actual sale prices for the held-out final month.
pipe.graph_predictions()

In [538]:
# Remaining Pipeline inputs. The cell that builds and runs `pipe` reads these three
# names, so this cell has to be executed before it.

# Callables handed to FeatureGeneration; each takes and returns a DataFrame.
feature_engineering_methods = [
    NewFeatures.add_month_number
]

# Keyword arguments for FeatureScaler.
data_scaling_options = {
    'standardize_data':False, 
}

# Keyword arguments for Modeler.
modeling_options = {
    'use_random_forest': True,
}