# Imports & Config

In [1]:
import os
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from plotly.subplots import make_subplots
from dotenv import dotenv_values
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn import tree

config = dotenv_values(".env")  # key/value settings read from the local .env file, so the token is not stored in the notebook

px.set_mapbox_access_token(config['MAPBOX_TOKEN'])  # raises KeyError if MAPBOX_TOKEN is missing from .env
pio.templates.default = "plotly_white"  # default plotly template for figures created after this line

np.random.seed(711)  # seed NumPy's global random generator

In [2]:
df = pd.read_csv('train.csv')  # raw training data; later cells read its YrSold, MoSold and SalePrice columns

# Modeling

**Pipeline**:

1. Set Pipeline Options:
- Which data-fixing operations to apply (bool flags)
- Which new features to add (passed as a list of methods)
- Which scaling options to use (bool flags)
- Model hyperparams

2. Data Fixing
- Remove inessential columns
- Fix NA/missing values
- Fix erroneous values (e.g. "None" vs None)
- Remove outliers *with care*

3. Feature Engineering
- Add new features

4. Feature Scaling
- Convert categorical features
- (optional) Scale/Normalize features
- Standardize features

5. Model preparation
- split train/test (use k-folds)
- set hyper-params (or use grid/random search)
- set metrics and visual analysis

## Feature Gen & Helpers

In [3]:
class NewFeatures:
    '''
    Feature-engineering functions. Each one takes a DataFrame and returns a
    DataFrame with the extra column(s) added.
    '''

    # Adds the month number, e.g. second month of the second year = (1 * 12) + 2 = 14
    @staticmethod
    def add_month_number(df):
        '''
        Returns a new DataFrame with added column "month_number".

        month_number = (YrSold - earliest YrSold) * 12 + MoSold, so months of the
        earliest sale year map to their calendar month number and the count keeps
        increasing through later years.
        For instance, if the earliest sale year is 2005, February 2005 = 2 and February 2006 = 14.

        YrSold and MoSold are converted with pd.to_numeric in the returned frame.
        The DataFrame passed in is not modified.
        '''
        # Work on a copy so the caller's frame does not silently gain/alter columns.
        df = df.copy()
        df['YrSold'] = pd.to_numeric(df['YrSold'])
        df['MoSold'] = pd.to_numeric(df['MoSold'])

        base_year = df['YrSold'].min()
        df['month_number'] = (df['YrSold'] - base_year) * 12 + df['MoSold']

        return df

In [4]:
class Helpers:
    '''
    Stateless DataFrame utilities used by the pipeline: train/test splitting,
    feature/target separation and categorical encoding. None of them modify
    the DataFrame passed in.
    '''

    # Returns two DataFrames, a training set with all months n-1, and a test set with just month n
    @staticmethod
    def separate_final_month(df):
        '''
        Returns two new DataFrames. The first DataFrame includes all records & columns except records from the final month (as determined by col month_number).
        The second DataFrame includes all records & columns from the final month.

        If the column month_number is missing it is added with NewFeatures.add_month_number.
        '''
        df = df.copy()
        if 'month_number' not in df:
            df = NewFeatures.add_month_number(df)

        final_month = df.month_number.max()
        train_set = df.loc[df.month_number < final_month]
        test_set = df.loc[df.month_number == final_month]

        return train_set, test_set

    @staticmethod
    def split_x_y(df):
        '''
        Splits a DataFrame into model inputs and target.

        Returns (x, y): x is a 2-D array of every column except SalePrice,
        y is SalePrice as a column vector of shape (len(df), 1).
        '''
        x = df.drop(columns=['SalePrice']).values
        y = df.SalePrice.values.reshape(-1, 1)
        return x, y

    @staticmethod
    def convert_to_ordinal(df, columns, ignore_na=False):
        '''
        Returns a new DataFrame in which each listed column is replaced by integer codes.

        Codes start at 1 and are assigned in order of first appearance of each
        non-missing value. Missing values are never given a category code:
        they become 0 when ignore_na is False (default) and stay missing when
        ignore_na is True.
        '''
        df = df.copy()
        for col in columns:
            # dropna() keeps NaN out of the mapping; otherwise NaN would receive
            # a regular code and the fillna(0) below could never apply.
            categories = df[col].dropna().unique()
            ordinal_map = {value: code for code, value in enumerate(categories, start=1)}

            encoded = df[col].map(ordinal_map)
            if not ignore_na:
                # No missing values remain after the fill, so an integer dtype is safe.
                encoded = encoded.fillna(0).astype(int)
            df[col] = encoded

        return df

    @staticmethod
    def convert_to_one_hot(df, columns, ignore_na=False, drop_original_cols=True):
        '''
        Returns a new DataFrame with one-hot (dummy) columns appended for the listed columns.

        columns: column labels to encode; None encodes every column.
        ignore_na: when False (default) an extra indicator column for missing values is created per encoded column.
        drop_original_cols: when True (default) the source columns are removed from the result.
        '''
        df = df.copy()
        if columns is None:
            columns = df.columns
        columns = list(columns)

        onehot_cols = pd.get_dummies(df.loc[:, columns], columns=columns, dummy_na=(not ignore_na))
        df = df.join(onehot_cols)
        if drop_original_cols:
            df = df.drop(columns=columns)

        return df


## Pipeline

In [5]:
class DataFixing:
    '''
    Applies the configured data-cleaning steps to a DataFrame.

    Options are given as keyword arguments and stored as instance attributes.
    Flags read by the steps (a missing flag counts as False):
      remove_columns      - drop the columns named in remove_columns_list
      fix_na              - replace missing values with 0
      fix_error_values    - placeholder, currently does nothing
      fix_outliers        - placeholder, currently does nothing
      remove_non_numeric  - keep only numeric columns
    '''

    def __init__(self, **kwargs):
        self.df = None
        self.__dict__.update(kwargs)

    def fix(self, df):
        '''
        Runs every step method (name starting with a single "_") on df and
        returns the resulting DataFrame. A SalePrice column, if present, is
        dropped first. Row counts are printed before and after each step.
        '''
        self.df = df

        if 'SalePrice' in self.df:
            print("Removing SalePrice target before operations")
            self.df = self.df.drop(columns=["SalePrice"])

        # dir() lists names alphabetically, so the steps run in name order
        # (_fix_error_values, _fix_na, _fix_outliers, _remove_columns,
        # _remove_non_numeric_columns), not in definition order.
        for attr_name in dir(self):
            # Only single-underscore names are steps; the length check keeps a
            # one-character attribute name from raising IndexError.
            if len(attr_name) < 2 or attr_name[0] != '_' or attr_name[1] == '_':
                continue
            attr = getattr(self, attr_name)
            if callable(attr):
                print(f'before {attr_name}: {len(self.df)}')
                attr()
                print(f'after {attr_name}: {len(self.df)}')

        return self.df

    def _remove_columns(self, columns=None):
        '''
        Drops the configured columns when the remove_columns flag is set.

        The option remove_columns_list takes precedence over the columns argument.
        Raises ValueError if the flag is set but no column list is available.
        '''
        if not getattr(self, 'remove_columns', False):
            return

        columns = getattr(self, 'remove_columns_list', columns)
        if columns is None:
            # Without this check pandas raises a generic "Need to specify ..." error.
            raise ValueError("remove_columns is enabled but no remove_columns_list was provided")
        self.df = self.df.drop(columns=columns)

    def _fix_na(self):
        '''Replaces every missing value with 0 when the fix_na flag is set.'''
        if getattr(self, 'fix_na', False):
            self.df = self.df.fillna(0)

    def _fix_error_values(self):
        '''Placeholder for repairing erroneous values; does nothing even when fix_error_values is set.'''
        if getattr(self, 'fix_error_values', False):
            pass  # TODO: not implemented yet

    def _fix_outliers(self):
        '''Placeholder for outlier handling; does nothing even when fix_outliers is set.'''
        if getattr(self, 'fix_outliers', False):
            pass  # TODO: not implemented yet

    def _remove_non_numeric_columns(self):
        '''Keeps only the numeric columns when the remove_non_numeric flag is set.'''
        if getattr(self, 'remove_non_numeric', False):
            # NOTE(review): _get_numeric_data is a private pandas method; a public
            # select_dtypes call may treat bool columns differently - confirm before switching.
            self.df = self.df._get_numeric_data()


In [6]:
class FeatureGeneration:
    '''
    Runs a list of feature-building callables over a DataFrame, one after another.
    Each callable receives the current DataFrame and must return the next one.
    '''

    def __init__(self, list_of_methods):
        self.feature_methods = list_of_methods

    def generate(self, df):
        '''
        Returns df after every callable in the method list has been applied in order.
        A SalePrice column, if present, is dropped first; non-callable list entries are skipped.
        '''
        has_target = 'SalePrice' in df
        if has_target:
            print("Removing SalePrice target before operations")
            df = df.drop(columns=["SalePrice"])

        for build_feature in filter(callable, self.feature_methods):
            df = build_feature(df)

        return df

In [7]:
class FeatureScaler:
    '''
    Applies the configured scaling steps to a DataFrame.

    Options are given as keyword arguments and stored as instance attributes.
    Flag read by the steps (a missing flag counts as False):
      standardize_data - centre each column on its mean and divide by its standard deviation
    '''

    def __init__(self, **kwargs):
        self.df = None
        self.__dict__.update(kwargs)

    def scale(self, df):
        '''
        Runs every step method (name starting with a single "_") on df and
        returns the resulting DataFrame.

        Returns False when df has no rows. A SalePrice column, if present, is
        dropped first. Raises Exception if any remaining column has object dtype.
        '''
        self.df = df
        if len(self.df) == 0:
            return False

        if 'SalePrice' in self.df:
            print("Removing SalePrice target before operations")
            self.df = self.df.drop(columns=["SalePrice"])

        if not self.columns_are_valid():
            raise Exception("Ensure all columns are numeric")

        # dir() lists names alphabetically, so steps run in name order.
        for attr_name in dir(self):
            # Only single-underscore names are steps; the length check keeps a
            # one-character attribute name from raising IndexError.
            if len(attr_name) < 2 or attr_name[0] != '_' or attr_name[1] == '_':
                continue
            attr = getattr(self, attr_name)
            if callable(attr):
                attr()

        return self.df

    def columns_are_valid(self):
        '''Returns False (and prints a note) if any column has object dtype, True otherwise.'''
        # Only the object dtype is rejected here; other non-numeric dtypes are not checked.
        if (self.df.dtypes == object).any():
            print('found an o dtype')
            return False
        return True

    def _standardize_data(self):
        '''
        Standardizes every column to zero mean and unit (sample) standard
        deviation when the standardize_data flag is set.
        '''
        if not getattr(self, 'standardize_data', False):
            return

        centered = self.df - self.df.mean()
        spread = self.df.std()
        # A constant column has std 0 and a single-row frame has std NaN; dividing
        # by either would fill the column with NaN/inf. Use 1 so such columns
        # simply become 0 after centring.
        spread = spread.where(spread > 0, 1.0)
        self.df = centered / spread


In [8]:
class Modeler:
    '''Thin wrapper around a scikit-learn DecisionTreeRegressor with RMSE scoring.'''

    def __init__(self):
        self.clf = tree.DecisionTreeRegressor()

    def fit(self, x, y):
        '''Fits the regressor on inputs x and targets y.'''
        self.clf.fit(x, y)

    def predict(self, x):
        '''Returns the regressor's predictions for inputs x.'''
        return self.clf.predict(x)

    def calc_error(self, predicted, actual):
        '''
        Returns the root-mean-squared error between predicted and actual.

        Both inputs are flattened first, so a column vector of shape (n, 1) and
        a flat array of shape (n,) are compared element by element.
        '''
        # Without flattening, (n,) - (n, 1) broadcasts to an (n, n) matrix and
        # the resulting "RMSE" is wrong.
        predicted = np.asarray(predicted).ravel()
        actual = np.asarray(actual).ravel()
        return np.sqrt(np.mean((predicted - actual) ** 2))

    def train(self, df, rounds=1):
        '''
        Fits and scores the model `rounds` times and returns the list of RMSEs.

        Uses the same hold-out as Pipeline.run: every month before the final one
        is the training set, the final month is the test set
        (Helpers.separate_final_month / Helpers.split_x_y).
        df must contain SalePrice plus either month_number or YrSold and MoSold.
        # assumes all remaining columns are numeric model inputs - TODO confirm with callers
        '''
        train_set, test_set = Helpers.separate_final_month(df)
        x_train, y_train = Helpers.split_x_y(train_set)
        x_test, y_test = Helpers.split_x_y(test_set)

        rmses = []
        for _ in range(rounds):
            self.fit(x_train, y_train)
            rmses.append(self.calc_error(self.predict(x_test), y_test))
        return rmses

    def evaluate(self, df):
        '''Returns the mean RMSE of a single train() round on df.'''
        rmses = self.train(df, rounds=1)
        return np.mean(rmses)


In [9]:
class Pipeline:
    '''
    End-to-end run: data fixing -> feature generation -> scaling -> decision-tree
    fit on every month before the final one -> RMSE on the final month.

    df must contain the SalePrice target. The four option arguments are stored in
    self.options under the keys 'fix', 'gen', 'scale' and 'model'.
    '''

    def __init__(self, df, fixing_options, gen_options, scaling_options, modeling_options):
        self.df = df
        # NOTE: the 'model' options are stored but run() does not read them.
        self.options = {'fix': fixing_options, 'gen': gen_options, 'scale': scaling_options, 'model': modeling_options}
        # Used for Graphing
        self.clf      = None
        self.y_pred   = []
        self.y_actual = []
        # Column names of the features the model was fit on (filled in by run()).
        self.feature_names = []

    def validate(self):
        '''Placeholder validation hook; always returns True.'''
        return True

    def graph_predictions(self):
        '''
        Returns a plotly Figure with the predicted and actual test-set values
        as two line traces, or None if run() has not produced predictions yet.
        '''
        if len(self.y_pred) == 0 or len(self.y_pred) != len(self.y_actual):
            return None

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(y=self.y_pred, name="prediction")
        )
        fig.add_trace(
            go.Scatter(y=self.y_actual, name="actual")
        )
        return fig

    def plot_tree(self):
        '''
        Draws the fitted tree (top 6 levels), saves it as decision_tree.png and
        returns the matplotlib figure. Returns None if the model has not been fit.
        '''
        if self.clf is None or len(self.y_pred) == 0:
            return None

        fig = plt.figure(figsize=(96,12))
        _ = tree.plot_tree(
            self.clf,
            max_depth=6,
            # Names of the columns actually used for fitting. The raw self.df
            # still contains SalePrice and non-feature columns, so its column
            # list does not line up with the tree's feature indices.
            feature_names=self.feature_names or None,
            filled=True,
            fontsize=10,
        )
        fig.savefig("decision_tree.png")
        return fig

    def run(self):
        '''
        Executes the whole pipeline and returns the test RMSE (also printed).

        Side effects: stores the fitted estimator in self.clf, the test
        predictions/targets in self.y_pred / self.y_actual and the feature
        column names in self.feature_names.
        Raises ValueError if the input frame has no rows.
        '''
        fixer = DataFixing(**self.options['fix'])
        feature_generator = FeatureGeneration(self.options['gen'])
        scaler = FeatureScaler(**self.options['scale'])
        modeler = Modeler()

        # Perform Data Operations and Feature Generation
        data = self.df.drop(columns=['SalePrice'])
        data = fixer.fix(data)
        data = feature_generator.generate(data)
        data = scaler.scale(data)
        if data is False:
            # FeatureScaler.scale signals an empty frame by returning False.
            raise ValueError("Pipeline received a DataFrame with no rows")

        # Split into train and test sets.
        # The target is re-attached from this pipeline's own frame (matched on
        # index), not from whatever `df` happens to exist in the notebook namespace.
        train, test = Helpers.separate_final_month(data.join(self.df.SalePrice))
        x_train, y_train = Helpers.split_x_y(train)
        x_test, y_test = Helpers.split_x_y(test)
        self.feature_names = list(train.drop(columns=['SalePrice']).columns)

        # Model
        modeler.fit(x_train, y_train)
        y_pred = modeler.predict(x_test)
        error = modeler.calc_error(y_pred, y_test.flatten())

        self.y_pred = y_pred
        self.y_actual = y_test.flatten()
        self.clf = modeler.clf

        print(f'RMSE: {error}')
        return error

# Run

In [10]:
data_fixing_options = {
    # When False, the column list below is not used by DataFixing._remove_columns.
    'remove_columns': False,
    'remove_columns_list': [
        'Street',
        'Alley',
        'Utilities',
        'Condition2',
        'PoolQC',
        'MiscFeature',
        'RoofMatl',
        'Heating',
        'LowQualFinSF',
        'PoolArea'
    ],

    # Replace missing values with 0.
    'fix_na': True,
    # The next two steps are placeholders in DataFixing and currently do nothing.
    'fix_error_values': False,
    'fix_outliers': False,
    # Keep only numeric columns.
    'remove_non_numeric': True
}

feature_engineering_methods = [  # applied in list order by FeatureGeneration.generate
    NewFeatures.add_month_number
]

data_scaling_options = {
    # When True, FeatureScaler subtracts each column's mean and divides by its standard deviation.
    'standardize_data':False, 
}

modeling_options = {}  # stored by Pipeline under options['model']; Pipeline.run does not read it

In [11]:
pipe = Pipeline(
    df=df.loc[df.YrSold < 2010],  # only sales from years before 2010
    fixing_options=data_fixing_options,
    gen_options=feature_engineering_methods,
    scaling_options=data_scaling_options,
    modeling_options=modeling_options,
)
pipe.run()

before _fix_error_values: 1285
after _fix_error_values: 1285
before _fix_na: 1285
after _fix_na: 1285
before _fix_outliers: 1285
after _fix_outliers: 1285
before _remove_columns: 1285
after _remove_columns: 1285
before _remove_non_numeric_columns: 1285
after _remove_non_numeric_columns: 1285
RMSE: 40680.75549445954


40680.75549445954

In [12]:
pipe.graph_predictions()  # plotly figure with the "prediction" and "actual" traces from the run above