# Imports & Config

In [1]:
import os
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from plotly.subplots import make_subplots
from dotenv import dotenv_values

from sklearn import preprocessing
from sklearn import tree

config = dotenv_values(".env")

px.set_mapbox_access_token(config['MAPBOX_TOKEN'])
pio.templates.default = "plotly_white"

In [2]:
df = pd.read_csv('train.csv')

In [3]:
class NewFeatures:

    # Adds the month number, e.g. second month of seconds year = (1 * 12) + 2 = 14
    @staticmethod
    def add_month_number(df):
        '''
        Returns a new DataFrame with addded column "month_number". The month_number starts at 1 and increments to the final month.
        For instance, if the earliest date is January 2005, February 2005 = 2 and February 2006 = 14.
        '''
        df.YrSold = pd.to_numeric(df.YrSold)
        df.MoSold = pd.to_numeric(df.MoSold)
        base_year = df.YrSold.min()
        df['month_number'] = (df.YrSold - base_year)*12 + df.MoSold

        return df

In [4]:
class Helpers:

    # Returns two DataFrames, a training set with all months n-1, and a test set with just month n where the target var, SalePrice is stripped
    @staticmethod
    def separate_final_month(df):
        '''
        Returns two new DataFrames. The first DataFrame includes all records & columns except records from the final month (as determined by col month_number). 
        The second DataFrame includes all records & columns from the final month, except the target feature, SalePrice.
        '''
        if 'month_number' not in df:
            df = df.NewFeatures.add_month_number(df)
        
        train_set = df.loc[df.month_number<df.month_number.max()]
        test_set = df.loc[df.month_number==df.month_number.max()].drop(columns=['SalePrice'])

        return train_set, test_set

**Pipeline**:

1. Set Pipeline Options:
- What Data Fixing Operations should be used (bool flags)
- What new features (via methods) should be added
- Scaling options to use (bool flags)
- Model hyperparams

2. Data Fixing
- Remove inessential columns
- Fix NA/missing values
- Fix erroneous values (e.g. "None" vs None)
- Remove outliers *with care*

3. Feature Engineering
- Add new features

4. Feature Scaling
- Convert categorical features
- (optional) Scale/Normalize features
- Standardize features

5. Model preparation
- split train/test (use k-folds)
- set hyper-params (or use grid/random search)
- set metrics and visual analysis

In [12]:
class DataFixing:

    def __init__(self, **kwargs):
        self.df = None
        self.__dict__.update(kwargs)

    def fix(self, df):
        # Loops through all class methods and calls the ones with prefix "_"
        self.df = df
        for attr_name in dir(self):
            if attr_name[0] == '_' and attr_name[1] != '_':
                attr = getattr(self, attr_name)
                if callable(attr):
                    print(f'before {attr_name}: {len(self.df)}')
                    attr()
                    print(f'after {attr_name}: {len(self.df)}')

        return self.df

    def _remove_columns(self, columns=None):
        should_remove_columns = getattr(self, 'remove_columns', False)
        if should_remove_columns:
            columns = getattr(self, 'remove_columns_list', columns)
            self.df = self.df.drop(columns=columns)

    def _fix_na(self):
        should_fix_na = getattr(self, 'fix_na', False)
        if should_fix_na:
            self.df = self.df.fillna(0)
        pass

    def _fix_error_values(self):
        should_fix_error_values = getattr(self, 'fix_error_values', False)
        if should_fix_error_values:
            pass
        pass

    def _fix_outliers(self):
        should_fix_outliers = getattr(self, 'fix_outliers', False)
        if should_fix_outliers:
            pass
        pass

    def _remove_non_numeric_columns(self):
        should_remove_non_numeric = getattr(self, 'remove_non_numeric', False)
        if should_remove_non_numeric:
            self.df = self.df._get_numeric_data()


In [13]:
class FeatureGeneration:

    def __init__(self, list_of_methods):
        self.feature_methods = list_of_methods

    def generate(self, df):
        for method in self.feature_methods:
            if callable(method):
                df = method(df)
        return df

In [14]:
class FeatureScaler:

    def __init__(self, **kwargs):
        self.df = None
        self.__dict__.update(kwargs)

    def scale(self, df):
        self.df = df
        if len(self.df) == 0:
            return False

        if not self.columns_are_valid():
            raise Exception("Ensure all columns are numeric")
        for attr_name in dir(self):
            if attr_name[0] == '_' and attr_name[1] != '_':
                attr = getattr(self, attr_name)
                if callable(attr):
                    attr()

        return self.df

    def columns_are_valid(self):
        # ensure that all dtypes are numeric
        dtypes = self.df.dtypes.unique()
        if 'O' in dtypes:
            print('found an o dtype')
            return False
        return True

    def _standardize_data(self):
        should_standardize_features = getattr(self, 'standardize_data', False)
        if should_standardize_features:
            self.df = (self.df - self.df.mean()) / self.df.std()


In [15]:
class Modeler:

    def __init__(self):
        self.clf = tree.DecisionTreeRegressor()

    def fit(self, x, y):
        self.clf.fit(x, y)

    def predict(self, x):
        return self.clf.predict(x)

    def calc_error(self, predicted, actual):
        return np.sqrt((predicted - actual)**2)

    def evaluate(self, df):
        rmses = self.train(df, rounds=1)
        return np.mean(rmses)

    def train(self, df, rounds=1):
        rmses = []
        for _ in range(rounds):
            # split into train, test datasets
            x = df.drop(columns=['SalePrice'])
            y_act = df.SalePrice

            # fit
            self.fit(x, y_act)

            # TODO: of course this should be on a test set
            y_pred = self.predict(x)

            # Measure
            rmse = self.calc_error(y_pred, y_act)
            rmses.append(rmse)

        return rmses


In [19]:
class Pipeline:
    def __init__(self, df, fixing_options, gen_options, scaling_options, modeling_options):
        self.df = df
        self.options = {'fix': fixing_options, 'gen': gen_options, 'scale': scaling_options, 'model': modeling_options}
    
    def validate(self):
        return True
    
    def run(self):
        fixer = DataFixing(**self.options['fix'])
        feature_generator = FeatureGeneration(self.options['gen'])
        scaler = FeatureScaler(**self.options['scale'])
        modeler = Modeler()

        self.df = fixer.fix(self.df)
        self.df = feature_generator.generate(self.df)
        self.df = scaler.scale(self.df)

        score = modeler.evaluate(self.df)

        print(f'Score: {score}')

In [20]:
data_fixing_options = {
    'remove_columns': True,
    'remove_columns_list': [
        'Street',
        'Alley',
        'Utilities',
        'Condition2',
        'PoolQC',
        'MiscFeature',
        'RoofMatl',
        'Heating',
        'LowQualFinSF',
        'PoolArea'
    ],

    'fix_na': True,
    'fix_error_values': False,
    'fix_outliers': False,
    'remove_non_numeric': True
}

feature_engineering_methods = [
    NewFeatures.add_month_number
]

data_scaling_options = {
    'standardize_data':True, 
}

modeling_options = {}

In [26]:
pipe = Pipeline(df, data_fixing_options, feature_engineering_methods, data_scaling_options, modeling_options)
pipe.run()

before _fix_error_values: 1460
after _fix_error_values: 1460
before _fix_na: 1460
after _fix_na: 1460
before _fix_outliers: 1460
after _fix_outliers: 1460
before _remove_columns: 1460
after _remove_columns: 1460
before _remove_non_numeric_columns: 1460
after _remove_non_numeric_columns: 1460
Score: 2.281280187585938e-19
