In [None]:
# library imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 500)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# custom modules
from custom_modules import custom_plot, custom_encoder

In [None]:
class DataPreProcessor:
    '''
    Pre-processing helper for a tabular regression dataset: drops sparse
    columns, imputes missing values, encodes categorical columns and derives
    engineered features.

    The frame handed to the constructor is kept as the *reference* data: all
    missing-value proportions and imputation statistics are computed from it.
    Every method edits its ``X`` argument in place (``self.X`` by default) and
    returns None, so the same reference can be applied to a held-out frame.
    '''

    # Kaggle Housing columns where a missing entry is an intended 'NA' category
    # rather than unknown data.
    NA_ALLOWED = ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                  'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
                  'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature')

    def __init__(self, dataframe, target):
        '''
        @param dataframe: (pd.DataFrame) training data including the target column.
                          NOTE: edited in place -- rows with a missing target are
                          dropped here and later methods fill/transform columns,
                          so pass a copy if the original must be preserved.
        @param target: (str) name of the target column
        '''
        self.df = dataframe # reference frame used for fitting encoders, imputation stats, etc.

        # Remove rows with missing target first, so df, X and y share one index
        missing = dataframe.index[dataframe[target].isnull()].tolist()
        self.df.drop(missing, inplace=True)

        # Separate target from predictors
        self.X = self.df.drop(target, axis=1)
        self.y = self.df[target].copy()
        self.features = self.X.columns

    def log_transform_target(self):
        '''
        Replace the target with log(1 + target) in both self.y and the reference
        frame. Not idempotent: every call applies another log1p.
        '''
        self.y = np.log1p(self.y)
        self.df[self.y.name] = self.y

    def drop_missing(self, threshold=0.85,  X=None, verbose=False):
        '''
        Drop columns whose proportion of missing values in the reference frame
        is strictly greater than *threshold*. Missing entries of the NA_ALLOWED
        columns are first replaced by the literal category 'NA' (in both the
        reference frame and X), so they do not count as missing.

        @param threshold: (float) proportion in [0, 1]; with threshold=1 nothing
                          can be dropped
        @param X: (pd.DataFrame) frame to edit in place, defaults to self.X
        @param verbose: (bool) print missing-value totals, proportions and the
                        dropped columns
        '''
        X = self.X if X is None else X

        # ------------------------------------------------------------------------------------------ #
        # For the Kaggle Housing data, some features are intended to have 'NA' as a category,
        # so change the pd.NA for those to 'NA' where necessary.
        # Assign the filled column back instead of calling fillna(inplace=True) on a
        # column selection: the chained in-place form is deprecated and does nothing
        # under pandas Copy-on-Write. Columns absent from a frame are skipped.
        for feature in self.NA_ALLOWED:
            if feature in self.df.columns:
                self.df[feature] = self.df[feature].fillna('NA')
            if feature in X.columns:
                X[feature] = X[feature].fillna('NA')

        # ------------------------------------------------------------------------------------------ #
        # Number/proportion of missing values in each column of training data
        is_missing = self.df.isnull()
        missing_val_count_by_column = is_missing.sum()
        missing_val_prop_by_column  = is_missing.mean()

        # Drop columns missing more than n% of values
        to_drop = missing_val_prop_by_column[missing_val_prop_by_column > threshold].index.values

        # errors='ignore': X may be a held-out frame that lacks some of these columns
        X.drop(to_drop, axis=1, inplace=True, errors='ignore')
        self.features = X.columns
        if verbose:
            print(f"TOTALS:\n{missing_val_count_by_column[missing_val_count_by_column > 0]}\n")
            print(f"PROPORTIONS:\n{missing_val_prop_by_column[missing_val_prop_by_column > 0]}\n")
            print(f"DROPPED: {to_drop}\n")

    def impute(self, X=None):
        '''
        Impute missing values in X using statistics of the reference frame:
        the most frequent value for categorical (object/string/category)
        columns, the median for everything else.

        Only columns listed in self.features are considered; a column missing
        from X or from the reference frame is skipped.

        @param X: (pd.DataFrame) frame to edit in place, defaults to self.X
        '''
        X = self.X if X is None else X

        for feature in self.features:
            if feature not in X.columns or feature not in self.df.columns:
                continue

            column = X[feature]
            is_categorical = (
                pd.api.types.is_object_dtype(column)
                or pd.api.types.is_string_dtype(column)
                or isinstance(column.dtype, pd.CategoricalDtype)
            )

            if is_categorical:
                # fill categorical columns with most frequent value
                modes = self.df[feature].mode()
                if modes.empty:
                    continue # reference column is entirely missing: nothing to fill with
                fill_value = modes.iloc[0]
            else:
                # filling missing values with medians of the columns
                fill_value = self.df[feature].median()

            X[feature] = column.fillna(fill_value)

    def encode(self, encoder, args, col, X=None):
        '''
        Encode one categorical column of X with the encoder passed in.

        @param encoder: object exposing fit(*args) and transform(values)
        @param args: list of arguments used to fit the encoder
        @param col: name of the column of X that is overwritten with
                    encoder.transform(X[col])
        @param X: (pd.DataFrame) frame to edit in place, defaults to self.X
        '''
        X = self.X if X is None else X

        encoder.fit(*args)
        X[col] = encoder.transform(X[col])

    def feature_engineering(self, X=None):
        '''
        Dataset specific feature engineering. Expects the quality/condition
        columns to be numerically encoded already, since they are summed.
        Replaces groups of related columns by aggregate features and updates
        self.features.

        @param X: (pd.DataFrame) frame to edit in place, defaults to self.X
        '''
        X = self.X if X is None else X

        # Half baths count for half a bathroom
        X['TotalBath'] = X['BsmtFullBath'] + X['FullBath'] + ((X['BsmtHalfBath']  + X['HalfBath']) / 2)
        X.drop(['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'], axis=1, inplace=True)

        # Age = (Most Recent Yr in reference data) - (Yr House was Last Remod.)
        # The reference year comes from the reference frame, not from X, so that
        # training and held-out frames are measured against the same year.
        X['Age'] = self.df['YearRemodAdd'].max() - X['YearRemodAdd']

        X['ExterAggr'] = X['ExterQual'] + X['ExterCond']
        X.drop(['ExterQual', 'ExterCond'], axis=1, inplace=True)

        X['BsmtAggr'] = X['BsmtQual'] + X['BsmtCond'] + X['BsmtExposure'] + ((X['BsmtFinType1'] + X['BsmtFinType2']) / 2)
        X.drop(['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'], axis=1, inplace=True)

        X['GarageAggr'] = X['GarageFinish'] + X['GarageQual'] + X['GarageCond']
        X.drop(['GarageFinish', 'GarageQual', 'GarageCond'], axis=1, inplace=True)

        X['TotalSF'] = X['TotalBsmtSF'] + X['GrLivArea']
        X.drop(['TotalBsmtSF', 'GrLivArea', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)

        self.features = X.columns
    

In [None]:
def score_dataset(X_train, X_valid, y_train, y_valid, model, err_func):
    '''
    Fit *model* on the training split and score it on the validation split.

    @param model: estimator exposing fit(X, y) and predict(X); it is (re)fitted here
    @param err_func: callable(y_true, y_pred) returning the error
    @return: whatever err_func returns for the validation targets and predictions
    '''
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    error = err_func(y_valid, predictions)
    return error

def score_log_dataset(X_train, X_valid, y_train, y_valid, model, err_func):
    '''
    Same as score_dataset, for a model trained on a log1p-transformed target:
    predictions and validation targets are both mapped back with expm1 before
    the error is computed, so the score is in the target's original units.

    @param model: estimator exposing fit(X, y) and predict(X); it is (re)fitted here
    @param err_func: callable(y_true, y_pred) returning the error
    @return: whatever err_func returns on the back-transformed values
    '''
    model.fit(X_train, y_train)
    log_predictions = model.predict(X_valid)

    # undo the log1p transform on both sides of the comparison
    predictions = np.expm1(log_predictions)
    actual = np.expm1(y_valid)
    return err_func(actual, predictions)

def feature_importances(model, columns):
    '''
    Pair each column name with the fitted model's importance for it.

    @param model: fitted estimator exposing feature_importances_
    @param columns: sequence of feature names, in the order the model was fitted on
    @return: dict {column: importance}, sorted by importance, highest first
    '''
    # Read the attribute once: on ensemble estimators it is a computed property,
    # so looking it up inside the loop recomputes it for every column.
    importances = model.feature_importances_
    feat_imp = { name: importances[i] for i, name in enumerate(columns) }
    return dict(sorted(feat_imp.items(), key=lambda item: item[1], reverse=True))


###### Read Data

In [None]:
# Load the train and test CSVs, using the "Id" column as the row index
df_train = pd.read_csv(
    'data/train.csv',
    index_col="Id",
)
df_test = pd.read_csv(
    'data/test.csv',
    index_col="Id",
)

###### Distribution of Target

In [None]:
# Target distribution: raw sale price (left) next to its log1p transform (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5))

ax1.hist(df_train.SalePrice, bins=100)
ax1.set(xlabel="SalePrice", title="Sale Price Histogram")

ax2.hist(np.log1p(df_train.SalePrice), bins=100)
ax2.set(xlabel="log(SalePrice)", title="log(Sale Price) Histogram")

plt.show()

###### Preprocess Data & EDA

In [None]:
# Encoding strategy per categorical column, consumed by the encoding loops below:
#   'RankLabelEncoder' -> the column is encoded with custom_encoder.RankLabelEncoder,
#                         fitted on the training column together with the target
#   'custom'           -> the column's categories are replaced by the hand-written
#                         ordinal codes in custom_map (every 'custom' column needs an entry there)
encoder_map = { 
    'MSSubClass': 'RankLabelEncoder',
    'MSZoning': 'RankLabelEncoder',
    'Street': 'RankLabelEncoder',
    'Alley': 'RankLabelEncoder',
    'LotShape': 'custom',
    'LandContour': 'RankLabelEncoder',
    'Utilities': 'custom',
    'LotConfig': 'RankLabelEncoder',
    'LandSlope': 'RankLabelEncoder',
    'Neighborhood': 'RankLabelEncoder',
    'Condition1': 'RankLabelEncoder',
    'Condition2': 'RankLabelEncoder',
    'BldgType': 'RankLabelEncoder',
    'HouseStyle': 'RankLabelEncoder',
    'RoofStyle': 'RankLabelEncoder',
    'RoofMatl': 'RankLabelEncoder',
    'Exterior1st': 'RankLabelEncoder',
    'Exterior2nd': 'RankLabelEncoder',
    'MasVnrType': 'RankLabelEncoder',
    'ExterQual': 'custom',
    'ExterCond': 'custom',
    'Foundation': 'RankLabelEncoder',
    'BsmtQual': 'custom',
    'BsmtCond': 'custom',
    'BsmtExposure': 'custom',
    'BsmtFinType1': 'custom',
    'BsmtFinType2': 'custom',
    'Heating': 'RankLabelEncoder',
    'HeatingQC': 'custom',
    'CentralAir': 'custom',
    'Electrical': 'custom',
    'KitchenQual': 'custom',
    'Functional': 'custom',
    'FireplaceQu': 'custom',
    'GarageType': 'RankLabelEncoder',
    'GarageFinish': 'custom',
    'GarageQual': 'custom',
    'GarageCond': 'custom',
    'PavedDrive': 'custom',
    'PoolQC': 'custom',
    'Fence': 'custom',
    'MiscFeature': 'RankLabelEncoder',
    'SaleType': 'RankLabelEncoder',
    'SaleCondition': 'RankLabelEncoder',
}

# Hand-written ordinal codes for the features whose category order can be inferred
# from the categories themselves: {column: {category: integer code}}.
# Columns that list 'NA' treat it as a real category and give it code 0.
# NOTE(review): 'Utilities' gives 'AllPub' the lowest code, which looks like the
# opposite direction to the quality scales (worst level = 0) -- confirm this is intended.
custom_map = {
    'LotShape': {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3},
    'Utilities': {'AllPub': 0, 'NoSewr': 1, 'NoSeWa': 2, 'ELO': 3},
    'ExterQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'ExterCond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'BsmtQual': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtCond': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtExposure': {'NA': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'BsmtFinType1': {'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtFinType2': {'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'CentralAir': {'N': 0, 'Y': 1},
    'Electrical': {'FuseP': 0, 'FuseF': 1, 'Mix': 2, 'FuseA': 3, 'SBrkr': 4},
    'KitchenQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3,'Mod': 4, 'Min2': 5, 'Min1': 6,'Typ': 7},
    'FireplaceQu': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageFinish': {'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'GarageQual': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageCond': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'PoolQC': {'NA': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'Fence': {'NA': 0, 'MnWw': 1, 'MnPrv': 2, 'GdWo': 3, 'GdPrv': 4}
}

In [None]:
# Work on a copy: DataPreProcessor edits the frame it is given in place
train = df_train.copy()
preprocess = DataPreProcessor(train, 'SalePrice')

# threshold=1 keeps every column (a proportion can never exceed 1); the call still
# fills the 'NA'-as-category columns and prints the missing-value report
preprocess.drop_missing(threshold=1, verbose=True)
preprocess.impute()
preprocess.log_transform_target()

# --------------------------------- Plot Data Distributions -------------------------------- #
print('---------------------------------------\n\nINTERACTIVE PLOTS OF DATA DISTRIBUTIONS (pre-encoding)')
preprocess_df = pd.concat([preprocess.X, preprocess.y], axis=1)
custom_plot.interactive_distributions(preprocess_df, preprocess.y.name);
# ------------------------------------------------------------------------------------------ #

rle = custom_encoder.RankLabelEncoder()
for col,encoder in encoder_map.items():
    if encoder == 'RankLabelEncoder':
        preprocess.encode( rle, [preprocess.df[col], preprocess.df[preprocess.y.name]], col )
    else: # == 'custom' 
        # Assign the result back instead of calling replace(inplace=True) on a column
        # selection: the chained in-place form is deprecated and does nothing under
        # pandas Copy-on-Write. to_numeric makes the column numeric regardless of
        # whether replace() downcasts, and fails loudly on a category that is
        # missing from custom_map.
        preprocess.X[col] = pd.to_numeric( preprocess.X[col].replace(custom_map[col]) )

# preprocess.feature_engineering()
   
X, y  = preprocess.X, preprocess.y   
train = pd.concat([X, y], axis=1)

# --------------------------------- Plot Data Distributions -------------------------------- #
print('---------------------------------------\n\nINTERACTIVE PLOTS OF DATA DISTRIBUTIONS (post-encoding)')
preprocess_df = pd.concat([preprocess.X, preprocess.y], axis=1)
custom_plot.interactive_distributions(preprocess_df, preprocess.y.name);
# ------------------------------------------------------------------------------------------ #


In [None]:
# Heatmap over the encoded training frame (predictors plus the log-transformed target),
# with 'SalePrice' passed as the column of interest.
# NOTE(review): presumably a correlation heatmap -- confirm in custom_plot.
# The trailing ';' suppresses the echo of the call's return value.
custom_plot.interactive_heatmap(train, 'SalePrice');

###### Model Testing

In [None]:
# Base Models
# One seed shared by every stochastic estimator and by the train/validation split
rs = 42

# Untuned baselines, keyed by the name shown in the score tables
base_models = {}
base_models['Linear Regression'] = LinearRegression()
base_models['Decision Tree'] = DecisionTreeRegressor(random_state=rs)
base_models['Random Forest'] = RandomForestRegressor(random_state=rs)
base_models['Gradient Boost'] = GradientBoostingRegressor(random_state=rs)

In [None]:
# train/validation splits
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=rs)


def _log_target_errors(fitted_model, X_eval, y_eval):
    '''
    MAE and MSE, in sale-price units, of an already fitted log-target model:
    predictions and targets are mapped back with expm1 before scoring.
    '''
    preds  = np.expm1(fitted_model.predict(X_eval))
    actual = np.expm1(y_eval)
    return mean_absolute_error(actual, preds), mean_squared_error(actual, preds)


# --------------------------------------- TRAINING ERROR ----------------------------------------------------- #
# Each model is fitted exactly once here and reused for the validation scores below.
# The estimators are seeded, so refitting on the same split for every metric would
# only repeat the same work.
data = {}
for model_name, model in base_models.items():
    model.fit(X_train, y_train)

    # regular sale price predictions/score
    MAE, MSE = _log_target_errors(model, X_train, y_train)

    data[model_name] = [MAE, MSE]

    
print('BASELINE TRAINING ERRORS:')
print(pd.DataFrame.from_dict(data, orient='index', columns=['MAE', 'MSE']))
print()
# ------------------------------------------------------------------------------------------------------------ #

# --------------------------------------- VALIDATION ERROR --------------------------------------------------- #
data = {}
for model_name, model in base_models.items():
    # regular sale price predictions/score (model was fitted on X_train above)
    MAE, MSE = _log_target_errors(model, X_valid, y_valid)

    data[model_name] = [MAE, MSE]

    
print('BASELINE VALIDATION ERRORS:')
print(pd.DataFrame.from_dict(data, orient='index', columns=['MAE', 'MSE']))
# ------------------------------------------------------------------------------------------------------------ #


###### Predict on Test Data

In [None]:
# Process test data
# The test frame goes through the same steps as the training frame, using the
# statistics and encoders fitted on the training data held by `preprocess`.
X_test = df_test.copy()
preprocess.drop_missing(threshold=1, X=X_test, verbose=True)
preprocess.impute(X_test)

rle = custom_encoder.RankLabelEncoder()
for col,encoder in encoder_map.items():
    if encoder == 'RankLabelEncoder':
        preprocess.encode( rle, [preprocess.df[col], preprocess.df[preprocess.y.name]], col, X_test )
    else: # == 'custom' 
        # Assign the result back instead of calling replace(inplace=True) on a column
        # selection: the chained in-place form is deprecated and does nothing under
        # pandas Copy-on-Write. to_numeric makes the column numeric regardless of
        # whether replace() downcasts, and fails loudly on a category that is
        # missing from custom_map.
        X_test[col] = pd.to_numeric( X_test[col].replace(custom_map[col]) )

# preprocess.feature_engineering(X_test)

# Preview only: the first rows are enough to check the encoding
X_test.head()

In [None]:
# Feature Selection (corr >= corr_thresholds)
threshold = 0.0 # not seeing any improvement from holding out columns, if anything it seems worse
# features = corr[abs(corr.SalePrice) >= threshold].index.values

# Final model: gradient boosting with 250 trees, trained on the full training set.
# - A fresh estimator is built instead of calling set_params() on the entry in
#   base_models, so the baseline models are not altered by running this cell.
# - loss='ls' and max_features='auto' are no longer accepted by current
#   scikit-learn. Both were equivalent to this estimator's defaults (squared-error
#   loss, all features considered), so leaving them out keeps the same model.
model = GradientBoostingRegressor(random_state=rs, n_estimators=250)
model.fit(X, y)

# The model predicts log1p(SalePrice); map back to sale-price units
# preds = model.predict(X_test)
preds = np.expm1(model.predict(X_test))
preds

In [None]:
# Save test predictions to file
# Two-column table (Id, SalePrice): one row per test house, keyed by the test index
output = (
    pd.Series(preds, index=X_test.index, name='SalePrice')
    .rename_axis('Id')
    .reset_index()
)
output.to_csv('submission2.csv', index=False)

# SCORE -> 0.12721 (1422/5721, top 25%)
# no feature engineering or dropping
#
# SCORE -> 0.12811
# w/ some feature engineering (no 'ExterAggr')

In [None]:
# Importances of the final fitted model, keyed by feature name, highest first
feature_importances(model, X_test.columns)


In [None]:
# Regenerate the pinned requirements with pip-tools. `pip-compile` is a shell
# command; typed bare in a code cell it is not run as one. The call below is the
# plain-Python spelling of the `!pip-compile` cell shorthand.
# NOTE(review): environment management is usually better done outside the
# analysis notebook -- confirm this cell is still wanted.
get_ipython().system('pip-compile')