In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
import seaborn as sns

%matplotlib inline
pd.options.display.max_columns = None

In [2]:
import os

# Make the MinGW runtime directory visible on PATH before xgboost is imported.
# NOTE(review): this is a machine-specific install location; presumably it is
# only needed on the Windows machine this notebook was written on -- confirm.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-8.1.0-posix-seh-rt_v6-rev0\\mingw64\\bin'
if os.path.isdir(mingw_path):
    # os.pathsep is ';' on Windows, so this matches the previous behaviour
    # there while leaving PATH untouched on machines without this directory.
    os.environ['PATH'] = mingw_path + os.pathsep + os.environ.get('PATH', '')
import xgboost as xgb

In [3]:
# Function to get training and testing data

def get_data(train_data_path='data/train.csv', test_data_path='data/test.csv'):
    """Read the training and testing CSV files.

    Parameters
    ----------
    train_data_path : str
        Path of the training CSV.
    test_data_path : str
        Path of the testing CSV.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)`` exactly as parsed by ``pd.read_csv``.
    """
    sources = (
        ("Grabbing train data from '{}'...", train_data_path),
        ("Grabbing test data from '{}'...", test_data_path),
    )
    # Both paths are announced before either file is opened.
    for message, path in sources:
        print(message.format(path))
    train_data, test_data = (pd.read_csv(path) for _, path in sources)
    print("Data successfully loaded.")
    return train_data, test_data

# returns train & test dataframes

In [4]:
# Function to remove any columns with >50% null values

def remove_null_columns(df, threshold=0.5, return_removed_cols=False):
    """Drop every column whose fraction of null values exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is not modified; a new frame is returned.
    threshold : float
        Null fraction (0-1) above which a column is dropped. A column whose
        null fraction equals the threshold exactly is kept.
    return_removed_cols : bool
        When True, also return the labels of the dropped columns.

    Returns
    -------
    pandas.DataFrame, or (pandas.DataFrame, pandas.Index)
        The pruned frame, plus the dropped column labels (ordered from most
        to least null) when ``return_removed_cols`` is True.
    """
    print("Removing columns that are >{}% null...".format(round(threshold*100)))
    # The mean of the boolean null mask is the per-column null fraction (k / n).
    # Computing it as a single division avoids the float drift of summing
    # 1/n once per null row, which could push a column sitting exactly on the
    # threshold over it.
    null_fraction = df.isnull().mean()
    cols_to_remove = (
        null_fraction[null_fraction > threshold]
        .sort_values(ascending=False)
        .index
    )
    print("{} columns were removed.".format(len(cols_to_remove)))
    df_pruned = df.drop(columns=cols_to_remove)
    if return_removed_cols:
        return df_pruned, cols_to_remove
    return df_pruned
    
# returns dataframe (minus removed null columns)

In [5]:
# Function to return two lists of features:
# (a) list of numerical features
# (b) list of categorical features

def get_feature_lists(df):
    """Split the columns of ``df`` into numerical and categorical feature names.

    A column is numerical when its dtype is not ``object`` and it is not one
    of the number-coded columns listed below. Every other column is
    categorical. ``Id`` and ``SalePrice`` appear in neither list.

    Returns
    -------
    tuple of list
        ``(true_numeric_cols, categorical_cols)``, each in column order.
    """
    print("Obtaining feature column names...")
    # Stored as numbers but handled as categories by the rest of the notebook.
    number_coded = ('MSSubClass', 'MoSold')
    # Row identifier and prediction target: never model inputs.
    not_features = ('Id', 'SalePrice')

    non_object_mask = df.dtypes != 'object'
    true_numeric_cols = [
        name for name in df.columns[non_object_mask]
        if name not in number_coded and name not in not_features
    ]

    numeric_lookup = set(true_numeric_cols)
    categorical_cols = [
        name for name in df.columns
        if name not in not_features and name not in numeric_lookup
    ]

    summary = "{} numeric features obtained; {} categorical features obtained"
    print(summary.format(len(true_numeric_cols), len(categorical_cols)))

    return true_numeric_cols, categorical_cols

# returns lists of num & cat columns

In [6]:
# Function to impute missing values and encode features. Returns two dataframes:
# (a) dataframe of imputed numerical features (fill NaN values with mean)
# (b) dataframe of categorical features (fill NaN values with 'missing',
#     regroup the categories, then generate dummy values)

def impute_features(df_numerical, df_categorical):
    """Fill missing values and one-hot encode the categorical features.

    Numerical columns have NaN replaced by the column mean. Categorical
    columns have NaN replaced by the literal ``'missing'``, are regrouped by
    ``reclassify_categories`` and are then expanded with ``pd.get_dummies``.

    Neither input frame is modified.

    Parameters
    ----------
    df_numerical : pandas.DataFrame
        Numerical feature columns.
    df_categorical : pandas.DataFrame
        Categorical feature columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_numerical_imputed, df_categorical_imputed)``; the first keeps
        the columns and index of ``df_numerical``.
    """
    print("Imputing data...")
    # NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn
    # 0.22 (sklearn.impute.SimpleImputer replaces it) -- confirm the pinned
    # scikit-learn version before upgrading.
    imp_numerical = Imputer(strategy='mean')
    df_numerical_imputed = pd.DataFrame(imp_numerical.fit_transform(df_numerical),
                                        columns=df_numerical.columns,
                                        index=df_numerical.index)

    # fillna without inplace yields a new frame, so the in-place regrouping
    # below edits that private frame rather than the caller's data.
    df_categorical_filled = df_categorical.fillna(value='missing')
    reclassify_categories(df_categorical_filled)
    df_categorical_imputed = pd.get_dummies(df_categorical_filled)

    print("Imputation complete.\nThere are now {} numerical features and {} categorical features.".format(len(df_numerical_imputed.columns),
                                                                                                             len(df_categorical_imputed.columns)))

    return df_numerical_imputed, df_categorical_imputed

# returns imputed num & cat dataframes

In [7]:
# Function to filter for only features present in both training and testing sets

def prune_categorical_features(df_cat_train, df_cat_test):
    """Return the column labels present in both frames.

    The result is a list that follows the column order of ``df_cat_train``.
    """
    train_columns = df_cat_train.columns
    present_in_test = train_columns.isin(df_cat_test.columns)
    return list(train_columns[present_in_test])

In [8]:
# Feature engineering functions:
# change a year feature into an age feature

def get_year_features():
    """Return a fresh list with the names of the year-valued columns."""
    year_columns = (
        'YrSold',
        'GarageYrBlt',
        'YearRemodAdd',
        'YearBuilt',
    )
    return list(year_columns)

def calculate_age(year, reference_year=2018):
    """Return the number of years between ``year`` and ``reference_year``.

    Parameters
    ----------
    year : number or array-like
        A year, or anything supporting subtraction from an int (for example
        the pandas Series that ``DataFrame.apply`` passes in).
    reference_year : int
        Year that ages are measured against. Defaults to 2018, the value
        this notebook previously hard-coded.

    Returns
    -------
    Same kind as ``year``
        ``reference_year - year``.
    """
    age = reference_year - year
    return age

def year_to_age(df):
    """Overwrite the year columns of ``df`` with ages.

    The columns named by ``get_year_features`` are passed through
    ``calculate_age`` and written back into ``df``. The same frame is
    returned.
    """
    print("Feature engineering: transforming years to time since...")
    year_columns = get_year_features()
    ages = df[year_columns].apply(calculate_age)
    df[year_columns] = ages
    return df

In [9]:
# Feature engineering functions:
# perform log transform on relevant numerical features

def get_log_features():
    """Return a fresh list with the names of the columns to log-transform."""
    log_columns = (
        'LotFrontage',
        'LotArea',
        'MasVnrArea',
        'BsmtFinSF1',
        'BsmtFinSF2',
        'BsmtUnfSF',
        'TotalBsmtSF',
        '1stFlrSF',
        '2ndFlrSF',
        'LowQualFinSF',
        'GrLivArea',
        'GarageArea',
        'WoodDeckSF',
        'OpenPorchSF',
        'EnclosedPorch',
        '3SsnPorch',
        'ScreenPorch',
        'PoolArea',
        'MiscVal',
    )
    return list(log_columns)

def log_transform(x):
    """Return ``log(1 + x)`` for a scalar, array, Series or DataFrame.

    ``np.log1p`` is used instead of ``np.log(x + 1)`` because it stays
    accurate when ``x`` is close to zero, where ``x + 1`` rounds to exactly
    1.0 and the plain form returns 0.
    """
    return np.log1p(x)
    
def log_xform_features(df):
    """Overwrite the log-feature columns of ``df`` with their transform.

    The columns named by ``get_log_features`` are passed through
    ``log_transform`` and written back into ``df``. The same frame is
    returned.
    """
    print("Feature engineering: log transform of relevant numerical features...")
    log_columns = get_log_features()
    transformed = df[log_columns].apply(log_transform)
    df[log_columns] = transformed
    return df

In [10]:
# Feature engineering function:
# normalize numerical data to scale between 0 and 1

def normalize_numerical_features(df):
    """Return a copy of ``df`` rescaled column by column with ``MinMaxScaler``.

    The scaler is fitted on ``df`` itself. The returned frame keeps the
    column labels and index of ``df``.
    """
    print("Feature engineering: normalizing numerical data to scale between 0 and 1...")
    scaled_values = MinMaxScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

In [11]:
# Feature engineering function:
# create meta numerical features

def build_meta_features(df):
    """Append composite ("meta") columns built from existing numerical columns.

    Seven columns are added to ``df`` in place, in this order:
    ``overallScore``, ``BsmtIncompleteRatio``, ``HouseSFScore``,
    ``BathScore``, ``GarageScore``, ``PorchScore``, ``ageScore``. The same
    frame is returned.
    """
    quality_times_condition = df['OverallQual'] * df['OverallCond']
    df['overallScore'] = quality_times_condition + 1

    # The +1 in the denominator keeps the ratio finite when nothing is finished.
    finished_basement = df['BsmtFinSF1'] + df['BsmtFinSF2']
    df['BsmtIncompleteRatio'] = df['BsmtUnfSF'] / (finished_basement + 1)

    above_ground = df['1stFlrSF'] + df['2ndFlrSF']
    df['HouseSFScore'] = (df['TotalBsmtSF'] + 1) * (above_ground + 1)

    bath_total = df['BsmtFullBath'] + df['BsmtHalfBath'] + df['FullBath'] + df['HalfBath']
    df['BathScore'] = bath_total ** 2

    garage_product = df['GarageCars'] * df['GarageArea']
    df['GarageScore'] = garage_product + 1

    # Product of (area + 1) over the four porch columns, multiplied left to right.
    porch_product = df['OpenPorchSF'] + 1
    for porch_column in ('EnclosedPorch', '3SsnPorch', 'ScreenPorch'):
        porch_product = porch_product * (df[porch_column] + 1)
    df['PorchScore'] = porch_product

    year_product = df['YearBuilt'] * df['YearRemodAdd']
    df['ageScore'] = year_product + 1

    # Disabled experiments, kept for reference:
    #     df['frontageRatio'] = df['LotFrontage'] / (df['LotArea'].apply(lambda x: np.sqrt(x)) + 1)
    #     df['liveAreaRatio'] = (df['GrLivArea'] + 1) / (df['LotArea'] + 1)
    return df

In [12]:
# Function to call all feature engineering functions
def feature_engineering(df):
    """Run the feature-engineering steps on ``df`` and return the result.

    Steps, in order: ``year_to_age``, ``build_meta_features``,
    ``log_xform_features``. Each step receives the frame returned by the
    previous one.
    """
    print("Starting feature engineering...")
    steps = (year_to_age, build_meta_features, log_xform_features)
    for step in steps:
        df = step(df)
    # Disabled step, kept for reference:
    #     df = normalize_numerical_features(df)
    print("Feature engineering complete.")
    return df

In [13]:
# Helper for categorical feature engineering: replace one value with another, in place

def rename_value(data, current, new):
    """Replace every occurrence of ``current`` with ``new`` inside ``data``.

    ``data`` (a Series or DataFrame) is edited in place and the same object
    is returned.

    NOTE(review): when ``data`` is a column selected from a DataFrame, whether
    the edit reaches the parent frame depends on pandas' copy semantics; it
    does not under Copy-on-Write -- confirm against the pandas version in use.
    """
    data.replace(current, new, inplace=True)
    return data

In [14]:
# Function for mapping categorical feature engineering

# Quality-style groupings shared by several columns.
_QUALITY_GROUPS = {'High': ['Ex', 'Gd'], 'Low': ['Fa', 'Po']}
_QUALITY_GROUPS_WITH_NA = {'High': ['Ex', 'Gd'], 'Low': ['Fa', 'Po', 'NA']}
_EXTERIOR_GROUPS = {
    'Low': ['AsbShng', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'Plywood'],
    'Medium': ['AsphShn', 'BrkComm', 'BrkFace', 'MetalSd', 'PreCast', 'Stucco', 'VinylSd'],
    'High': ['Stone', 'Wd Sdng', 'WdShing'],
}

# Regrouping table: {column: {new label: [raw codes folded into that label]}}.
# Raw codes that are not listed are left unchanged.
_CATEGORY_GROUPS = {
    'MSSubClass': {
        '1story': [20, 30, 40, 120],
        '1halfstory': [45, 50, 150],
        '2story': [60, 70, 160, 190],
        'splitOrDuplex': [80, 85, 90, 180],
        '2halfstory': [75],
    },
    'MSZoning': {
        'Residential': ['RH', 'RL', 'RP', 'RM', 'FV'],
        'Other': ['I', 'C', 'A'],
    },
    'LotShape': {'Irregular': ['IR1', 'IR2', 'IR3']},
    'LotConfig': {'Frontage': ['FR2', 'FR3']},
    'LandSlope': {'Sloped': ['Mod', 'Sev']},
    'Condition1': {
        'Railroad': ['RRAn', 'RRNn', 'RRNe', 'RRAe'],
        'Adjacent': ['Artery', 'Feedr'],
        'Positive': ['PosA', 'PosN'],
    },
    'BldgType': {
        'Half': ['2FmCon', 'Duplx'],
        # NOTE(review): this list used to repeat 'TwnhsE', so the second
        # townhouse code was never grouped. 'TwnhsI' / 'Twnhs' are presumed
        # to be that code -- confirm against the values present in the CSV.
        'Townhouse': ['TwnhsE', 'TwnhsI', 'Twnhs'],
    },
    'HouseStyle': {
        'twoStory': ['1.5Fin', '2Story', '2.5Fin', '2.5Unf'],
        'oneStory': ['1Story', '1.5Unf'],
        'Split': ['SFoyer', 'SLvl'],
    },
    'RoofStyle': {'Other': ['Flat', 'Gambrel', 'Mansard', 'Shed']},
    'Exterior1st': _EXTERIOR_GROUPS,
    'Exterior2nd': _EXTERIOR_GROUPS,
    'ExterCond': _QUALITY_GROUPS,
    'Foundation': {
        'High': ['BrkTil', 'Stone'],
        'Medium': ['Slab', 'Wood'],
        'Low': ['CBlock', 'PConc'],
    },
    'BsmtQual': _QUALITY_GROUPS_WITH_NA,
    'BsmtCond': _QUALITY_GROUPS_WITH_NA,
    'BsmtExposure': {'Low': ['Mn', 'No', 'NA']},
    'BsmtFinType2': {
        'High': ['GLQ', 'ALQ'],
        'Low': ['BLQ', 'Rec', 'LwQ', 'NA'],
    },
    'HeatingQC': _QUALITY_GROUPS_WITH_NA,
    'Electrical': {
        'Standard': ['SBrkr', 'FuseA'],
        'NonStandard': ['FuseF', 'FuseP', 'Mix'],
    },
    'KitchenQual': _QUALITY_GROUPS_WITH_NA,
    'Functional': {'NonTyp': ['Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']},
    'FireplaceQu': _QUALITY_GROUPS_WITH_NA,
    'GarageType': {
        'Attached': ['Attchd', 'Basment', 'BuiltIn', '2Types'],
        'Detached': ['CarPort', 'Detchd', 'NA'],
    },
    'GarageFinish': {'Unfinished': ['Unf', 'RFn', 'NA']},
    'GarageQual': _QUALITY_GROUPS_WITH_NA,
    'PavedDrive': {'Unpaved': ['P', 'N']},
    # Disabled groupings, kept for reference:
    # 'Fence': {
    #     'High': ['GdPrv', 'GdWo'],
    #     'Low': ['MnPrv', 'MnWw', 'NA'],
    # },
    # 'MiscFeature': {'Yes': ['Elev', 'Gar2', 'Othr', 'Shed', 'TenC', 'NA']},
    'MoSold': {
        'Off': [1, 2, 3, 10, 11, 12],
        'On': [4, 5, 6, 7, 8, 9],
    },
    'SaleType': {
        'Unconventional': ['CWD', 'VWD', 'COD', 'Con', 'ConLw', 'ConLI', 'ConLD', 'Oth'],
    },
    'SaleCondition': {
        'NormalSale': ['Normal', 'Alloca'],
        'NonSale': ['Abnormal', 'AdjLand'],
        'LowerSale': ['Family', 'Partial'],
    },
    # Disabled grouping, kept for reference:
    # 'Neighborhood': {
    #     'low': ['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'CollgCr', 'IDOTRR', 'MeadowV', 'Mitchel',
    #             'NAmes', 'NPkVill', 'OldTown', 'Sawyer', 'SawyerW'],
    #     'mid': ['Crawfor', 'Edwards', 'Gilbert', 'NWAmes', 'NoRidge', 'SWISU', 'Somerst', 'StoneBr',
    #             'Timber', 'Veenker'],
    #     'high': ['ClearCr', 'NridgHt'],
    # },
}


def reclassify_categories(df):
    """Regroup raw category codes into coarser labels, editing ``df`` in place.

    For every column listed in ``_CATEGORY_GROUPS`` that is present in
    ``df``, each listed raw code is replaced by its group label and the
    column is assigned back to ``df``. Codes that are not listed, and columns
    that are not in the table, are left untouched. Columns in the table that
    ``df`` lacks are skipped rather than raising.

    Each replacement is limited to its own column, so a code such as 75
    (MSSubClass) or 'TwnhsE' (BldgType) can no longer rewrite matching values
    in unrelated columns.

    NOTE(review): several groups list the code 'NA', but ``impute_features``
    fills nulls with 'missing' before calling this function -- confirm
    whether 'missing' should be grouped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Categorical feature frame; modified in place.

    Returns
    -------
    None
    """
    for column, groups in _CATEGORY_GROUPS.items():
        if column not in df.columns:
            continue
        code_to_label = {
            code: label
            for label, codes in groups.items()
            for code in codes
        }
        # Assign the replaced column back explicitly instead of editing a
        # selected column in place; the latter depends on pandas propagating
        # the change to the parent frame.
        df[column] = df[column].replace(code_to_label)
    


In [15]:
# Function to train a random forest model

def random_forest_model(df_processed, target):
    """Grid-search a random forest regressor and return the best estimator.

    The data is split 80/20 with a fixed seed. ``GridSearchCV`` is fitted on
    the larger part, and the best estimator's ``score`` on the held-out part
    is printed.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Feature matrix.
    target : array-like
        Values to predict, aligned with ``df_processed``.

    Returns
    -------
    The ``best_estimator_`` found by the grid search.
    """
    print("Building random forest model...")
    split = train_test_split(df_processed, target, test_size=0.2, random_state=23)
    features_fit, features_holdout, target_fit, target_holdout = split

    # Only n_estimators is actually searched; the other entries are fixed.
    search_grid = {
        'max_depth': [12],
        'min_samples_leaf': [2],
        'min_samples_split': [2],
        'n_estimators': [150, 300, 500],
        'bootstrap': [True],
    }
    search = GridSearchCV(RandomForestRegressor(n_jobs=4, random_state=42), search_grid)
    search.fit(features_fit, target_fit)

    best_forest = search.best_estimator_
    holdout_score = best_forest.score(features_holdout, target_holdout)
    print("The model's R2 score is: {}".format(holdout_score))
    return best_forest

# returns rf model object

In [16]:
# Function to train an XGBoost model

def xgboost_model(df_processed, target):
    """Grid-search an XGBoost regressor and return the best estimator.

    The data is split 80/20 with a fixed seed. ``GridSearchCV`` is fitted on
    the larger part, and ``r2_score`` of the best estimator's predictions on
    the held-out part is printed.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Feature matrix.
    target : array-like
        Values to predict, aligned with ``df_processed``.

    Returns
    -------
    The ``best_estimator_`` found by the grid search.
    """
    print("Building xgboost model...")
    split = train_test_split(df_processed, target, test_size=0.2, random_state=23)
    features_fit, features_holdout, target_fit, target_holdout = split

    search_grid = {
        'learning_rate': [0.1, 0.3],
        'gamma': [0, 100],
        'max_depth': [3, 5, 7],
        'n_estimators': [300, 500],
    }
    search = GridSearchCV(xgb.XGBRegressor(n_jobs=4, random_state=42), search_grid)
    search.fit(features_fit, target_fit)

    best_booster = search.best_estimator_
    holdout_predictions = best_booster.predict(features_holdout)
    holdout_r2 = r2_score(target_holdout, holdout_predictions)

    print("The model's R2 score is: {}".format(holdout_r2))
    return best_booster

In [17]:
# Function to obtain feature importances from a random forest model

def get_rf_feature_importances(rf_model, df):
    """Return the model's feature importances, largest first.

    Parameters
    ----------
    rf_model : object
        Fitted model exposing ``feature_importances_``.
    df : pandas.DataFrame
        Frame whose columns label the importances; it must have one column
        per importance value, in the same order.

    Returns
    -------
    pandas.Series
        Importances indexed by column label, sorted in descending order.
    """
    print("Gathering feature importances...")
    importances = pd.Series(data=rf_model.feature_importances_, index=df.columns)
    importances = importances.sort_values(ascending=False)
    print("Feature importances obtained.")
    return importances

# returns series of feature importances

In [18]:
def combine_dataframes(df1, df2):
    """Place ``df2``'s columns to the right of ``df1``'s, aligned on the index."""
    return pd.concat((df1, df2), axis='columns', sort=False)

In [19]:
def generate_predictions(df_test, model):
    """Impute and encode ``df_test``, then return ``model``'s predictions for it.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Raw test data.
    model : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    The value returned by ``model.predict``.

    NOTE(review): the columns built here are not pruned to the training
    columns and do not pass through ``feature_engineering`` -- confirm that
    ``model`` was trained on the same column layout.
    """
    print("Generating predictions using the trained model...")
    # get_feature_lists is the helper this notebook defines; the name used
    # here before (get_numerical_categorical_lists) is not defined anywhere
    # in the notebook and raised NameError on a fresh kernel.
    numerical_features, categorical_features = get_feature_lists(df_test)
    df_num_imputed, df_cat_imputed = impute_features(df_test[numerical_features].copy(),
                                                     df_test[categorical_features].copy())
    df_combined_imputed = combine_dataframes(df_num_imputed, df_cat_imputed)
    prediction = model.predict(df_combined_imputed)
    print("Predictions complete.")
    return prediction

# returns a prediction array

In [20]:
# Function to output submission file

def create_submission(df_test, prediction, output_path='submission_v1.csv'):
    """Write the predictions to a CSV submission file.

    The file has one row per prediction, indexed by ``df_test.Id``, with a
    single ``SalePrice`` column.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Test data; its ``Id`` column becomes the index of the output.
    prediction : array-like
        Predicted values, one per row of ``df_test``.
    output_path : str
        Destination file. Defaults to 'submission_v1.csv', the name this
        notebook previously hard-coded.
    """
    print("Generating submission file...")
    df_out = pd.DataFrame(data=prediction, columns=['SalePrice'], index=df_test.Id)
    df_out.to_csv(output_path)
    print("Submission file is ready.")

In [21]:
### Main Code Below ###

In [22]:
# def run_model(model):
    
#     train_data, test_data = get_data()
#     target = train_data['SalePrice']
#     train_data = remove_null_columns(train_data)
#     numerical_features, categorical_features = get_feature_lists(train_data)
    
#     df_num_train, df_cat_train = impute_features(train_data[numerical_features].copy(), train_data[categorical_features].copy())
#     df_num_test, df_cat_test = impute_features(test_data[numerical_features].copy(), test_data[categorical_features].copy())
    
#     common_categories = prune_categorical_features(df_cat_train, df_cat_test)
    
#     df_train = combine_dataframes(df_num_train, df_cat_train[common_categories])
#     df_test = combine_dataframes(df_num_test, df_cat_test[common_categories])
    
#     df_train = feature_engineering(df_train.copy())
#     df_test = feature_engineering(df_test.copy())
    
#     predictions = model.predict(df_test)
    
#     create_submission(test_data, predictions)
    
#     return model, df_train

In [2]:
# model, df_train = run_model()

In [3]:
# model

In [4]:
# feature_importances = get_rf_feature_importances(df=df_train, rf_model=model)

In [5]:
# feature_importances