# PREVIOUS WORK

In [1]:
# Start Python Imports
import math, time, random, datetime
from math import sqrt

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Preprocessing
from scipy.linalg import svd
import sklearn
from sklearn.feature_selection import SelectKBest
from fancyimpute import IterativeImputer
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV, KFold, ShuffleSplit, train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

Using TensorFlow backend.


In [2]:
train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [7]:
def fill_ii(df):
    df_filled_ii = pd.DataFrame(IterativeImputer().fit_transform(df.values))
    df_filled_ii.columns = df.columns
    df_filled_ii.index = df.index

    return df_filled_ii

# This function removes all observations that are more than
# three standard deviations away from the mean
def remove_outliers(df):
    '''
    numeric_features = train.select_dtypes(include=[np.number])
    print(len(numeric_features.columns))
    print(numeric_features.columns)
    fig, axes = plt.subplots(ncols=5, nrows=8, figsize=(16, 40))
    axes = np.ravel(axes)

    col_name = ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
           'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
           'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
           'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
           'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
           'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
           'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
           'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
    for i, c in zip(range(38), col_name):
        train.plot.scatter(ax=axes[i], x=c, y='SalePrice', sharey=True, colorbar=False, c='r')
    '''
    df = df.drop(df[(df['LotArea']>100000)].index)
    df = df.drop(df[(df['BsmtFinSF1']>4000)].index)
    df = df.drop(df[(df['TotalBsmtSF']>4000)].index)
    df = df.drop(df[(df['1stFlrSF']>4000)].index)
    df = df.drop(df[(df['GrLivArea']>4000)].index)
    
    return df
        
def data_engineering(train, test):
    # Make train and test equal have the same shape
    train = train.drop(train.index[0])
    
    # Concatenate all of data
    cc_data = pd.concat([train, test], sort=True)
    cc_data = cc_data.drop(['Id', 'SalePrice','Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
    
    # Get the SalePrice as the natural logarithm
    train["SalePrice"] = np.log1p(train["SalePrice"])
    y = train['SalePrice']
    
    # Remove outliers from data
    cc_data = remove_outliers(cc_data)
    
    # One-Hot encode all data
    cc_data = pd.get_dummies(cc_data, prefix_sep='_')
    
    # Impute all data, using IterativeImputer
    cc_data = fill_ii(cc_data)
    
    # Slice data, start to middle and middle to end
    X_train = cc_data[:train.shape[0]]
    X_test = cc_data[train.shape[0]:]
    
    return X_train,X_test,y

# X is dataframe, y is output, m is how many features you want selected
# returns array of highest scoring features
def feature_selection(X, y, m):
    # Data is standardized here, minus mean and divided by standard deviation
    # The correlation between each regressor and the target is computed
    # It is converted to an F score then to a p-value, which is returned
    f_regression = lambda X,y : sklearn.feature_selection.f_regression(X,y,center=False)

    # removes all but the  highest scoring features
    featureSelector = SelectKBest(score_func=f_regression,k=m)
    featureSelector.fit(X,y)
    high_score_arr = [X.columns[1+zero_based_index] for zero_based_index in list(featureSelector.get_support(indices=True))]
    
    return high_score_arr

df_train,df_test,y = data_engineering(train,test)

# RANDOM FOREST WORK

In [8]:
opt_params = []
dic_values_to_array = lambda dic: {key: [dic[key]]  for key in dic}
def random_forest_validation(X_train,y_train,X_test,params_grid,cv,validate = True, predict_train = False):
    gs = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid=params_grid, cv=cv,#cv=ShuffleSplit(test_size=0.10, n_splits =1, random_state=0),
        n_jobs=-1, scoring='neg_mean_squared_error'
    )
    model = gs.fit(X_train,y_train)
    
    if not predict_train:
        pred = model.predict(X_test)
    else:
        pred = model.predict(X_train)
    
    score = -model.best_score_
    
    if validate:
        return dic_values_to_array(model.best_params_)
    elif predict_train:
        return score
    else:
        return pred,score

def nested_cv(train_data,test_data,y,outer,inner):
    '''
    This function runs nested cross-validation, where the optimal parameters
    for a random forest algorithm is searched for in the inner loop and
    applied in the outer loop.
    
    train_data: your training data
    test_data: your testing data
    y: your output variable
    outer: how many k-folds we split the data in the outer loop
    inner: how many k-folds we split the data in the inner loop
    '''
    
    # Define Cross-Validation with k-outer folds
    CV = KFold(outer, shuffle=True)
    
    # Generalization Error array used for calculating
    generalization_error = np.zeros(outer)
    training_error = np.zeros(outer)
    
    # Split training data and output variable
    for (i, (train_index, test_index)) in enumerate(CV.split(train_data,y)):
        print('\n{0}/{1} <-- Current outer fold'.format(i+1,outer))
        
        # Split data into training, output variable and test data
        X_validation = train_data[:train_index.shape[0]]
        y_validation = y[:train_index.shape[0]]
        X_test = train_data[:test_index.shape[0]]
        
        # Use this if you don't have a test dataset
        #X_test_outer = train_data[:test_index.shape[0]]
        
        # Define parameters for optimization
        params_grid={
            'max_depth': [3, None],
            'n_estimators': (10, 20, 30, 50, 100, 200, 400, 600, 800, 1000)
        }
        
        # Use inner data to find optimal parameters for RF
        opt_params = random_forest_validation(X_validation,y_validation,X_test,params_grid,inner)
        
        # Run RF with optimal parameters
        cv=ShuffleSplit(test_size=0.10, n_splits =1, random_state=0)
        pred,generalization_error[i] = random_forest_validation(X_validation,y_validation,test_data,opt_params,cv,validate = False)
        
        print("Optimal parameters: {0}".format(opt_params))
        print("Mean Test Error: {0}".format(generalization_error[i]))
        
        training_error[i] = random_forest_validation(train_data,y,test_data,opt_params,cv,validate = False, predict_train = True)
        print("Mean Training error: {0}".format(training_error[i]))
        
    print("\nGeneralization Error as RMSLE: {0}".format(sqrt(np.mean(generalization_error))))
    print("Generalization Training Error as RMSLE: {0}".format(sqrt(np.mean(training_error))))
    
nested_cv(df_train,df_test,y,5,5)


1/5 <-- Current outer fold
Optimal parameters: {'max_depth': [None], 'n_estimators': [1000]}
Mean Test Error: 0.02952111107007361
Mean Training error: 0.02912009854388337

2/5 <-- Current outer fold
Optimal parameters: {'max_depth': [None], 'n_estimators': [600]}
Mean Test Error: 0.029267328640661
Mean Training error: 0.029963228896132493

3/5 <-- Current outer fold
Optimal parameters: {'max_depth': [None], 'n_estimators': [800]}
Mean Test Error: 0.029727113662829967
Mean Training error: 0.029673495837410597

4/5 <-- Current outer fold
Optimal parameters: {'max_depth': [None], 'n_estimators': [1000]}
Mean Test Error: 0.02949937464158821
Mean Training error: 0.029095922233418246

5/5 <-- Current outer fold
Optimal parameters: {'max_depth': [None], 'n_estimators': [50]}
Mean Test Error: 0.0302780873974049
Mean Training error: 0.02963958322819895

Generalization Error as RMSLE: 0.1722167328760813
Generalization Training Error as RMSLE: 0.17175117393429581


In [None]:
'''
def random_forest_prediction(X_train,X_test,y_real):
    gs = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': [3, None],
            'n_estimators': (10, 30, 50, 100, 200, 400, 600, 800, 1000),
            'max_features': (2,4,6)
        }, cv=10, n_jobs=-1, scoring='neg_mean_squared_error'
    )
    model = gs.fit(X_train,y_real)
    pred = model.predict(X_test)
    score = sqrt(-model.best_score_)
    
    # return all predictions and mean of all cross validated scores
    return pred, score, model

df_train,df_test,y = data_engineering(train,test)
#selected_features = feature_selection(df_train, y, 50)

#pred,score, model = random_forest_prediction(df_train, df_test, y)
'''