# PREVIOUS WORK

In [2]:
# Start Python Imports
import math, time, random, datetime
from math import sqrt

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Preprocessing
from scipy.linalg import svd
import sklearn
from sklearn.feature_selection import SelectKBest
from fancyimpute import IterativeImputer
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler

# Load data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

Using TensorFlow backend.


In [3]:
def fill_missing_or_nan_values(df):
    fill_with = 0
    
    # Get the most common element by using size(),
    # which returns the element and how common it is
    for column in df:
        
        # Check if the column is an object, float64 or int64
        is_it_float = (df[column].dtype == np.float64)
        is_it_int = (df[column].dtype == np.int64)
        
        # If it is an object,
        # find the most common element and fill missing and NaN values
        if(not is_it_float and not is_it_int):
            fill_with = df[column].mode().item()
                    
        # If it is either a float64 or int64,
        # then calculate the mean and fill missing and NaN values
        else:
            if is_it_float:
                fill_with = np.nanmean(df[column], dtype=np.float64)
            if is_it_int:
                fill_with = np.nanmean(df[column], dtype=np.int64)
        
        # Fill the values in our dataset
        df[column] = df[column].fillna(fill_with)
        fill_with = 0

def fill_ii(df):
    df_filled_ii = pd.DataFrame(IterativeImputer().fit_transform(df.as_matrix()))
    df_filled_ii.columns = df.columns
    df_filled_ii.index = df.index

    return df_filled_ii

def pca(df):
    pca = PCA(.95)
    df = pca.fit_transform(df)
    return df

# This function removes all observations that are more than
# three standard deviations away from the mean
def remove_outliers(df):
    '''
    numeric_features = train.select_dtypes(include=[np.number])
    print(len(numeric_features.columns))
    print(numeric_features.columns)
    fig, axes = plt.subplots(ncols=5, nrows=8, figsize=(16, 40))
    axes = np.ravel(axes)

    col_name = ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
           'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
           'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
           'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
           'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
           'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
           'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
           'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
    for i, c in zip(range(38), col_name):
        train.plot.scatter(ax=axes[i], x=c, y='SalePrice', sharey=True, colorbar=False, c='r')
    '''
    df = df.drop(df[(df['LotArea']>100000)].index)
    df = df.drop(df[(df['BsmtFinSF1']>4000)].index)
    df = df.drop(df[(df['TotalBsmtSF']>4000)].index)
    df = df.drop(df[(df['1stFlrSF']>4000)].index)
    df = df.drop(df[(df['GrLivArea']>4000)].index)
    
    return df
        
def data_engineering(train, test):
    # Concatenate all of data
    cc_data = pd.concat([train, test])
    cc_data = cc_data.drop(['Id', 'SalePrice','Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
    
    # Get the SalePrice as the natural logarithm
    train["SalePrice"] = np.log1p(train["SalePrice"])
    y = train['SalePrice']
    
    # Remove outliers from data
    #cc_data = remove_outliers(cc_data)
    
    # One-Hot encode all data
    cc_data = pd.get_dummies(cc_data, prefix_sep='_')
    
    # Impute all data, using IterativeImputer
    cc_data = fill_ii(cc_data)
    
    # Slice data, train.shape[0] is the observations
    # 1) from start to middle of observations
    # 2) from middle of observations to end
    X_train = cc_data[:train.shape[0]]
    X_test = cc_data[train.shape[0]:]
    
    return X_train,X_test,y

# X is dataframe, y is output, m is how many features you want selected
# returns array of highest scoring features
def feature_selection(X, y, m):
    # Data is standardized here, minus mean and divided by standard deviation
    # The correlation between each regressor and the target is computed
    # It is converted to an F score then to a p-value, which is returned
    f_regression = lambda X,y : sklearn.feature_selection.f_regression(X,y,center=False)

    # removes all but the  highest scoring features
    featureSelector = SelectKBest(score_func=f_regression,k=m)
    featureSelector.fit(X,y)
    high_score_arr = [X.columns[1+zero_based_index] for zero_based_index in list(featureSelector.get_support(indices=True))]
    
    return high_score_arr

# RANDOM FOREST WORK

In [188]:
def random_forest_validation(X_train,y_train,X_test,y_test,params_grid):
    gs = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid=params_grid, n_jobs=-1, scoring='neg_mean_squared_error'
    )
    model = gs.fit(X_train,y_train)
    pred = model.predict(X_test)
    score = -model.best_score_
    
    array_conversion = []
    for i in model.best_params_.values():
        array_conversion.append(i)
    
    return score,array_conversion

def nested_cv(df,y,outer,inner):
    CV = KFold(outer, shuffle=True)
    for (i, (train_index, test_index)) in enumerate(CV.split(df,y)):
        print('\nCrossvalidation outer fold: {0}/{1}'.format(i+1,outer))
        X_train_outer = df[train_index.shape[0]:]
        y_train_outer = y[train_index.shape[0]:]
        X_test_outer = df[test_index.shape[0]:]
        y_test_outer = y[test_index.shape[0]:]
        
        opt_params = np.empty(inner)
        print(opt_params)
        mse_validation = np.zeros(inner)
        params_grid={
                'max_depth': [3, None],
                'n_estimators': (10, 20), #30, 50, 100, 200, 400, 600, 800, 1000),
                'max_features': (2,4,6)
        }
        for (j, (train_index_inner, test_index_inner)) in enumerate(CV.split(X_train_outer,y_train_outer)):
            print('\nCrossvalidation inner fold: {0}/{1}'.format(j+1,inner))
            X_train_inner = X_train_outer[train_index_inner.shape[0]:]
            y_train_inner = y_train_outer[train_index_inner.shape[0]:]
            X_test_inner = X_train_outer[test_index_inner.shape[0]:]
            y_test_inner = y_train_outer[test_index_inner.shape[0]:]
            
            mse_validation[j], opt_params[j][0] = random_forest_validation(X_train_inner,y_train_inner,X_test_inner,y_test_inner,params_grid)
            
        print(mse_validation)
        print(opt_params)
        
nested_cv(df_train,y,5,5)


Crossvalidation outer fold: 1/5
[[7.03339762e-312]
 [1.27032661e+000]
 [7.02389722e-312]
 [7.02389722e-312]
 [1.49166815e-154]]

Crossvalidation inner fold: 1/5




[None, 6, 20]
2.5248751061650555e-05


ValueError: setting an array element with a sequence.

In [36]:
def random_forest_prediction(X_train,X_test,y_real):
    gs = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': [3, None],
            'n_estimators': (10, 30, 50, 100, 200, 400, 600, 800, 1000),
            'max_features': (2,4,6)
        }, cv=10, n_jobs=-1, scoring='neg_mean_squared_error'
    )
    model = gs.fit(X_train,y_real)
    pred = model.predict(X_test)
    score = sqrt(-model.best_score_)
    
    # return all predictions and mean of all cross validated scores
    return pred, score, model

df_train,df_test,y = data_engineering(train,test)
#selected_features = feature_selection(df_train, y, 50)

pred,score,model = random_forest_prediction(df_train, df_test, y)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [55]:
print(model.best_estimator_)
print(model.best_estimator_.n_estimators)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=600, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)
600


In [4]:
# To convert prediction of SalePrice into the actual value, we take exponential value
print(np.expm1(pred))
print(score)

[127251.54702308 154956.43728829 182879.14145627 ... 151510.37138696
 121843.09356488 215477.78201348]
0.15594761917692734
