# PREVIOUS WORK

In [53]:
# Start Python Imports
import math, time, random, datetime
from math import sqrt

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Preprocessing
from scipy.linalg import svd
import sklearn
from sklearn.feature_selection import SelectKBest
from fancyimpute import IterativeImputer
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV, KFold, ShuffleSplit
from sklearn.preprocessing import MinMaxScaler

# Load data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [54]:
def fill_missing_or_nan_values(df):
    fill_with = 0
    
    # Get the most common element by using size(),
    # which returns the element and how common it is
    for column in df:
        
        # Check if the column is an object, float64 or int64
        is_it_float = (df[column].dtype == np.float64)
        is_it_int = (df[column].dtype == np.int64)
        
        # If it is an object,
        # find the most common element and fill missing and NaN values
        if(not is_it_float and not is_it_int):
            fill_with = df[column].mode().item()
                    
        # If it is either a float64 or int64,
        # then calculate the mean and fill missing and NaN values
        else:
            if is_it_float:
                fill_with = np.nanmean(df[column], dtype=np.float64)
            if is_it_int:
                fill_with = np.nanmean(df[column], dtype=np.int64)
        
        # Fill the values in our dataset
        df[column] = df[column].fillna(fill_with)
        fill_with = 0

def fill_ii(df):
    df_filled_ii = pd.DataFrame(IterativeImputer().fit_transform(df.values))
    df_filled_ii.columns = df.columns
    df_filled_ii.index = df.index

    return df_filled_ii

def pca(df):
    pca = PCA(.95)
    df = pca.fit_transform(df)
    return df

# This function removes all observations that are more than
# three standard deviations away from the mean
def remove_outliers(df):
    '''
    numeric_features = train.select_dtypes(include=[np.number])
    print(len(numeric_features.columns))
    print(numeric_features.columns)
    fig, axes = plt.subplots(ncols=5, nrows=8, figsize=(16, 40))
    axes = np.ravel(axes)

    col_name = ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
           'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
           'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
           'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
           'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
           'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
           'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
           'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
    for i, c in zip(range(38), col_name):
        train.plot.scatter(ax=axes[i], x=c, y='SalePrice', sharey=True, colorbar=False, c='r')
    '''
    df = df.drop(df[(df['LotArea']>100000)].index)
    df = df.drop(df[(df['BsmtFinSF1']>4000)].index)
    df = df.drop(df[(df['TotalBsmtSF']>4000)].index)
    df = df.drop(df[(df['1stFlrSF']>4000)].index)
    df = df.drop(df[(df['GrLivArea']>4000)].index)
    
    return df
        
def data_engineering(train, test):
    # Make train and test equal have the same shape
    train = train.drop(train.index[0])
    
    # Concatenate all of data
    cc_data = pd.concat([train, test], sort=False)
    cc_data = cc_data.drop(['Id', 'SalePrice','Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
    
    # Get the SalePrice as the natural logarithm
    train["SalePrice"] = np.log1p(train["SalePrice"])
    y = train['SalePrice']
    
    # Remove outliers from data
    #cc_data = remove_outliers(cc_data)
    
    # One-Hot encode all data
    cc_data = pd.get_dummies(cc_data, prefix_sep='_')
    
    # Impute all data, using IterativeImputer
    cc_data = fill_ii(cc_data)
    #return cc_data,y
    # Slice data, train.shape[0] is the observations
    # 1) from start to middle of observations
    # 2) from middle of observations to end
    X_train = cc_data[:train.shape[0]]
    X_test = cc_data[train.shape[0]:]
    
    #X_train = X_train.drop(X_train.index[0])
    #y = y.drop(y.index[0])
    
    return X_train,X_test,y

# X is dataframe, y is output, m is how many features you want selected
# returns array of highest scoring features
def feature_selection(X, y, m):
    # Data is standardized here, minus mean and divided by standard deviation
    # The correlation between each regressor and the target is computed
    # It is converted to an F score then to a p-value, which is returned
    f_regression = lambda X,y : sklearn.feature_selection.f_regression(X,y,center=False)

    # removes all but the  highest scoring features
    featureSelector = SelectKBest(score_func=f_regression,k=m)
    featureSelector.fit(X,y)
    high_score_arr = [X.columns[1+zero_based_index] for zero_based_index in list(featureSelector.get_support(indices=True))]
    
    return high_score_arr

df_train,df_test,y = data_engineering(train,test)

# RANDOM FOREST WORK

In [65]:
opt_params = []
mse_validation = []
dic_values_to_array = lambda dic: {key: [dic[key]]  for key in dic}
def random_forest_validation(X_train,y_train,X_test,params_grid,validate = True):
    gs = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid=params_grid, cv=ShuffleSplit(test_size=0.10, n_splits =1, random_state=0),
        n_jobs=-1, scoring='neg_mean_squared_error'
    )
    model = gs.fit(X_train,y_train)
    pred = model.predict(X_test)
    score = -model.best_score_
    
    if validate:
        opt_params.append(dic_values_to_array(model.best_params_))
        mse_validation.append(score)
        return mse_validation,opt_params
    else:
        return pred,score

def nested_cv(train_data,test_data,y,outer,inner):
    '''
    This function runs nested cross-validation, where the optimal parameters
    for a random forest algorithm is searched for in the inner loop and
    applied in the outer loop.
    
    train_data: your training data
    test_data: your testing data
    y: your output variable
    outer: how many k-folds we split the data in the outer loop
    inner: how many k-folds we split the data in the inner loop
    '''
    CV = KFold(outer, shuffle=True)
    generalization_error = np.zeros(outer)
    #
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    #
    for (i, (train_index, test_index)) in enumerate(CV.split(train_data,test_data,y)):
        print('\nCrossvalidation outer fold: {0}/{1}'.format(i+1,outer))
        X_train_outer = train_data[train_index.shape[0]:]
        y_train_outer = y[train_index.shape[0]:]
        X_test_outer = test_data[test_index.shape[0]:]
        y_test_outer = y[test_index.shape[0]:]
        print("X_train_outer",X_train_outer.shape[0])
        print("y_train_outer",y_train_outer.shape[0])
        print("X_test_outer",X_test_outer.shape[0])
        print("y_test_outer",y_test_outer.shape[0])
        
        params_grid={
                'max_depth': [3, None],
                'n_estimators': (10, 20),# 30, 50, 100, 200, 400, 600, 800, 1000),
                'max_features': (5,10,15,20)
        }
        
        opt_params = []
        mse_validation = []
        for (j, (train_index_inner, test_index_inner)) in enumerate(CV.split(X_train_outer,y_train_outer)):
            X_train_inner = X_train_outer[train_index_inner.shape[0]:]
            y_train_inner = y_train_outer[train_index_inner.shape[0]:]
            X_test_inner = X_train_outer[test_index_inner.shape[0]:]
            y_test_inner = y_train_outer[test_index_inner.shape[0]:]
            mse_validation,opt_params = random_forest_validation(X_train_inner,y_train_inner,X_test_inner,params_grid)
        
        lowest_mse = 999999999
        for idx,mse in enumerate(mse_validation):
            if mse < lowest_mse:
                lowest_mse = mse
                params_grid = opt_params[idx]
        pred, score = random_forest_validation(X_train_outer,y_train_outer,X_test_outer,params_grid, validate=False)
        generalization_error[i] = score
        print("Optimal parameters for algorithm:\n{0}".format(params_grid))
    print(generalization_error)
    print("Generalization Error with RMSLE as scoring: {0}".format(sqrt(np.mean(generalization_error))))
nested_cv(df_train,df_test,y,10,10)


Crossvalidation outer fold: 1/10
X_train_outer 146
y_train_outer 146
X_test_outer 1313
y_test_outer 1313
Optimal parameters for algorithm:
{'max_depth': [3], 'max_features': [5], 'n_estimators': [20]}

Crossvalidation outer fold: 2/10
X_train_outer 146
y_train_outer 146
X_test_outer 1313
y_test_outer 1313
Optimal parameters for algorithm:
{'max_depth': [3], 'max_features': [5], 'n_estimators': [20]}

Crossvalidation outer fold: 3/10
X_train_outer 146
y_train_outer 146
X_test_outer 1313
y_test_outer 1313
Optimal parameters for algorithm:
{'max_depth': [3], 'max_features': [5], 'n_estimators': [20]}

Crossvalidation outer fold: 4/10
X_train_outer 146
y_train_outer 146
X_test_outer 1313
y_test_outer 1313
Optimal parameters for algorithm:
{'max_depth': [3], 'max_features': [5], 'n_estimators': [20]}

Crossvalidation outer fold: 5/10
X_train_outer 146
y_train_outer 146
X_test_outer 1313
y_test_outer 1313
Optimal parameters for algorithm:
{'max_depth': [3], 'max_features': [5], 'n_estimator

In [None]:
def random_forest_prediction(X_train,X_test,y_real):
    gs = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': [3, None],
            'n_estimators': (10, 30, 50, 100, 200, 400, 600, 800, 1000),
            'max_features': (2,4,6)
        }, cv=10, n_jobs=-1, scoring='neg_mean_squared_error'
    )
    model = gs.fit(X_train,y_real)
    pred = model.predict(X_test)
    score = sqrt(-model.best_score_)
    
    # return all predictions and mean of all cross validated scores
    return pred, score, model

df_train,df_test,y = data_engineering(train,test)
#selected_features = feature_selection(df_train, y, 50)

#pred,score, model = random_forest_prediction(df_train, df_test, y)

In [5]:
print(model.best_estimator_)
print(model.best_estimator_.n_estimators)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)
200


In [6]:
# To convert prediction of SalePrice into the actual value, we take exponential value
print(np.expm1(pred))
print(score)

[11.76359381 11.96945027 12.1351863  ... 11.96083966 11.71628178
 12.28194491]
0.01200205301101093
