In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [37]:
# load data
Data = pd.read_csv('train.csv')
train_outputs = Data['SalePrice']
Data = Data.drop(['SalePrice'], axis=1)

def clean_data(Data):
    # For some features, 'NA' is an acceptable value, so change the pd.NA 
    # for those to 'NA' where necessary
    # list of columns that are supposed to have NA
    acceptable_na = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
                     'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 
                     'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']

    for feature in acceptable_na:
        Data[feature].fillna('NA', inplace=True)

    remove_columns = ['Id', 'Utilities', 'Condition1', 'Condition2', 'PoolArea', 'PoolQC', 'MiscFeature']
    Data = Data.drop(remove_columns, axis=1)

    bsmt_cols = [col for col in Data if col.startswith('Bsmt')]
    bsmt_cols.append('TotalBsmtSF')
    bsmt_data = Data[bsmt_cols]

    cats1 = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
    encoder1 = preprocessing.OrdinalEncoder(categories=[cats1])

    bsmt_data['BsmtQual'] = encoder1.fit_transform(bsmt_data['BsmtQual'].to_numpy().reshape(-1,1))

    Data = Data.drop(bsmt_cols, axis=1)
    Data['BsmtQual'] = bsmt_data['BsmtQual']
    Data['TotalBsmtSF'] = bsmt_data['TotalBsmtSF']

    gar_cols = [col for col in Data if col.startswith('Garage')]
    gar_data = Data[gar_cols]
    # fill missing data for garage year built with the median year
    gar_data['GarageYrBlt'].fillna(gar_data['GarageYrBlt'].median(), inplace=True)

    Data = Data.drop(gar_cols, axis=1)
    Data['GarageYrBlt'] = gar_data['GarageYrBlt']
    Data['GarageCars'] = gar_data['GarageCars']
    Data['GarageArea'] = gar_data['GarageArea']

    one_hot_cols = ['MSZoning', 'Street', 'Alley', 'LotConfig', 'BldgType',
                    'RoofStyle', 'RoofMatl', 'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'SaleCondition']
    Data['MasVnrType'] = Data['MasVnrType'].replace(np.nan, 'None')
    Data['Electrical'] = Data['Electrical'].replace(np.nan, 'None')
    one_hot_enc = preprocessing.OneHotEncoder(handle_unknown='ignore', sparse=False)
    temp = pd.DataFrame(
        one_hot_enc.fit_transform(Data[one_hot_cols]),
        columns=one_hot_enc.get_feature_names(one_hot_cols)
    )
    Data = Data.drop(one_hot_cols, axis=1)
    Data = pd.concat([Data, temp], axis=1)

    unsure_cols = ['MSSubClass', 'LandContour', 'Neighborhood', 'Exterior1st', 'Exterior2nd',
                   'Fence', 'SaleType', 'HouseStyle', 'Functional']
    Data = Data.drop(unsure_cols, axis=1)

    ords = [
        'LotShape', 'LandSlope', 'ExterQual', 'ExterCond', 'HeatingQC', 'CentralAir', 'KitchenQual', 'FireplaceQu',
        'PavedDrive'
    ]
    lot_shape_cats = ['IR3', 'IR2', 'IR1', 'Reg']
    land_slope_cats = ['Gtl', 'Mod', 'Sev']
    qual_cond_cats = ['Po', 'Fa', 'TA', 'Gd', 'Ex'] #x3
    central_air_cats = ['N', 'Y'] 
    # qual_cond_cats again
    qual_cond_na_cats = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
    paved_drive_cats = ['N', 'P', 'Y']

    CATS = [lot_shape_cats, land_slope_cats, qual_cond_cats, qual_cond_cats, qual_cond_cats, central_air_cats,
           qual_cond_cats, qual_cond_na_cats, paved_drive_cats]

    encoder1 = preprocessing.OrdinalEncoder(categories=CATS)
    Data[ords] = encoder1.fit_transform(Data[ords])

    Data['MasVnrArea'] = Data['MasVnrArea'].replace(np.nan, 0)

    Data['NumBath'] = Data.FullBath + .5*Data.HalfBath
    Data = Data.drop(['FullBath', 'HalfBath'], axis=1)

    Data['DateSold'] = Data.YrSold + (1/12)*Data.MoSold
    Data = Data.drop(['YrSold', 'MoSold'], axis=1)

    # fill missing data for garage year built with the median year
    Data['LotFrontage'].fillna(Data['LotFrontage'].median(), inplace=True)
    
    for col in Data.columns:
        if Data[col].isnull().values.any():
            Data[col].fillna(Data[col].median(), inplace=True)
    
    return Data

Data = clean_data(Data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bsmt_data['BsmtQual'] = encoder1.fit_transform(bsmt_data['BsmtQual'].to_numpy().reshape(-1,1))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [38]:
Data

Unnamed: 0,LotFrontage,LotArea,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,Electrical_None,Electrical_SBrkr,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,NumBath,DateSold
0,65.0,8450,3.0,0.0,7,5,2003,2003,196.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.5,2008.166667
1,80.0,9600,3.0,0.0,6,8,1976,1976,0.0,2.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,2007.416667
2,68.0,11250,2.0,0.0,7,5,2001,2002,162.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.5,2008.750000
3,60.0,9550,2.0,0.0,7,5,1915,1970,0.0,2.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.166667
4,84.0,14260,2.0,0.0,8,5,2000,2000,350.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.5,2009.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62.0,7917,3.0,0.0,6,5,1999,2000,0.0,2.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.5,2007.666667
1456,85.0,13175,3.0,0.0,6,6,1978,1988,119.0,2.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,2010.166667
1457,66.0,9042,3.0,0.0,7,9,1941,2006,0.0,4.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,2010.416667
1458,68.0,9717,3.0,0.0,5,6,1950,1996,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2010.333333


In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor()

In [25]:
reg = GridSearchCV(
    estimator=rf_reg,
    param_grid={
        'n_estimators': [50, 100, 150],
        'criterion': ['mse', 'mae'],
        'min_samples_split': [2, 4, 8],
        'min_samples_leaf': [1, 2, 4]
    },
    n_jobs=-1
)

In [26]:
reg.fit(Data,train_outputs)

GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 4, 8],
                         'n_estimators': [50, 100, 150]})

In [27]:
reg.cv_results_

{'mean_fit_time': array([ 0.74042134,  1.4733438 ,  2.21881924,  0.65676003,  1.31593351,
         1.97378473,  0.57965717,  1.162606  ,  1.75384021,  0.61933079,
         1.23677039,  1.85047402,  0.61825857,  1.22895923,  1.85859971,
         0.55454297,  1.09961634,  1.68519745,  0.51049528,  1.01970153,
         1.5366837 ,  0.51044183,  1.02419791,  1.53564854,  0.51230264,
         1.01601715,  1.52354574, 10.23195443, 20.41934085, 30.63852539,
        10.00761518, 20.07615061, 30.14794993,  9.70747747, 19.43351822,
        29.13558722,  8.73167534, 17.38640122, 26.22050648,  8.72461405,
        17.48721476, 26.26431475,  8.62615614, 17.10209165, 25.58232255,
         7.37750926, 14.81550875, 22.14520917,  7.39030137, 14.76880488,
        19.4095448 ,  7.40996423, 12.60256219, 16.16491237]),
 'std_fit_time': array([0.00539901, 0.01240817, 0.01614257, 0.00585629, 0.01862906,
        0.0116895 , 0.00228298, 0.01337231, 0.00815674, 0.01195457,
        0.00759323, 0.01424722, 0.00963

In [28]:
reg.best_estimator_

RandomForestRegressor(criterion='mae', min_samples_split=4)

In [40]:
from sklearn.ensemble import GradientBoostingRegressor
gb_reg = GradientBoostingRegressor()

In [41]:
reg2 = GridSearchCV(
    estimator=gb_reg,
    param_grid={
        'loss': ['ls', 'lad', 'huber', 'quantile'],
        'learning_rate': [.01, .1, .2],
        'n_estimators': [100, 150, 200],
        'min_samples_leaf': [1, 2, 4]
    },
    n_jobs=-1
)

In [42]:
reg2.fit(Data,train_outputs)

GridSearchCV(estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1, 0.2],
                         'loss': ['ls', 'lad', 'huber', 'quantile'],
                         'min_samples_leaf': [1, 2, 4],
                         'n_estimators': [100, 150, 200]})

In [43]:
reg2.cv_results_

{'mean_fit_time': array([0.53050075, 0.80108018, 1.03759928, 0.53545127, 0.80071311,
        1.05165577, 0.54840441, 0.78908687, 1.04167299, 0.65962048,
        0.99053173, 1.3232121 , 0.67607751, 0.99678664, 1.33715219,
        0.67079105, 1.00147257, 1.34272408, 0.73695641, 1.10127344,
        1.46960378, 0.71972737, 1.09510794, 1.46022515, 0.73216758,
        1.07254601, 1.43908911, 0.68015704, 1.00385518, 1.32523861,
        0.67816739, 0.99568257, 1.334904  , 0.67029014, 0.98946009,
        1.30807166, 0.5292521 , 0.8056879 , 1.04652991, 0.54721942,
        0.79234376, 1.06160855, 0.52335925, 0.78101902, 1.04966807,
        0.65672603, 0.95982056, 1.2961494 , 0.66475306, 0.96451173,
        1.2832202 , 0.65960712, 0.97743573, 1.31345925, 0.71481428,
        1.04964881, 1.41984034, 0.70978065, 1.0967082 , 1.41485519,
        0.72996416, 1.04095812, 1.41016264, 0.67941713, 0.95909653,
        1.25946546, 0.6459918 , 0.96688857, 1.30279131, 0.65668721,
        0.99672441, 1.28604884,

In [44]:
reg2.best_estimator_

GradientBoostingRegressor(learning_rate=0.2, loss='huber')

GB was better so far

## Apply model to test data for submission

In [39]:
# load data
Test = pd.read_csv('test.csv')

Test = clean_data(Test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bsmt_data['BsmtQual'] = encoder1.fit_transform(bsmt_data['BsmtQual'].to_numpy().reshape(-1,1))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


ValueError: Input contains NaN