In [109]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error # sum(y_real - y_predicted) / n
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

In [3]:
# Interface Settings
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:98% !important; margin-left:1% !important; margin-right:auto !important;}</style>"))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

pd.set_option('display.max_rows', 70)

import warnings
warnings.filterwarnings("ignore")

In [4]:
home_data_file_path = 'train.csv'
home_data = pd.read_csv(home_data_file_path) 
home_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [127]:
# Shape of training data (num_rows, num_columns)
print(home_data.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (home_data.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(1460, 81)
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [152]:
# Remove rows with missing target, separate target from predictors
home_data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = home_data.SalePrice
home_data.drop(['SalePrice'], axis=1, inplace=True)

# To keep things simple, we'll use only numerical predictors
X = home_data.select_dtypes(exclude=['object'])
X_test = home_data.select_dtypes(exclude=['object'])


KeyError: ['SalePrice']

In [153]:
X_test = X_test.drop(cols_with_missing, axis=1)

In [130]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

## Dealing with missing values

In [135]:
# Function for comparing different approaches
def score_dataset(X_t, X_v, y_t, y_v):
    model = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
    model.fit(X_t, y_t)
    pds = model.predict(X_v)
    return mean_absolute_error(y_v, pds)

## Drop missing values

In [136]:
# Get names of columns with missing values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# Drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop columns with missing values):
17756.108938356163


In [139]:
cols_with_missing

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [142]:
X_test_missing = [col for col in X_test.columns if X_test[col].isnull().any()]
X_test_missing

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

## Simple imputer for missing values

In [137]:
# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE from Approach 2 (Imputation):
18078.969143835613


## An Extension to Imputation

In [138]:
# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Imputation
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

MAE from Approach 3 (An Extension to Imputation):
18089.499554794525


(1168, 7)
Series([], dtype: int64)


In [150]:
# Define the models
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)
model_6 = RandomForestRegressor(n_estimators=71, criterion='mae', random_state=0)
model_7 = RandomForestRegressor(n_estimators=71, criterion='mae', random_state=0, max_depth=13)
models = [model_1, model_2, model_3, model_4, model_5, model_6, model_7,]

In [151]:
# Function for comparing different models
def score_model(model, X_t=reduced_X_train, X_v=reduced_X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))

Model 1 MAE: 17891
Model 2 MAE: 17952
Model 3 MAE: 17756
Model 4 MAE: 18328
Model 5 MAE: 18731
Model 6 MAE: 17550
Model 7 MAE: 17790


In [78]:
def get_mae(n, X_train, X_valid, y_train, y_valid, n_jobs=8, depth=None):
    model = RandomForestRegressor(n_estimators=n, criterion='mae', random_state=0,n_jobs=-1, max_depth=depth)
    model.fit(X_train, y_train)
    preds_val = model.predict(X_valid)
    mae = mean_absolute_error(y_valid, preds_val)
    return(mae)

In [146]:
max_estimators = range(21, 266, 2)

In [147]:
depth = range(9, 50)

In [145]:
# compare MAE with differing values of n_estimators
for n in max_estimators:
    my_mae = get_mae(n, reduced_X_train, reduced_X_valid, y_train, y_valid)
    print(F"MAX n_estimators: {n}\t Mean Absolute Error: {my_mae}")


MAX n_estimators: 21	 Mean Absolute Error: 18167.535225048923
MAX n_estimators: 23	 Mean Absolute Error: 18055.236599166168
MAX n_estimators: 25	 Mean Absolute Error: 17937.6401369863
MAX n_estimators: 27	 Mean Absolute Error: 17876.044647387112
MAX n_estimators: 29	 Mean Absolute Error: 17739.72472838923
MAX n_estimators: 31	 Mean Absolute Error: 17802.17962881131
MAX n_estimators: 33	 Mean Absolute Error: 17754.734848484848
MAX n_estimators: 35	 Mean Absolute Error: 17751.533953033268
MAX n_estimators: 37	 Mean Absolute Error: 17855.820251758607
MAX n_estimators: 39	 Mean Absolute Error: 17878.495345978223
MAX n_estimators: 41	 Mean Absolute Error: 17924.070998997664
MAX n_estimators: 43	 Mean Absolute Error: 17844.070404587448
MAX n_estimators: 45	 Mean Absolute Error: 17751.018645357686
MAX n_estimators: 47	 Mean Absolute Error: 17791.265083066162
MAX n_estimators: 49	 Mean Absolute Error: 17848.55053117137
MAX n_estimators: 51	 Mean Absolute Error: 17883.908071447757
MAX n_estimat

In [148]:
min_mae = {i: get_mae(i, reduced_X_train, reduced_X_valid, y_train, y_valid) for i in max_estimators}
best_n_estimators_size = min(min_mae, key=min_mae.get)

In [149]:
print(F'best_n_estimators_size is : {best_n_estimators_size}, \t MAE: {min_mae.get(best_n_estimators_size)}')

best_n_estimators_size is : 71, 	 MAE: 17550.365039552384


In [92]:
print(F'best_n_estimators_size is : {best_n_estimators_size}, \t MAE: {min_mae.get(best_n_estimators_size)}')

best_n_estimators_size is : 119, 	 MAE: 23400.577702313803


In [81]:
# compare MAE with differing values of max_depth
for d in depth:
    my_mae = get_mae(119, X_train, X_valid, y_train, y_valid, n_jobs=8, depth=d)
    print(F"max_depth: {d}\t Mean Absolute Error: {my_mae}")
    

max_depth: 2	 Mean Absolute Error: 35632.22050189939
max_depth: 3	 Mean Absolute Error: 29737.708904109597
max_depth: 4	 Mean Absolute Error: 26720.509381834927
max_depth: 5	 Mean Absolute Error: 24862.67464314493
max_depth: 6	 Mean Absolute Error: 23719.498244503284
max_depth: 7	 Mean Absolute Error: 23412.213595027053
max_depth: 8	 Mean Absolute Error: 23325.742790951997
max_depth: 9	 Mean Absolute Error: 23228.227351214464
max_depth: 10	 Mean Absolute Error: 23233.932600437434
max_depth: 11	 Mean Absolute Error: 23292.70445781052
max_depth: 12	 Mean Absolute Error: 23351.95795441464
max_depth: 13	 Mean Absolute Error: 23169.284563140325
max_depth: 14	 Mean Absolute Error: 23239.072191205247
max_depth: 15	 Mean Absolute Error: 23252.41333314148
max_depth: 16	 Mean Absolute Error: 23457.19061528721
max_depth: 17	 Mean Absolute Error: 23483.79109589041
max_depth: 18	 Mean Absolute Error: 23431.49838839646
max_depth: 19	 Mean Absolute Error: 23412.715235409236
max_depth: 20	 Mean Absolu

KeyboardInterrupt: 

In [160]:
reduced_X_train.isnull().sum()

Id               0
MSSubClass       0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
dtype: int64

In [164]:
test_X.isnull().sum()

Id               0
MSSubClass       0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
dtype: int64

In [166]:
test_data_path = 'test.csv'

test_data = pd.read_csv(test_data_path)

test_X = test_data.select_dtypes(exclude=['object'])
test_X = test_X.drop(cols_with_missing, axis=1)

test_X['BsmtFinSF1'].fillna((test_X['BsmtFinSF1'].mean()), inplace=True)
test_X['BsmtFinSF2'].fillna((test_X['BsmtFinSF2'].mean()), inplace=True)
test_X['BsmtUnfSF'].fillna((test_X['BsmtUnfSF'].mean()), inplace=True)
test_X['TotalBsmtSF'].fillna((test_X['TotalBsmtSF'].mean()), inplace=True)
test_X['BsmtFullBath'].fillna((test_X['BsmtFullBath'].mean()), inplace=True)
test_X['BsmtHalfBath'].fillna((test_X['BsmtHalfBath'].mean()), inplace=True)
test_X['GarageCars'].fillna((test_X['GarageCars'].mean()), inplace=True)
test_X['GarageArea'].fillna((test_X['GarageArea'].mean()), inplace=True)

test_preds = model_6.predict(test_X)

# The lines below shows how to save predictions in format used for competition scoring

output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})
output.to_csv('submission_forest.csv', index=False)