In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error # sum(|y_real - y_predicted|) / n
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:
# Interface Settings
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject CSS that stretches the notebook container to 98% of the page width.
display(HTML("<style>.container {width:98% !important; margin-left:1% !important; margin-right:auto !important;}</style>"))

# Echo the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Raise the pandas row-display option to 70.
pd.set_option('display.max_rows', 70)

# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation notices; consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Read the training set from the working directory and show summary
# statistics for it.
home_data_file_path = 'train.csv'
home_data = pd.read_csv(filepath_or_buffer=home_data_file_path)
home_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
# Prediction target: the SalePrice column of the training frame.
y = home_data['SalePrice']

In [6]:
# Columns used as model inputs.
home_data_features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
# Feature matrix restricted to those columns.
X = home_data[home_data_features]

In [7]:
# Seeded train/validation split using train_test_split's default proportions.
(train_X, val_X,
 train_y, val_y) = train_test_split(X, y, random_state=0)

In [14]:
# Seeded 80/20 train/validation split.
(X_train, X_valid,
 y_train, y_valid) = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [8]:
# Baseline: a forest with only the seed set, fitted on the train_X/train_y split.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X=train_X, y=train_y)
# Predict the held-out rows and print the validation MAE.
melb_preds = forest_model.predict(X=val_X)
print(mean_absolute_error(y_true=val_y, y_pred=melb_preds))

RandomForestRegressor(random_state=1)

23009.206570906717


In [82]:
# Define the candidate models.
# 'absolute_error' is the current name of the MAE split criterion; the former
# 'mae' alias was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# cell requires scikit-learn >= 1.0.
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)
model_6 = RandomForestRegressor(n_estimators=119, criterion='absolute_error', random_state=0)
model_7 = RandomForestRegressor(n_estimators=119, criterion='absolute_error', random_state=0, max_depth=13)
# Candidates in the order they are numbered.
models = [model_1, model_2, model_3, model_4, model_5, model_6, model_7]

In [83]:
# Function for comparing different models
def score_model(model, X_t=None, X_v=None, y_t=None, y_v=None):
    """Fit ``model`` and return its mean absolute error on validation data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_t, y_t : training features / targets, optional
        Default to the notebook-level ``X_train`` / ``y_train``.
    X_v, y_v : validation features / targets, optional
        Default to the notebook-level ``X_valid`` / ``y_valid``.

    Returns
    -------
    The value of ``mean_absolute_error(y_v, model.predict(X_v))``.
    """
    # Resolve the defaults at call time rather than in the signature: defaults
    # written in the signature are evaluated once, when the `def` runs, so a
    # later re-split of the data would silently be ignored.
    X_t = X_train if X_t is None else X_t
    X_v = X_valid if X_v is None else X_v
    y_t = y_train if y_t is None else y_t
    y_v = y_valid if y_v is None else y_v

    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

# Fit each candidate in turn and print its validation MAE, numbering from 1.
# The %d conversion prints the MAE as an integer.
for position, candidate in enumerate(models, start=1):
    mae = score_model(candidate)
    print("Model %d MAE: %d" % (position, mae))

Model 1 MAE: 24015
Model 2 MAE: 23740
Model 3 MAE: 23528
Model 4 MAE: 23996
Model 5 MAE: 23706
Model 6 MAE: 23400
Model 7 MAE: 23169


In [78]:
def get_mae(n, X_train, X_valid, y_train, y_valid, n_jobs=8, depth=None):
    """Fit an MAE-criterion random forest and return its validation MAE.

    Parameters
    ----------
    n : int
        Number of trees (``n_estimators``).
    X_train, y_train : training features / targets passed to ``fit``.
    X_valid, y_valid : validation features / targets used for scoring.
    n_jobs : int, default 8
        Forwarded to ``RandomForestRegressor(n_jobs=...)``.
    depth : int or None, default None
        Forwarded as ``max_depth``.

    Returns
    -------
    The value of ``mean_absolute_error(y_valid, predictions)``.
    """
    model = RandomForestRegressor(
        n_estimators=n,
        # 'absolute_error' replaces the 'mae' alias, which was deprecated in
        # scikit-learn 1.0 and removed in 1.2.
        criterion='absolute_error',
        random_state=0,
        # Honour the caller's value; this used to be hard-coded to -1, which
        # made the n_jobs parameter a no-op.
        n_jobs=n_jobs,
        max_depth=depth,
    )
    model.fit(X_train, y_train)
    preds_val = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds_val)

In [71]:
# Candidate n_estimators values for the sweep: every odd number from 21 to 249.
max_estimators = range(21, 250, 2)

In [72]:
# Sweep the forest size and print the validation MAE for each candidate value.
for n in max_estimators:
    my_mae = get_mae(
        n,
        X_train=X_train,
        X_valid=X_valid,
        y_train=y_train,
        y_valid=y_valid,
    )
    print(F"MAX n_estimators: {n}\t Mean Absolute Error: {my_mae}")
    


MAX n_estimators: 21	 Mean Absolute Error: 23849.173515981733
MAX n_estimators: 23	 Mean Absolute Error: 23963.328171530673
MAX n_estimators: 25	 Mean Absolute Error: 23928.816027397264
MAX n_estimators: 27	 Mean Absolute Error: 23755.258371385087
MAX n_estimators: 29	 Mean Absolute Error: 23773.118091639113
MAX n_estimators: 31	 Mean Absolute Error: 23835.16725585506
MAX n_estimators: 33	 Mean Absolute Error: 23904.04618098796
MAX n_estimators: 35	 Mean Absolute Error: 23752.092954990218
MAX n_estimators: 37	 Mean Absolute Error: 23599.283783783787
MAX n_estimators: 39	 Mean Absolute Error: 23588.068405338952
MAX n_estimators: 41	 Mean Absolute Error: 23553.68877380555
MAX n_estimators: 43	 Mean Absolute Error: 23477.556944886906
MAX n_estimators: 45	 Mean Absolute Error: 23547.739421613394
MAX n_estimators: 47	 Mean Absolute Error: 23604.67494899446
MAX n_estimators: 49	 Mean Absolute Error: 23599.37783058429


KeyboardInterrupt: 

In [41]:
# Validation MAE for every candidate forest size, keyed by n_estimators.
min_mae = dict(
    (i, get_mae(i, X_train, X_valid, y_train, y_valid))
    for i in max_estimators
)
# The forest size whose validation MAE is lowest.
best_n_estimators_size = min(min_mae, key=lambda size: min_mae[size])

In [42]:
# Report the best forest size together with the MAE recorded for it in min_mae.
print(F'best_n_estimators_size is : {best_n_estimators_size}, \t MAE: {min_mae.get(best_n_estimators_size)}')

best_n_estimators_size is : 119, 	 MAE: 23400.577702313803


In [79]:
# Candidate max_depth values for the sweep: every integer from 2 to 149.
depth = range(2, 150)

In [81]:
# Sweep max_depth with the forest size held at 119 trees and print the
# validation MAE for each depth.
for d in depth:
    my_mae = get_mae(
        119,
        X_train,
        X_valid,
        y_train,
        y_valid,
        depth=d,
        n_jobs=8,
    )
    print(F"max_depth: {d}\t Mean Absolute Error: {my_mae}")
    

max_depth: 2	 Mean Absolute Error: 35632.22050189939
max_depth: 3	 Mean Absolute Error: 29737.708904109597
max_depth: 4	 Mean Absolute Error: 26720.509381834927
max_depth: 5	 Mean Absolute Error: 24862.67464314493
max_depth: 6	 Mean Absolute Error: 23719.498244503284
max_depth: 7	 Mean Absolute Error: 23412.213595027053
max_depth: 8	 Mean Absolute Error: 23325.742790951997
max_depth: 9	 Mean Absolute Error: 23228.227351214464
max_depth: 10	 Mean Absolute Error: 23233.932600437434
max_depth: 11	 Mean Absolute Error: 23292.70445781052
max_depth: 12	 Mean Absolute Error: 23351.95795441464
max_depth: 13	 Mean Absolute Error: 23169.284563140325
max_depth: 14	 Mean Absolute Error: 23239.072191205247
max_depth: 15	 Mean Absolute Error: 23252.41333314148
max_depth: 16	 Mean Absolute Error: 23457.19061528721
max_depth: 17	 Mean Absolute Error: 23483.79109589041
max_depth: 18	 Mean Absolute Error: 23431.49838839646
max_depth: 19	 Mean Absolute Error: 23412.715235409236
max_depth: 20	 Mean Absolu

KeyboardInterrupt: 

In [84]:
# Load the competition test set and keep only the model's input columns.
test_data_path = 'test.csv'
test_data = pd.read_csv(filepath_or_buffer=test_data_path)
test_X = test_data[home_data_features]

# NOTE(review): model_7 is not fitted in this cell; it relies on the earlier
# model-comparison loop having fitted it (on the X_train split) — confirm that
# this is the intended final model.
test_preds = model_7.predict(X=test_X)

# The lines below show how to save predictions in the format used for
# competition scoring: one row per test Id with its predicted SalePrice.
output = pd.DataFrame({'Id': test_data['Id'],
                       'SalePrice': test_preds})
output.to_csv(path_or_buf='submission_forest.csv', index=False)