## Build a model to predict housing prices 
### Based on Kaggle challege 
#### https://www.kaggle.com/c/home-data-for-ml-course/overview
##### https://www.kaggle.com/code/brittanysuttner/exercise-machine-learning-competitions/edit


In [64]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [43]:
#path of files to read
train = './train.csv'
test = './test.csv'
meta = './data_description.txt'

train_data = pd.read_csv(train)
# Test data does NOT contain SalePrice, need to predict this after creating a model
test_data = pd.read_csv(test)

train_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [69]:
#set prediction target variable
y = train_data.SalePrice

In [45]:
#Check out the features of the dataset.
#train_data.columns

In [37]:
# Select some features to use for the preliminary model bsaed on best guess
house_features = ["MSSubClass","LotArea","OverallQual","YearBuilt","1stFlrSF","2ndFlrSF","TotRmsAbvGrd"]

In [46]:
X = train_data[house_features]

In [47]:
X.describe()

Unnamed: 0,MSSubClass,LotArea,OverallQual,YearBuilt,1stFlrSF,2ndFlrSF,TotRmsAbvGrd
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,10516.828082,6.099315,1971.267808,1162.626712,346.992466,6.517808
std,42.300571,9981.264932,1.382997,30.202904,386.587738,436.528436,1.625393
min,20.0,1300.0,1.0,1872.0,334.0,0.0,2.0
25%,20.0,7553.5,5.0,1954.0,882.0,0.0,5.0
50%,50.0,9478.5,6.0,1973.0,1087.0,0.0,6.0
75%,70.0,11601.5,7.0,2000.0,1391.25,728.0,7.0
max,190.0,215245.0,10.0,2010.0,4692.0,2065.0,14.0


In [70]:
#Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

#Build simple decision tree model
iowa_model = DecisionTreeRegressor(random_state=1)

#Fit the model
iowa_model.fit(train_X,train_y)

DecisionTreeRegressor(random_state=1)

In [71]:
#Make predictions on the validation data
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print(f"Validation MAE: {val_mae}")

Validation MAE: 26867.813698630136


### Test different Max_Leaf_nodes to see which is best for pediction

In [72]:
# Write function to get the MAE:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    pred_vals = model.predict(val_X)
    mae = mean_absolute_error(val_y, pred_vals)
    return(mae)


In [73]:
# Compare different Tree sizes
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
#write loop to find ideal tree size from candidate max leaf nodes
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(f"Max leaf nodes: {max_leaf_nodes} \t MAE: {my_mae}")
    
#Store and return the best value of max_leaf_nodes
scores = {leaf_size: get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes}
best_tree_size = min(scores, key=scores.get)
print(scores)

Max leaf nodes: 5 	 MAE: 30659.16215009306
Max leaf nodes: 25 	 MAE: 26802.447637047364
Max leaf nodes: 50 	 MAE: 25582.73546745361
Max leaf nodes: 100 	 MAE: 25253.122787569886
Max leaf nodes: 250 	 MAE: 27065.8576397
Max leaf nodes: 500 	 MAE: 27462.759884757557
{5: 27462.759884757557, 25: 27462.759884757557, 50: 27462.759884757557, 100: 27462.759884757557, 250: 27462.759884757557, 500: 27462.759884757557}


In [74]:
#Re-run model with optimal size
iowa_model_2 = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)

# fit the model
iowa_model_2.fit(train_X, train_y)

#Make predictions on the validation data
val_predictions = iowa_model_2.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print(f"Validation MAE: {val_mae}")

#Decreases MAE from 26,867 to 25,253...not so much!

Validation MAE: 25253.122787569886


### Random Forest model

In [65]:
# define model and set random state to 1
rf_model = RandomForestRegressor(random_state=1)

# Fit the model
rf_model.fit(train_X, train_y)

# Calculate MAE of RF model on validation data
rf_pred = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, rf_pred)

print(f"Validation MAE for RF model: {rf_val_mae}")

Validation MAE for RF model: 19700.585804305283


#### Decreases MAE from 25,253 to 19,700!
#### Next step would be to try other features to see if we can minimize this error. What are some statistical methods for doing this?
#### For now Run with full training dataset to get a more accurate model

In [75]:
# To improve accuracy, create a new Random Forest model which you will train on all training data
rf_model_on_full_data = RandomForestRegressor(random_state=1)
rf_model_on_full_data.fit(X,y)

RandomForestRegressor(random_state=1)

In [67]:
# create test_X which comes from test_data but includes only the columns you used for prediction.
# The list of columns is stored in a variable called house_features
test_X = test_data[house_features]
test_X.describe()

Unnamed: 0,MSSubClass,LotArea,OverallQual,YearBuilt,1stFlrSF,2ndFlrSF,TotRmsAbvGrd
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,57.378341,9819.161069,6.078821,1971.357779,1156.534613,325.967786,6.385195
std,42.74688,4955.517327,1.436812,30.390071,398.16582,420.610226,1.508895
min,20.0,1470.0,1.0,1879.0,407.0,0.0,3.0
25%,20.0,7391.0,5.0,1953.0,873.5,0.0,5.0
50%,50.0,9399.0,6.0,1973.0,1079.0,0.0,6.0
75%,70.0,11517.5,7.0,2001.0,1382.5,676.0,7.0
max,190.0,56600.0,10.0,2010.0,5095.0,1862.0,15.0


In [77]:
test_preds = rf_model_on_full_data.predict(test_X)

In [79]:
output = pd.DataFrame({'ID': test_data.Id, 'SalePrice': test_preds})
output

Unnamed: 0,ID,SalePrice
0,1461,131171.50
1,1462,150941.00
2,1463,151191.67
3,1464,177864.90
4,1465,195205.00
...,...,...
1454,2915,82165.00
1455,2916,81350.00
1456,2917,151466.96
1457,2918,126775.60
