In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Read the Data

In [2]:
# Load the training and test tables, using each file's 'Id' column as the row index.
X_full, X_test_full = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ("./data/train.csv", "./data/test.csv")
)
# Report (rows, columns) for both tables as a quick sanity check.
print(f"X_full shape: {X_full.shape}")
print(f"X_test_full shape: {X_test_full.shape}")

X_full shape: (1460, 80)
X_test_full shape: (1459, 79)


In [3]:
# Column names of the full training table
X_full.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [4]:
# Separate the prediction target from the predictor columns.
y = X_full['SalePrice']
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
# Take copies of the selected columns for both the training and the test table.
X, X_test = (frame[features].copy() for frame in (X_full, X_test_full))
print(f"X shape: {X.shape}")
print(f"X_test shape: {X_test.shape}")

X shape: (1460, 7)
X_test shape: (1459, 7)


In [5]:
# Hold out a validation set: 80% of the rows are used for training and the
# rest for validation. The fixed random_state keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)

In [6]:
# Preview the first rows of the training split
X_train.head()

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
619,11694,2007,1828,0,2,3,9
871,6600,1962,894,0,1,2,5
93,13360,1921,964,0,1,2,5
818,13265,2002,1689,0,2,3,7
303,13704,2001,1541,0,2,3,6


## Random forest models

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [8]:
# Candidate random forests to compare; every one uses random_state=0.
models = [
    RandomForestRegressor(n_estimators=50, random_state=0),
    RandomForestRegressor(n_estimators=100, random_state=0),
    RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0),
    RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0),
    RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0),
]

# Keep the individual candidates addressable by name as well.
model_1, model_2, model_3, model_4, model_5 = models

### Best model

In [9]:
from sklearn.metrics import mean_absolute_error

In [13]:
# function for comparing different models
def score_model(model, X_t=None, X_v=None, y_t=None, y_v=None):
    """Fit ``model`` on the training data and return its validation MAE.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place by this call.
    X_t, y_t : optional
        Training features and target. Default to the notebook-level
        ``X_train`` / ``y_train``.
    X_v, y_v : optional
        Validation features and target. Default to the notebook-level
        ``X_valid`` / ``y_valid``.

    Returns
    -------
    The value of ``mean_absolute_error(y_v, preds)``, where ``preds`` are the
    model's predictions for ``X_v``.
    """
    # Resolve the defaults at call time instead of in the signature: defaults in
    # a signature are evaluated once, when the ``def`` cell runs, so re-running
    # the split cell afterwards would otherwise keep scoring against the old split.
    if X_t is None:
        X_t = X_train
    if X_v is None:
        X_v = X_valid
    if y_t is None:
        y_t = y_train
    if y_v is None:
        y_v = y_valid

    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)


# Score each candidate in list order and print its validation MAE (labels are 1-based).
for position, candidate in enumerate(models, start=1):
    validation_mae = score_model(candidate)
    print(f"model_{position} MAE: {validation_mae:.2f}")

model_1 MAE: 24015.49
model_2 MAE: 23740.98
model_3 MAE: 23528.78
model_4 MAE: 23996.68
model_5 MAE: 23706.67


The best model is `model_3`: it has the lowest validation MAE (23528.78).

In [14]:
# Rebuild the winning configuration from model_3's own hyperparameters rather
# than retyping them, so the submission model cannot drift from the candidate
# that was validated. It is then fitted on every labelled row (X, y), not just
# the training split.
my_model = RandomForestRegressor(**model_3.get_params())
my_model.fit(X, y)
preds_test = my_model.predict(X_test)
# One row per test-set Id with its predicted sale price.
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv("./data/submission.csv", index=False)

In [15]:
# Preview the first rows of the submission table
output.head()

Unnamed: 0,Id,SalePrice
0,1461,119433.08
1,1462,158367.5
2,1463,185351.21
3,1464,178343.12
4,1465,192898.29
