In [1]:
import pandas as pd
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [7]:
# Read the housing records and discard the four columns listed below in a
# single non-mutating step, so `df` is always rebuilt from the file.
raw_path = "ml_house_data_set.csv"
dropped_columns = ['house_number', 'unit_number', 'street_name', 'zip_code']
df = pd.read_csv(raw_path).drop(dropped_columns, axis="columns")

# Expand the two categorical columns into indicator (dummy) columns.
features_df = pd.get_dummies(df, columns=['garage_type', 'city'])

# Target is the sale price; predictors are every remaining encoded column.
y = df['sale_price']
X = features_df.drop('sale_price', axis="columns")

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

In [8]:
# Base estimator for the search. The seed is fixed because several grid
# entries use max_features < 1.0, which makes each fit randomly subsample
# features; without a seed, repeated runs would not be comparable.
model = ensemble.GradientBoostingRegressor(random_state=0)

# Hyper-parameter values to try. The full Cartesian product is
# 3 * 2 * 4 * 4 * 3 * 3 = 864 combinations, each fitted once per CV fold,
# so this search is expensive.
param_grid = {
    'n_estimators': [500, 1000, 3000],
    'max_depth': [4, 6],
    'min_samples_leaf': [3, 5, 9, 17],
    'learning_rate': [0.1, 0.05, 0.02, 0.01],
    'max_features': [1.0, 0.3, 0.1],
    # 'ls' and 'lad' were renamed in scikit-learn 1.0 and removed in 1.2;
    # the current spellings are 'squared_error' and 'absolute_error'.
    'loss': ['squared_error', 'absolute_error', 'huber'],
}

# Exhaustive cross-validated search over param_grid using 4 parallel jobs.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# default score rather than by the mean absolute error reported later.
gs_cv = GridSearchCV(model, param_grid, n_jobs=4)

In [None]:
# Run the grid search on the training split only (the test split stays
# unseen), then show the parameter combination that scored best.
gs_cv.fit(X=X_train, y=y_train)
best_hyperparameters = gs_cv.best_params_
print(best_hyperparameters)

In [None]:
# Report mean absolute error on each split using the fitted search object.
# Each entry pairs the report line's format string with the data to score.
evaluation_splits = (
    ("Training Set Mean Absolute Error: %.4f", X_train, y_train),
    ("Test Set Mean Absolute Error: %.4f", X_test, y_test),
)

for report_template, split_features, split_target in evaluation_splits:
    # NOTE: despite its name, `mse` holds a mean *absolute* error; the name
    # is kept unchanged so any later cell that reads it keeps working.
    mse = mean_absolute_error(split_target, gs_cv.predict(split_features))
    print(report_template % mse)