In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

In [33]:
#Read data
#NOTE(review): absolute, machine-specific location; change DATA_DIR to run elsewhere
DATA_DIR = 'C:/Users/alexandros.peratinos/OneDrive - BearingPoint GmbH/Documents/ML/Data/housing_prices'
train_path = f'{DATA_DIR}/train.csv'
test_path  = f'{DATA_DIR}/test.csv'
train_data = pd.read_csv(train_path)
test_data  = pd.read_csv(test_path)

#Remove rows with missing target, separate target from predictors
#(rebinding instead of inplace=True; train_data still ends up without SalePrice)
train_data = train_data.dropna(subset=['SalePrice'])
y = train_data['SalePrice']
train_data = train_data.drop(columns=['SalePrice'])
X = train_data

#Split
#The targets are bound to y_train / y_val, the names the model cells use;
#without this those cells raise NameError on a fresh kernel.
X_train_full, X_val_full, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=0)
#Keep the previous *_full names available as aliases
y_train_full, y_val_full = y_train, y_val

#Select low cardinal cat-columns (object dtype with fewer than 10 distinct values in the training split)
low_card_cols = [cname for cname in X_train_full
                 if X_train_full[cname].dtype == 'object' and X_train_full[cname].nunique() < 10]

#Select numeric cols
num_cols = [cname for cname in X_train_full if X_train_full[cname].dtype in ['int64','float64']]

#Keep selected cols only
my_cols = low_card_cols + num_cols
X_train = X_train_full[my_cols].copy()
X_val = X_val_full[my_cols].copy()
X_test = test_data[my_cols].copy()

#One-hot encode data by using pandas
X_train = pd.get_dummies(X_train)
X_val = pd.get_dummies(X_val)
X_test = pd.get_dummies(X_test)

#Give val/test exactly the training columns (join='left').
#A dummy column missing from val/test means no row has that category,
#so it is filled with 0 instead of the default NaN.
X_train, X_val = X_train.align(X_val, join='left', axis=1, fill_value=0)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [37]:
#Part 1: Baseline model (only the seed is set, everything else is left at the library defaults)
my_model_1 = XGBRegressor(random_state=0)
#fit() stays the last expression so the cell displays the fitted estimator
my_model_1.fit(X=X_train, y=y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=0, ...)

In [40]:
#Part 2: Predictions
#Run the baseline model on the validation features
predictions_1 = my_model_1.predict(X=X_val)

In [41]:
#Part 3: Calculate MAE
#Mean absolute error of the baseline predictions against the validation targets
mae_1 = mean_absolute_error(y_true=y_val, y_pred=predictions_1)
#Display the score
mae_1

18572.31490796233

In [46]:
#Improve model: 1000 estimators with learning rate 0.05

#Define model
my_model_2 = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=0,
)
#Fit model on the training split
my_model_2.fit(X=X_train, y=y_train)
#Get predictions for the validation split
predictions_2 = my_model_2.predict(X=X_val)
#Calculate MAE and display it
mae_2 = mean_absolute_error(y_true=y_val, y_pred=predictions_2)
mae_2

16865.91798212757

In [47]:
#Create worse model: only 50 estimators with learning rate 0.5

#Define model
my_model_3 = XGBRegressor(
    n_estimators=50,
    learning_rate=0.5,
    n_jobs=-1,
    random_state=0,
)
#Fit model on the training split
my_model_3.fit(X=X_train, y=y_train)
#Predict on the validation split
predictions_3 = my_model_3.predict(X=X_val)
#Calculate MAE and display it
mae_3 = mean_absolute_error(y_true=y_val, y_pred=predictions_3)
mae_3

20940.03561108733