In [34]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

In [37]:
# Load the data
X = pd.read_csv('home_data/train.csv', index_col='Id')
X_test_full = pd.read_csv('home_data/test.csv', index_col='Id')

# Remove rows with missing SalePrice, identify SalePrice as target
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice              
X.drop(['SalePrice'], axis=1, inplace=True)

# Separate validation data from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.75, test_size=0.25)


# Identify categorical columns with relatively low number of unique values
low_cardinality_cols = [colname for colname in X_train_full.columns if X_train_full[colname].nunique() < 10 and X_train_full[colname].dtype == "object"]

# Identify numeric columns
numeric_cols = [colname for colname in X_train_full.columns if X_train_full[colname].dtype in ['int64', 'float64']]

# Keep identified columns only
idd_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[idd_cols].copy()
X_valid = X_valid_full[idd_cols].copy()
X_test = X_test_full[idd_cols].copy()

# One-hot encode the data 
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [43]:
# Define, fit, and see calculate predictors from the model
reg_model_1 = XGBRegressor()
reg_model_1.fit(X_train, y_train)
predictions_1 = reg_model_1.predict(X_valid)
# Calculate MAE
mae_1 = mean_absolute_error(predictions_1, y_valid)
print("Mean Absolute Error:" , mae_1)


Mean Absolute Error: 17399.152932363013


In [46]:
# Refining model by specifying parameters
reg_model_2 = XGBRegressor(n_estimators=1000, learning_rate=0.05, early_stopping_rounds=5)
reg_model_2.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
predictions_2 = reg_model_2.predict(X_valid)
mae_2 = mean_absolute_error(predictions_2, y_valid)
print("Mean Absolute Error:" , mae_2)

Mean Absolute Error: 15326.88935145548
