In [1]:
# Imports

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
# Load the competition CSVs; the 'Id' column becomes the row index of each frame
X_full = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
    index_col='Id',
)
X_test = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    index_col='Id',
)

# The target is SalePrice; every remaining column is a candidate predictor
y = X_full['SalePrice']
X = X_full.drop(columns=['SalePrice'])

# Hold out 20% of the rows for validation; random_state is pinned so the
# split is the same on every run
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Categorical predictors: training columns stored with the 'object' dtype
categorical_cols = [name for name, kind in X_train_full.dtypes.items()
                    if kind == 'object']

# Numerical predictors: training columns stored as int64 or float64
numerical_cols = [name for name, kind in X_train_full.dtypes.items()
                  if kind in ('int64', 'float64')]

# Restrict both splits to the selected columns (categorical first, then
# numerical); .copy() yields independent frames rather than views
my_cols = [*categorical_cols, *numerical_cols]
X_train = X_train_full.loc[:, my_cols].copy()
X_valid = X_valid_full.loc[:, my_cols].copy()


### Preprocessing steps

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: impute missing values with SimpleImputer's default strategy
numerical_transformer = SimpleImputer()

# Categorical columns: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from raising on categories that were
# not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])




### Defining the model

In [4]:
from xgboost import XGBRegressor

# Gradient-boosted tree regressor: 300 boosting rounds with a 0.1 learning rate.
# random_state is pinned (to the same seed used for the train/validation split)
# so the run stays reproducible if any randomised option such as row or column
# subsampling is enabled later.
model = XGBRegressor(n_estimators=300, learning_rate=0.1, random_state=0)

### Creating and evaluating the Pipeline

In [5]:
from sklearn.metrics import mean_absolute_error

# Chain preprocessing and the model so both are fitted on the training data
# only and applied identically at prediction time
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the preprocessing steps and the model on the training split
pipeline.fit(X_train, y_train)

# The pipeline preprocesses the validation split before predicting on it
predictions = pipeline.predict(X_valid)

# Report mean absolute error on the held-out validation split
score = mean_absolute_error(y_valid, predictions)
print('MAE: ', score)

MAE:  15498.155153039384


### Make test predictions

In [6]:
# Run the already-fitted pipeline on the test set: it preprocesses the test
# data and predicts, without refitting anything
preds_test = pipeline.predict(X_test)

# Write the submission file with one row per test Id and its predicted SalePrice
submission_columns = {
    'Id': X_test.index,
    'SalePrice': preds_test,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)