In [42]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [43]:
# Path to the test-set CSV, relative to the notebook's working directory.
test_data_path = './data/test.csv'

In [44]:
# Load the CSV at test_data_path into a DataFrame (default integer row index).
test_data = pd.read_csv(test_data_path)

In [45]:
# Path to the training-set CSV, relative to the notebook's working directory.
train_data_path = './data/train.csv'

In [46]:
# Load the CSV at train_data_path into a DataFrame (default integer row index).
train_data = pd.read_csv(train_data_path)

In [47]:
# Hand-picked model inputs.
# The cells that build X, X_test and the ColumnTransformer all refer to
# `numerical_features` and `categorical_features`, so both must be real
# assignments (not commented-out code) for Restart & Run All to succeed.
numerical_features = [
    'LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'OverallQual', 'OverallCond',
]
categorical_features = ['ExterQual', 'ExterCond', 'Heating', 'Electrical']

In [48]:
# Numeric candidates: every int64/float64 column except the SalePrice target,
# kept in the frame's original column order.
numerical_cols = [
    name
    for name, column in train_data.items()
    if name != 'SalePrice' and column.dtype in ('int64', 'float64')
]

In [49]:
# categorical_features = ['ExterQual', 'ExterCond', 'Heating', 'Electrical']

In [50]:
# Categorical candidates: object-dtype columns with fewer than 10 distinct
# values ("cardinality" = number of unique values in a column).
# The cutoff of 10 is a convenient but arbitrary choice.
categorical_cols = [
    name
    for name, column in train_data.items()
    if column.dtype == "object" and column.nunique() < 10
]

In [51]:
# SalePrice is the prediction target, so rows where it is missing are removed.
# The frame is modified in place, so re-running this cell is a no-op.
train_data.dropna(subset=['SalePrice'], axis='index', inplace=True)

In [52]:
# Feature matrix (numerical columns first, then categorical) and target vector.
# NOTE(review): numerical_features / categorical_features must already be
# defined by an earlier cell — confirm on a fresh kernel.
X = train_data.loc[:, numerical_features + categorical_features]
y = train_data['SalePrice']

In [53]:
# Hold out 25% of the rows for validation; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=1,
)

In [54]:
# Test-set features: the same columns, in the same order, as the training X.
X_test = test_data.loc[:, numerical_features + categorical_features]

In [55]:
# Preprocessing for numerical data: missing values are filled with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

In [56]:
# Preprocessing for categorical data, applied in order:
#   1. 'imputer' - fill missing entries with the most frequent value
#   2. 'onehot'  - one-hot encode; handle_unknown='ignore' means a category
#                  not seen during fit does not raise at transform time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [57]:
# Route each feature group through its own transformer.
# Entries are (name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer(transformers=[
    ('numerical_transformer', numerical_transformer, numerical_features),
    ('categorical_transformer', categorical_transformer, categorical_features),
])

In [58]:
# Random forest with 100 trees; random_state is fixed so repeated fits give the same model.
model = RandomForestRegressor(n_estimators=100, random_state=1)

In [59]:
# Chain preprocessing and the regressor into a single estimator, so that
# fit/predict always apply the same preprocessing before the model.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

In [60]:
# Fit the preprocessing steps and the model on the training split.
# As the cell's last expression, the fitted pipeline's repr is displayed.
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical_transformer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['LotArea', 'YearBuilt',
                                                   '1stFlrSF', '2ndFlrSF',
                                                   'Fu

In [61]:
# Run the validation split through the fitted pipeline (preprocessing + model) to get predictions
preds = clf.predict(X_valid)

In [62]:
# Mean absolute error between the validation targets and the predictions
print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 18677.731094585783


In [63]:
# Preprocessing of test data, get predictions (the pipeline is not refit here)
preds_test = clf.predict(X_test)

In [64]:
# Save test predictions to file.
# The ids come from the 'Id' column of the test CSV. test_data was read without
# index_col, so X_test.index is only the 0..n-1 row position and would label
# every prediction with a row number instead of its id.
# NOTE(review): assumes the test CSV has an 'Id' column — confirm.
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)