In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset; the 'Price' column is the prediction target and every
# other column is a candidate feature.
data = pd.read_csv('./data/melb_data.csv')

y = data['Price']
X = data.drop(columns=['Price'])

# Hold out a validation set. No test_size is given, so train_test_split's
# default proportions apply; the fixed seed keeps the split reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, random_state=0
)

# Categorical features: object-dtype columns with fewer than 10 distinct
# values in the training split.
cat_cols = [
    name for name, column in X_train_full.items()
    if column.dtype == 'object' and column.nunique() < 10
]

# Numeric features: columns stored as int64 or float64.
num_cols = [
    name for name, column in X_train_full.items()
    if column.dtype in ('int64', 'float64')
]

# Keep only the selected columns (categorical first, then numeric) and copy
# them so later changes cannot touch the full frames.
selected_cols = [*cat_cols, *num_cols]
X_train = X_train_full[selected_cols].copy()
X_valid = X_valid_full[selected_cols].copy()

In [2]:
# Preview the first five rows of the selected training features.
X_train.head(n=5)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
664,h,S,Southern Metropolitan,3,9.2,3104.0,3.0,2.0,2.0,368.0,177.0,2009.0,-37.7846,145.0935,7809.0
3270,h,S,Eastern Metropolitan,2,10.5,3081.0,2.0,1.0,2.0,586.0,80.0,1955.0,-37.7435,145.0486,2947.0
3873,h,S,Southern Metropolitan,2,11.2,3145.0,2.0,1.0,1.0,348.0,,,-37.8672,145.0432,8801.0
13170,h,S,Northern Metropolitan,3,19.6,3076.0,3.0,1.0,1.0,521.0,,,-37.63854,145.05179,10926.0
1730,h,S,Southern Metropolitan,4,11.4,3163.0,3.0,2.0,2.0,687.0,237.0,1983.0,-37.8931,145.0479,7822.0


In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Numeric columns: replace missing values with a constant. No fill_value is
# passed, so the imputer's default constant is used (documented by
# scikit-learn as 0 for numeric data).
num_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' is set so that categories not seen during
# fitting do not raise an error at transform time.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(cat_steps)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

In [4]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; the fixed seed makes training repeatable.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

In [5]:
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and the model into one estimator, so the same
# transformers that are fitted on the training data are re-applied when
# predicting on the validation data.
full_pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the preprocessing steps and the model on the training split.
full_pipeline.fit(X_train, y_train)

# Predict on the held-out validation features.
preds = full_pipeline.predict(X_valid)

# scikit-learn metrics expect (y_true, y_pred) in that order. MAE is
# symmetric, so the value is the same either way, but keeping the documented
# order avoids wrong results if this metric is swapped for an asymmetric one.
score = mean_absolute_error(y_valid, preds)
score

163987.3804899362