In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [46]:
# Load the housing-prices train/test CSVs, separate the target, and build the
# train / validation / test feature frames used by the modelling cells below.

# Machine-specific data location, kept in one place so it only has to be
# edited once when the notebook is run on another machine.
DATA_DIR = 'C:/Users/alexandros.peratinos/OneDrive - BearingPoint GmbH/Documents/ML/Data/housing_prices'
train_path = DATA_DIR + '/train.csv'
test_path  = DATA_DIR + '/test.csv'
X_full = pd.read_csv(train_path)
X_test_full = pd.read_csv(test_path)

# Remove rows with a missing target, then separate y from X.
# `subset` is given as a list: older pandas releases reject a bare column label.
# Reassignment is used instead of inplace=True so each step's result is explicit.
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
X_train_full, X_val_full, y_train, y_val = train_test_split(
    X_full, y, train_size=0.8, random_state=0
)

# Categorical columns with low cardinality: object dtype and fewer than 10
# distinct values in the training split
cat_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype == 'object' and X_train_full[cname].nunique() < 10
]

# Numerical columns: int64 or float64 dtype in the training split
num_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype in ['int64', 'float64']
]

# Keep only the selected columns; copies are taken so later edits to these
# frames do not touch the *_full frames
my_cols = cat_cols + num_cols
X_train = X_train_full[my_cols].copy()
X_val   = X_val_full[my_cols].copy()
X_test  = X_test_full[my_cols].copy()

In [48]:
# Preview the first five rows of the selected training features
X_train.head(n=5)

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
618,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,774,0,108,0,0,260,0,0,7,2007
870,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,PosN,Norm,...,308,0,0,0,0,0,0,0,8,2009
92,RL,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,Norm,Norm,...,432,0,0,44,0,0,0,0,8,2009
817,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Norm,Norm,...,857,150,59,0,0,0,0,0,7,2008
302,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,843,468,81,0,0,0,0,0,1,2006


In [63]:
# Baseline model: impute + encode the features, fit a 100-tree random forest
# on the training split, and report mean absolute error on the validation split.

# Numerical columns: fill missing values with a constant
# (fill_value is left at the imputer's default)
num_trans = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode; categories not seen during fit are ignored at transform time
cat_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', num_trans, num_cols),
    ('cat', cat_trans, cat_cols),
])

# Random forest with a fixed seed so the score is repeatable
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Preprocessing and model bundled so both are fitted on the training data only
clf = Pipeline([('preprocessor', preprocessor), ('model', model)])

# Fit on the training split, then predict the validation split
preds = clf.fit(X_train, y_train).predict(X_val)

val_mae = mean_absolute_error(y_val, preds)
print("MAE:", val_mae)

MAE: 17740.290308219177


In [80]:
#Attempt to improve performance
#Preprocessing numerical data, impute
num_trans = SimpleImputer(strategy='mean')

#Preprocessing categorical data
cat_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
#Bundle together
preprocessor = ColumnTransformer(
    transformers = [
        ('num', num_trans, num_cols),
        ('cat', cat_trans, cat_cols)
    ])

#Define model
model = RandomForestRegressor(n_estimators=200, random_state=0)

#Bundle preprocessing and model in pipeline
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

#Preprocessing of training data, fit model
clf.fit(X_train, y_train)

#Preprocessing of validation data, get predictions
preds = clf.predict(X_val)

print("MAE:", mean_absolute_error(y_val,preds))

MAE: 17390.361386986304


In [84]:
# Predict on the test data with the fitted pipeline `clf`; the pipeline applies
# its own preprocessing step to X_test before the model predicts.
# NOTE(review): `clf` is whichever pipeline was fitted most recently in the
# kernel. In top-to-bottom order that is the 200-tree pipeline; confirm the
# cells were run in order before using these predictions.
preds_test = clf.predict(X_test)
# Bare last expression so the notebook displays the prediction array
preds_test

array([125213.2  , 153152.625, 183518.36 , ..., 150729.875, 110412.115,
       226500.67 ])