In [28]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error # sum(y_real - y_predicted) / n
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [2]:
# Interface Settings
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:98% !important; margin-left:1% !important; margin-right:auto !important;}</style>"))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

pd.set_option('display.max_rows', 70)

import warnings
warnings.filterwarnings("ignore")

In [4]:
home_data_file_path = 'train.csv'
home_data = pd.read_csv(home_data_file_path) 

In [7]:
# Separate target from predictors
y = home_data.SalePrice
X = home_data.drop(['SalePrice'], axis=1)

In [8]:
# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [9]:
# Drop columns with missing values (simplest approach)
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

In [43]:
# Select categorical columns with relatively low cardinality
low_cardinality_cols = [col for col in X_train_full.columns if X_train_full[col].nunique() < 10 and X_train_full[col].dtype == "object"]

In [44]:
# Select numerical columns
numerical_cols = [col for col in X_train_full.columns if X_train_full[col].dtype in ['int64', 'float64']]

In [45]:
selected_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[selected_cols].copy()
X_valid = X_valid_full[selected_cols].copy()

In [46]:
X_train.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,BldgType,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
618,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,1Fam,...,774,0,108,0,0,260,0,0,7,2007
870,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,PosN,Norm,1Fam,...,308,0,0,0,0,0,0,0,8,2009
92,RL,Pave,IR1,HLS,AllPub,Inside,Gtl,Norm,Norm,1Fam,...,432,0,0,44,0,0,0,0,8,2009
817,RL,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Norm,Norm,1Fam,...,857,150,59,0,0,0,0,0,7,2008
302,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,1Fam,...,843,468,81,0,0,0,0,0,1,2006


In [47]:
# Get list of categorical variables
s = (X_train.dtypes == 'object')
obj_cols = list(s[s].index)
obj_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [48]:
X_train[obj_cols].nunique().sort_values()

Street           2
Utilities        2
CentralAir       2
LandSlope        3
PavedDrive       3
LotShape         4
LandContour      4
KitchenQual      4
ExterQual        4
MSZoning         5
LotConfig        5
BldgType         5
HeatingQC        5
ExterCond        5
Functional       6
Heating          6
RoofStyle        6
Condition2       6
Foundation       6
SaleCondition    6
RoofMatl         7
HouseStyle       8
SaleType         9
Condition1       9
dtype: int64

In [49]:
# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

## Score from Approach 1 (Drop Categorical Variables)

In [50]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop categorical variables):
17952.591404109586


## Score from Approach 2 (Label Encoding)

In [51]:
# Make copy to avoid changing original data 
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

In [52]:
# Apply label encoder to each column with categorical data
for col in obj_cols:
    label_X_train[col] = LabelEncoder().fit_transform(X_train[col])
    label_X_valid[col] = LabelEncoder().fit_transform(X_valid[col])

In [53]:
print("MAE from Approach 2 (Label Encoding):") 
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Label Encoding):
17596.74551369863


In [54]:
label_X_train.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,BldgType,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
618,3,1,3,3,0,4,0,2,2,0,...,774,0,108,0,0,260,0,0,7,2007
870,3,1,3,3,0,4,0,4,2,0,...,308,0,0,0,0,0,0,0,8,2009
92,3,1,0,1,0,4,0,2,2,0,...,432,0,0,44,0,0,0,0,8,2009
817,3,1,0,3,0,1,0,2,2,0,...,857,150,59,0,0,0,0,0,7,2008
302,3,1,0,3,0,0,0,2,2,0,...,843,468,81,0,0,0,0,0,1,2006
