In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# Raw training and test sets.
# NOTE(review): no index_col is given, so 'Id' is read as an ordinary column
# and stays among the predictors -- confirm that this is intended.
X = pd.read_csv('./input/train.csv')
X_test = pd.read_csv('./input/test.csv')

# Rows without a target are useless for training: discard them, then split
# the target off from the predictors.
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# To keep things simple, every column that has a missing value in the
# training data is removed from both the training and the test predictors.
cols_with_missing_values = X.columns[X.isnull().any()].tolist()
X = X.drop(columns=cols_with_missing_values)
X_test = X_test.drop(columns=cols_with_missing_values)

# Hold out 20% of the rows for validation (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [23]:
# Peek at the first five training rows.
X_train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,ExterQual,ExterCond,Foundation,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageCars,GarageArea,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
618,619,20,RL,11694,Pave,Reg,Lvl,AllPub,Inside,Gtl,NridgHt,Norm,Norm,1Fam,1Story,9,5,2007,2007,Hip,CompShg,CemntBd,CmentBd,Ex,TA,PConc,48,0,1774,1822,GasA,Ex,Y,1828,0,0,1828,0,0,2,0,3,1,Gd,9,Typ,1,3,774,Y,0,108,0,0,260,0,0,7,2007,New,Partial
870,871,20,RL,6600,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,PosN,Norm,1Fam,1Story,5,5,1962,1962,Hip,CompShg,MetalSd,MetalSd,TA,TA,CBlock,0,0,894,894,GasA,Gd,N,894,0,0,894,0,0,1,0,2,1,TA,5,Typ,0,1,308,Y,0,0,0,0,0,0,0,8,2009,WD,Normal
92,93,30,RL,13360,Pave,IR1,HLS,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,1Story,5,7,1921,2006,Gable,CompShg,Wd Sdng,Wd Sdng,TA,Gd,BrkTil,713,0,163,876,GasA,Ex,Y,964,0,0,964,1,0,1,0,2,1,TA,5,Typ,0,2,432,Y,0,0,44,0,0,0,0,8,2009,WD,Normal
817,818,20,RL,13265,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Mitchel,Norm,Norm,1Fam,1Story,8,5,2002,2002,Hip,CompShg,CemntBd,CmentBd,Gd,TA,PConc,1218,0,350,1568,GasA,Ex,Y,1689,0,0,1689,1,0,2,0,3,1,Gd,7,Typ,2,3,857,Y,150,59,0,0,0,0,0,7,2008,WD,Normal
302,303,20,RL,13704,Pave,IR1,Lvl,AllPub,Corner,Gtl,CollgCr,Norm,Norm,1Fam,1Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,Gd,TA,PConc,0,0,1541,1541,GasA,Ex,Y,1541,0,0,1541,0,0,2,0,3,1,Gd,6,Typ,1,3,843,Y,468,81,0,0,0,0,0,1,2006,WD,Normal


In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# function for testing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and score it on the validation split.

    Args:
        X_train: Predictors used to fit the model.
        X_valid: Predictors used to produce validation predictions.
        y_train: Targets matching ``X_train``.
        y_valid: Targets matching ``X_valid``.

    Returns:
        Mean absolute error between ``y_valid`` and the model's predictions
        for ``X_valid``.
    """
    # Fixed tree count and seed so that different preprocessing approaches are
    # compared with the same model configuration.
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [30]:
# Approach 1: keep only the non-object columns of each split.
X_train_numeric, X_valid_numeric = (
    split.select_dtypes(exclude=['object']) for split in (X_train, X_valid)
)

In [31]:
print('MAE for Approach 1 (Drop categorical columns):')
score_dataset(
    X_train=X_train_numeric,
    X_valid=X_valid_numeric,
    y_train=y_train,
    y_valid=y_valid,
)

MAE for Approach 1 (Drop categorical columns):


17952.591404109586

In [39]:
# All categorical (object-dtype) columns, in their original column order
object_cols = X_train.select_dtypes(['object']).columns.tolist()

# Columns safe for label encoding: the training and validation splits contain
# exactly the same set of labels, so an encoder fitted on the training split
# never meets an unseen label when it transforms the validation split.
# NOTE(review): equality is stricter than strictly required -- validation
# labels being a subset of the training labels would also be safe. Confirm
# before relaxing, since that changes which columns reach the model.
good_label_cols = [col for col in object_cols if set(X_train[col]) == set(X_valid[col])]

# Columns unsafe for label encoding; to be removed from the dataset.
# object_cols is filtered (rather than subtracting sets) so the list keeps the
# frame's column order: iterating a set of strings can yield a different order
# on every kernel start, which made this list and its printout unstable.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]
print('good label columns:',good_label_cols)
print('bad label columns: ',bad_label_cols)

good label columns: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'BldgType', 'HouseStyle', 'ExterQual', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleCondition']
bad label columns:  ['SaleType', 'Neighborhood', 'Utilities', 'Exterior2nd', 'HeatingQC', 'Foundation', 'Functional', 'Condition2', 'Condition1', 'RoofMatl', 'LandSlope', 'RoofStyle', 'ExterCond', 'Exterior1st', 'Heating']


In [40]:
from sklearn.preprocessing import LabelEncoder

# Approach 2: discard the categorical columns that cannot be label encoded safely.
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)

# One encoder is reused for every column: it is re-fitted on the training
# values of the column and then applied unchanged to the validation values.
label_encoding = LabelEncoder()

for col in good_label_cols:
    train_values, valid_values = label_X_train[col], label_X_valid[col]
    label_X_train[col] = label_encoding.fit_transform(train_values)
    label_X_valid[col] = label_encoding.transform(valid_values)

In [41]:
print("MAE from Approach 2 (Label Encoding):")
score_dataset(
    X_train=label_X_train,
    X_valid=label_X_valid,
    y_train=y_train,
    y_valid=y_valid,
)

MAE from Approach 2 (Label Encoding):


17675.942500000005

In [58]:
# Number of distinct values in each categorical column of the training split.
object_nunique = [X_train[col].nunique() for col in object_cols]
d = list(zip(object_cols, object_nunique))

# Show the (column, unique count) pairs, smallest count first.
sorted(d, key=lambda pair: pair[1])

[('Street', 2),
 ('Utilities', 2),
 ('CentralAir', 2),
 ('LandSlope', 3),
 ('PavedDrive', 3),
 ('LotShape', 4),
 ('LandContour', 4),
 ('ExterQual', 4),
 ('KitchenQual', 4),
 ('MSZoning', 5),
 ('LotConfig', 5),
 ('BldgType', 5),
 ('ExterCond', 5),
 ('HeatingQC', 5),
 ('Condition2', 6),
 ('RoofStyle', 6),
 ('Foundation', 6),
 ('Heating', 6),
 ('Functional', 6),
 ('SaleCondition', 6),
 ('RoofMatl', 7),
 ('HouseStyle', 8),
 ('Condition1', 9),
 ('SaleType', 9),
 ('Exterior1st', 15),
 ('Exterior2nd', 16),
 ('Neighborhood', 25)]

In [63]:
# Categorical columns with fewer than 10 distinct values.
low_cardinality_cols = [name for name, n_unique in d if n_unique < 10]
print(len(low_cardinality_cols))

24


In [52]:
# Interactive help lookup for the built-in `sorted` (IPython `?` syntax, not plain Python).
# Leftover from exploration; can be deleted once the notebook is final.
?sorted

Signature: sorted(iterable, /, *, key=None, reverse=False)
Docstring:
Return a new list containing all items from the iterable in ascending order.

A custom key function can be supplied to customize the sort order, and the
reverse flag can be set to request the result in descending order.
Type:      builtin_function_or_method


In [65]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# Interactive help lookup for OneHotEncoder (IPython `?` syntax, not plain Python).
# Leftover from exploration; can be deleted once the notebook is final.
?OneHotEncoder

Init signature:
OneHotEncoder(
    n_values=None,
    categorical_features=None,
    categories=None,
    drop=None,
    sparse=True,
    dtype=<class 'numpy.float64'>,
    handle_unknown='error',
)
Docstring:
Encode categorical integer features as a one-hot numeric array.

The input to this transformer should be an array-like of integers or
strings, denoting the values taken on by categorical (discrete) features.
The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
encoding scheme. This 