In [55]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error # sum(y_real - y_predicted) / n
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

In [2]:
# Interface Settings
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:98% !important; margin-left:1% !important; margin-right:auto !important;}</style>"))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

pd.set_option('display.max_rows', 70)

import warnings
warnings.filterwarnings("ignore")

In [4]:
home_data_file_path = 'train.csv'
home_data = pd.read_csv(home_data_file_path) 

In [7]:
# Separate target from predictors
y = home_data.SalePrice
X = home_data.drop(['SalePrice'], axis=1)

In [8]:
# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [9]:
# Drop columns with missing values (simplest approach)
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

In [83]:
# Select categorical columns with relatively low cardinality
low_cardinality_cols = [col for col in X_train_full.columns if X_train_full[col].nunique() < 10 and X_train_full[col].dtype == "object"]

In [84]:
# Select numerical columns
numerical_cols = [col for col in X_train_full.columns if X_train_full[col].dtype in ['int64', 'float64']]

In [85]:
selected_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[selected_cols].copy()
X_valid = X_valid_full[selected_cols].copy()

In [86]:
X_train.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,BldgType,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
618,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,1Fam,...,774,0,108,0,0,260,0,0,7,2007
870,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,PosN,Norm,1Fam,...,308,0,0,0,0,0,0,0,8,2009
92,RL,Pave,IR1,HLS,AllPub,Inside,Gtl,Norm,Norm,1Fam,...,432,0,0,44,0,0,0,0,8,2009
817,RL,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Norm,Norm,1Fam,...,857,150,59,0,0,0,0,0,7,2008
302,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,1Fam,...,843,468,81,0,0,0,0,0,1,2006


In [87]:
# Get list of categorical variables
s = (X_train.dtypes == 'object')
obj_cols = list(s[s].index)
obj_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [88]:
X_train[obj_cols].nunique().sort_values()

Street           2
Utilities        2
CentralAir       2
LandSlope        3
PavedDrive       3
LotShape         4
LandContour      4
KitchenQual      4
ExterQual        4
MSZoning         5
LotConfig        5
BldgType         5
HeatingQC        5
ExterCond        5
Functional       6
Heating          6
RoofStyle        6
Condition2       6
Foundation       6
SaleCondition    6
RoofMatl         7
HouseStyle       8
SaleType         9
Condition1       9
dtype: int64

In [89]:
# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

## Score from Approach 1 (Drop Categorical Variables)

In [90]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop categorical variables):
17952.591404109586


## Score from Approach 2 (Label Encoding)

In [91]:
# Make copy to avoid changing original data 
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

In [92]:
# Apply label encoder to each column with categorical data
for col in obj_cols:
    label_X_train[col] = LabelEncoder().fit_transform(X_train[col])
    label_X_valid[col] = LabelEncoder().fit_transform(X_valid[col])

In [93]:
print("MAE from Approach 2 (Label Encoding):") 
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Label Encoding):
17596.74551369863


## Score from Approach 3 (One-Hot Encoding)¶

In [94]:
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[obj_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[obj_cols]))

In [95]:
# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

In [96]:
# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(obj_cols, axis=1)
num_X_valid = X_valid.drop(obj_cols, axis=1)

In [97]:
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [98]:
print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

MAE from Approach 3 (One-Hot Encoding):
17514.224246575344


In [129]:
test_data_path = 'test.csv'
test_data = pd.read_csv(test_data_path)
test_X_ = test_data.select_dtypes(include=['object'])
test_X = test_data.copy()
test_X['BsmtFinSF1'].fillna((test_X['BsmtFinSF1'].mean()), inplace=True)
test_X['BsmtFinSF2'].fillna((test_X['BsmtFinSF2'].mean()), inplace=True)
test_X['BsmtUnfSF'].fillna((test_X['BsmtUnfSF'].mean()), inplace=True)
test_X['TotalBsmtSF'].fillna((test_X['TotalBsmtSF'].mean()), inplace=True)
test_X['BsmtFullBath'].fillna((test_X['BsmtFullBath'].mean()), inplace=True)
test_X['BsmtHalfBath'].fillna((test_X['BsmtHalfBath'].mean()), inplace=True)
test_X['GarageCars'].fillna((test_X['GarageCars'].mean()), inplace=True)
test_X['GarageArea'].fillna((test_X['GarageArea'].mean()), inplace=True)

In [102]:
# Columns that can be safely label encoded
good_label_cols = [col for col in obj_cols if 
                   set(X_train[col]) == set(X_valid[col]) == set(test_X_[col])]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(obj_cols)-set(good_label_cols))
        
print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be label encoded: ['Street', 'LotShape', 'LandContour', 'LotConfig', 'BldgType', 'ExterQual', 'CentralAir', 'PavedDrive', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['LandSlope', 'Condition1', 'Foundation', 'RoofStyle', 'Heating', 'Condition2', 'ExterCond', 'HouseStyle', 'MSZoning', 'Functional', 'KitchenQual', 'HeatingQC', 'SaleType', 'RoofMatl', 'Utilities']


In [103]:
# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)


In [105]:
# Apply label encoder 

for col in set(good_label_cols):
    label_X_train[col] = LabelEncoder().fit_transform(X_train[col])
    label_X_valid[col] = LabelEncoder().fit_transform(X_valid[col])

In [106]:
print("MAE from Approach 2 (Label Encoding):") 
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Label Encoding):
17691.3723630137


In [113]:
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in good_label_cols if X_train[col].nunique() < 10]

# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(obj_cols)-set(low_cardinality_cols))

In [114]:
print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

Categorical columns that will be one-hot encoded: ['Street', 'LotShape', 'LandContour', 'LotConfig', 'BldgType', 'ExterQual', 'CentralAir', 'PavedDrive', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['LandSlope', 'Condition1', 'Foundation', 'RoofStyle', 'Heating', 'Condition2', 'ExterCond', 'HouseStyle', 'MSZoning', 'Functional', 'KitchenQual', 'HeatingQC', 'SaleType', 'RoofMatl', 'Utilities']


In [136]:
# Use as many lines of code as you need!
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols])) 
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))  
OH_cols_test_X = pd.DataFrame(OH_encoder.transform(test_X[low_cardinality_cols]))  
# One-hot encoding removed index; put it back
OH_cols_train.index = X_train[low_cardinality_cols].index
OH_cols_valid.index = X_valid[low_cardinality_cols].index
OH_cols_test_X.index = test_X[low_cardinality_cols].index
# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(obj_cols, axis=1)
num_X_valid = X_valid.drop(obj_cols, axis=1)
num_test_X = test_X.drop(obj_cols, axis=1)
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_test_X = pd.concat([num_test_X, OH_cols_test_X], axis=1)

In [119]:
print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

MAE from Approach 3 (One-Hot Encoding):
17827.654280821916


In [144]:
len(list(OH_X_train.columns))

69

In [146]:
len(list(OH_test_X.columns))

91

In [147]:
len(list(OH_X_valid.columns))

69

In [148]:
OH_test_X = OH_test_X[list(OH_X_train.columns)]

In [124]:
max_estimators = range(50, 266, 2)

In [125]:
def get_mae(n, X_train, X_valid, y_train, y_valid, n_jobs=8, depth=None):
    model = RandomForestRegressor(n_estimators=n, criterion='mae', random_state=0,n_jobs=-1)
    model.fit(X_train, y_train)
    preds_val = model.predict(X_valid)
    mae = mean_absolute_error(y_valid, preds_val)
    return(mae)

In [126]:
# compare MAE with differing values of n_estimators
for n in max_estimators:
    my_mae = get_mae(n, OH_X_train, OH_X_valid, y_train, y_valid)
    print(F"MAX n_estimators: {n}\t Mean Absolute Error: {my_mae}")


MAX n_estimators: 50	 Mean Absolute Error: 18249.926369863013
MAX n_estimators: 52	 Mean Absolute Error: 18213.958574815595
MAX n_estimators: 54	 Mean Absolute Error: 18191.50336123795
MAX n_estimators: 56	 Mean Absolute Error: 18133.511619373778
MAX n_estimators: 58	 Mean Absolute Error: 18186.53601794993
MAX n_estimators: 60	 Mean Absolute Error: 18078.4151826484
MAX n_estimators: 62	 Mean Absolute Error: 18101.633451171012
MAX n_estimators: 64	 Mean Absolute Error: 18078.75353167808
MAX n_estimators: 66	 Mean Absolute Error: 18017.93622872561
MAX n_estimators: 68	 Mean Absolute Error: 17997.85228646253
MAX n_estimators: 70	 Mean Absolute Error: 17953.153767123287
MAX n_estimators: 72	 Mean Absolute Error: 17873.847222222226
MAX n_estimators: 74	 Mean Absolute Error: 17865.33233061829
MAX n_estimators: 76	 Mean Absolute Error: 17825.47999279019
MAX n_estimators: 78	 Mean Absolute Error: 17830.463382507904
MAX n_estimators: 80	 Mean Absolute Error: 17868.946875
MAX n_estimators: 82	 M

KeyboardInterrupt: 

In [150]:
model = RandomForestRegressor(n_estimators=148, criterion='mae', random_state=0)
model.fit(OH_X_train, y_train)
test_preds = model.predict(OH_test_X)

output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': test_preds})
output.to_csv('submission_forest.csv', index=False)

RandomForestRegressor(criterion='mae', n_estimators=148, random_state=0)