In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset from the local CSV file
data = pd.read_csv('./melb_data.csv')

# Split the frame into the prediction target (Price) and the predictor columns
y = data['Price']
print(y.head())
X = data.drop(columns=['Price'])
print(X.head())

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Drop columns with missing values (simplest approach).
# The columns to drop are chosen from the training split only and then removed from
# both splits, so training and validation keep identical columns.
# drop() is reassigned instead of being called with inplace=True: the frames returned
# by train_test_split may be flagged as slices of X, and mutating them in place can
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)

# "Cardinality" means the number of unique values in a column
# Select categorical (object-dtype) columns with fewer than 10 distinct values.
# The threshold is convenient but arbitrary; Series.nunique() counts distinct values.
low_cardinality_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype == "object" and X_train_full[cname].nunique() < 10
]
# Number of distinct column labels remaining in the training frame
print(X_train_full.columns.nunique())
print(
    "Low Cardinality Columns: ",
    low_cardinality_cols,
    X_train_full[low_cardinality_cols].dtypes.tolist(),
)


# Select numerical columns (int64 / float64 dtypes only)
numerical_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]
print("numerical_cols: ", numerical_cols)

# Keep selected columns only (categorical first, then numerical); copy so that later
# edits to X_train / X_valid do not touch the full frames
my_cols = low_cardinality_cols + numerical_cols
X_train, X_valid = X_train_full[my_cols].copy(), X_valid_full[my_cols].copy()

0    1480000.0
1    1035000.0
2    1465000.0
3     850000.0
4    1600000.0
Name: Price, dtype: float64
       Suburb           Address  Rooms Type Method SellerG       Date   
0  Abbotsford      85 Turner St      2    h      S  Biggin  3/12/2016  \
1  Abbotsford   25 Bloomburg St      2    h      S  Biggin  4/02/2016   
2  Abbotsford      5 Charles St      3    h     SP  Biggin  4/03/2017   
3  Abbotsford  40 Federation La      3    h     PI  Biggin  4/03/2017   
4  Abbotsford       55a Park St      4    h     VB  Nelson  4/06/2016   

   Distance  Postcode  Bedroom2  Bathroom  Car  Landsize  BuildingArea   
0       2.5    3067.0       2.0       1.0  1.0     202.0           NaN  \
1       2.5    3067.0       2.0       1.0  0.0     156.0          79.0   
2       2.5    3067.0       3.0       2.0  0.0     134.0         150.0   
3       2.5    3067.0       3.0       2.0  1.0      94.0           NaN   
4       2.5    3067.0       3.0       1.0  2.0     120.0         142.0   

   YearBuilt 

In [32]:
# Boolean mask indexed by column name: True where the column's dtype is object
s = X_train.dtypes.eq('object')
# Names of the columns flagged by the mask, i.e. the categorical columns
object_cols = s[s].index.tolist()
print(s)
print(s[s])
print("Categorical variables:")
print(object_cols)


Type              True
Method            True
Regionname        True
Rooms            False
Distance         False
Postcode         False
Bedroom2         False
Bathroom         False
Landsize         False
Lattitude        False
Longtitude       False
Propertycount    False
dtype: bool
Type          True
Method        True
Regionname    True
dtype: bool
Categorical variables:
['Type', 'Method', 'Regionname']


In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features and target used to evaluate the fitted model.
    n_estimators : int, default 100
        Forwarded to ``RandomForestRegressor(n_estimators=...)``.
    random_state : int, default 0
        Forwarded to ``RandomForestRegressor(random_state=...)``; keeping it fixed
        makes scores comparable between calls.

    Returns
    -------
    The value of ``mean_absolute_error(y_valid, predictions)``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)


## Categorical Variables
There are three common approaches to handling categorical variables:
1. Drop Categorical Variables -> simply removing them from the dataset
2. Ordinal Encoding -> assigning each unique value to a different integer
3. One-Hot Encoding -> creating new columns indicating the presence (or absence) of each possible value in the original data

### Drop Categorical Variables 

In [34]:
# Approach 1: keep only the non-object columns of each split
dropped_X_train, dropped_X_valid = (
    frame.select_dtypes(exclude=["object"]) for frame in (X_train, X_valid)
)

### Ordinal Encoding

In [35]:
from sklearn.preprocessing import OrdinalEncoder

# Make copy to avoid changing original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Fit the encoder on the training split only, then reuse the learned mapping on validation.
# A category that appears only in the validation split is encoded as -1 instead of
# raising an error, in the same spirit as handle_unknown='ignore' on the one-hot encoder.
ord_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
label_X_train[object_cols] = ord_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ord_encoder.transform(X_valid[object_cols])


### One-Hot Encoding


We set `handle_unknown='ignore'` to avoid errors when the validation data contains classes that aren't
represented in the training data, and setting `sparse_output=False` (named `sparse` before scikit-learn 1.2) ensures that the encoded columns are
returned as a numpy array (instead of a sparse matrix).


In [36]:
from sklearn.preprocessing import OneHotEncoder

# Fit the one-hot encoder on the training categoricals and reuse it for validation
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# The encoder output carries no row labels, so attach each split's original index
# while building the frames
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols]),
    index=X_train.index,
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[object_cols]),
    index=X_valid.index,
)

# Numerical part of each split: the categorical columns are replaced by their encoding
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

# Put the numerical features and the one-hot columns side by side
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The one-hot columns are labelled with integers; make every column label a string
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

### Which approach is best?

In [37]:
# Score every encoding strategy with the same model and report its validation MAE
for approach_label, train_features, valid_features in [
    ("MAE from Approach 1 (Drop categorical variables):", dropped_X_train, dropped_X_valid),
    ("MAE from Approach 2 (Ordinal Encoding):", label_X_train, label_X_valid),
    ("MAE from Approach 3 (One-Hot Encoding):", OH_X_train, OH_X_valid),
]:
    print(approach_label)
    print(score_dataset(train_features, valid_features, y_train, y_valid))

MAE from Approach 1 (Drop categorical variables):
175703.48185157913
MAE from Approach 2 (Ordinal Encoding):
165936.40548390493
MAE from Approach 3 (One-Hot Encoding):
166089.4893009678


### You can check the files day4_1 and day4_2 to learn why we need these categorization strategies.