In [43]:
import pandas as pd

# Read the Melbourne housing CSV; the path is relative to the working directory.
melbourne_data = pd.read_csv(filepath_or_buffer='melb_data.csv')
# Last expression of the cell: display summary statistics for the data.
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [44]:
# Show the column labels so the feature names used in later cells can be checked
# against the exact spelling in the data.
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

Dropping the rows that contain missing values from the data

In [45]:
# Keep only the rows that have a value in every column. The result is rebound to
# the same name, so every later cell sees the filtered frame.
melbourne_data = melbourne_data.dropna(axis='index', how='any')

In [51]:
# Prediction target: the sale price column.
y = melbourne_data['Price']

# Predictor columns; 'Lattitude' and 'Longtitude' are spelled exactly as in the
# dataset's column index.
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]
X = melbourne_data[melbourne_features]

In [58]:
from sklearn.model_selection import train_test_split

# Split predictors and target into training and validation parts.
# test_size=0.25 spells out the fraction scikit-learn uses when neither
# test_size nor train_size is given; random_state=1 makes the shuffle repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [59]:
from sklearn.tree import DecisionTreeRegressor
# Baseline decision tree with no size limit; random_state=0 fixes the estimator's seed.
model = DecisionTreeRegressor(random_state=0)
# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(X=train_X, y=train_y)

DecisionTreeRegressor(random_state=0)

In [60]:
from sklearn.metrics import mean_absolute_error
# Predict prices for the held-out rows and report the mean absolute error
# against their true prices.
val_predictions = model.predict(X=val_X)
mean_absolute_error(y_true=val_y, y_pred=val_predictions)

251520.819883796

In [63]:
from sklearn.metrics import mean_absolute_error
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A new ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``train_X`` / ``train_y``, used to predict ``val_X``, and the predictions
    are scored against ``val_y`` with ``mean_absolute_error``.

    Parameters
    ----------
    max_leaf_nodes : value passed to the tree's ``max_leaf_nodes`` argument.
    train_X, train_y : training predictors and targets.
    val_X, val_y : validation predictors and targets.

    Returns
    -------
    The value returned by ``mean_absolute_error(val_y, predictions)``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    # fit() returns the estimator itself, so prediction can be chained onto it.
    predicted = tree.fit(train_X, train_y).predict(val_X)
    return mean_absolute_error(val_y, predicted)

In [64]:
# Compare validation error across several tree sizes to look for the point
# between under- and over-fitting.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes=max_leaf_nodes,
        train_X=train_X,
        val_X=val_X,
        train_y=train_y,
        val_y=val_y,
    )
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  369673
Max leaf nodes: 50  		 Mean Absolute Error:  266644
Max leaf nodes: 500  		 Mean Absolute Error:  243613
Max leaf nodes: 5000  		 Mean Absolute Error:  256227


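In the code below, I set the `n_jobs` parameter on the model, which dramatically reduces the time taken to run it. `n_jobs=-1` means all available processors are used to run the jobs in parallel. The default is `None`, which means a single job unless a joblib parallel backend is active.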

In [70]:
import time
from sklearn.ensemble import RandomForestRegressor
# n_jobs=-1 lets the forest use all available processors; random_state=1 fixes the seed.
forest_model = RandomForestRegressor(random_state=1, n_jobs=-1)

# Time only the fit so the number matches the "training" label printed below.
# perf_counter() is the clock intended for measuring short elapsed intervals.
start = time.perf_counter()
forest_model.fit(train_X, train_y)
end = time.perf_counter()

# Prediction is deliberately outside the timed region.
predictions = forest_model.predict(val_X)
print("Time taken for training: ", end - start)
print(mean_absolute_error(val_y, predictions))
# Previously recorded timings in seconds, presumably without and with n_jobs=-1
# (TODO confirm which run produced which figure):
# 0.9250147342681885
# 0.2753736972808838

Time taken for training:  0.2650489807128906
190414.59149026


Up to this point we used only the rows that have no missing values. Now we will keep the rows that contain missing values as well.
We will still use only the numeric columns.

In [84]:
# Reload the raw file so rows with missing values are present again.
data = pd.read_csv('melb_data.csv')
y = data['Price']
# Predictors: every column except the target, restricted to non-object dtypes.
X = data.drop(columns=['Price']).select_dtypes(exclude=['object'])
# Hold out 20% of the rows for validation; random_state=1 keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

pandas.core.frame.DataFrame

In [80]:
def score_dataset(X_train, y_train, X_val, y_val):
    """Return the validation MAE of a small random forest trained on the given split.

    A ``RandomForestRegressor`` with ``n_estimators=10``, ``random_state=1`` and
    ``n_jobs=-1`` is fitted on ``X_train`` / ``y_train``; its predictions for
    ``X_val`` are scored against ``y_val`` with ``mean_absolute_error``.
    """
    forest = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=10)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_val, forest.predict(X_val))

In [81]:
# Names of the predictor columns that contain at least one missing value
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
# Drop those columns from the split of the full dataset (X_train / X_val).
# The earlier train_X / val_X come from a different, row-filtered split that only
# holds five hand-picked features, so they neither contain these columns nor
# line up row-for-row with y_train / y_val.
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_val = X_val.drop(columns=cols_with_missing)
# Validation MAE when the incomplete columns are simply removed
score_dataset(reduced_X_train, y_train, reduced_X_val, y_val)

187082.57548478153

In [89]:
from sklearn.impute import SimpleImputer
# Imputer with default settings; fill values are learned from the training rows
# only (fit_transform) and then re-used for the validation rows (transform).
my_imputer = SimpleImputer()

# The imputer output is wrapped in a new DataFrame, so restore the column labels
# and the row index of the corresponding input frame.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_val = pd.DataFrame(
    my_imputer.transform(X_val),
    columns=X_val.columns,
    index=X_val.index,
)

# Score against y_val, the targets produced by the same split as X_val
# (val_y belongs to the earlier, row-filtered split and has a different length).
score_dataset(imputed_X_train, y_train, imputed_X_val, y_val)


<class 'pandas.core.frame.DataFrame'>


174917.69150711832

In [90]:
# Work on copies so the indicator columns do not leak into X_train / X_val.
X_train_plus = X_train.copy()
X_val_plus = X_val.copy()

# For every column with missing values, add a boolean indicator column that is
# True where the original value was missing.
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_val_plus[col + '_was_missing'] = X_val_plus[col].isnull()

# Fill values are learned from the training rows only and re-used for validation.
my_imputer = SimpleImputer()

# The imputer output is wrapped in a new DataFrame, so restore the column labels
# and the row index of the corresponding input frame.
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus),
    columns=X_train_plus.columns,
    index=X_train_plus.index,
)
imputed_X_val_plus = pd.DataFrame(
    my_imputer.transform(X_val_plus),
    columns=X_val_plus.columns,
    index=X_val_plus.index,
)

# Score against y_val, the targets produced by the same split as X_val
# (val_y belongs to the earlier, row-filtered split and has a different length).
score_dataset(imputed_X_train_plus, y_train, imputed_X_val_plus, y_val)

177346.30320324007