# **Training a RandomForestRegressor**

In [0]:
import math
import time
import itertools
import numpy
import pandas
import sklearn
from sklearn.utils import shuffle	
from sklearn.model_selection import (train_test_split, ShuffleSplit, KFold, RandomizedSearchCV, GridSearchCV)
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import (MinMaxScaler,LabelBinarizer)
from sklearn.decomposition import (FactorAnalysis, PCA, TruncatedSVD, FastICA)
from sklearn.manifold import Isomap
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from pandas.tools.plotting import scatter_matrix
from scipy.stats import (spearmanr, pearsonr)
from scipy.linalg import svd
from scipy import stats

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from google.colab import files

In [0]:
# Prompt for a file upload through the google.colab `files` helper, then read
# the housing data from "housing.csv" using a relative path.
# NOTE(review): assumes the uploaded file is named "housing.csv" — confirm.
uploaded = files.upload()
dataset = pandas.read_csv("housing.csv")

## **Preprocessing**

In [0]:
def normalizer(dataset):
    """Min-max scale every column of ``dataset``.

    A fresh ``MinMaxScaler`` is fitted on ``dataset`` and applied to it.

    Args:
        dataset: DataFrame whose columns are all to be rescaled.

    Returns:
        A new DataFrame holding the scaled values under the original column
        names. The index of ``dataset`` is not carried over: the result gets a
        default index.
    """
    rescaled = MinMaxScaler().fit_transform(dataset)
    return pandas.DataFrame(rescaled, columns=dataset.columns)

def one_hot_encoder(column):
    """Binarize the labels of ``column`` with a ``LabelBinarizer``.

    Args:
        column: pandas object whose ``.values`` hold the class labels.

    Returns:
        A DataFrame of the binarized labels whose columns are named after the
        classes found by the binarizer. The result has a default index rather
        than the index of ``column``.
    """
    binarizer = LabelBinarizer()
    indicator_matrix = binarizer.fit_transform(column.values)
    return pandas.DataFrame(data=indicator_matrix, columns=binarizer.classes_)

def evaluate_model(model, X, Y):
    """Score a fitted model on held-out data.

    Args:
        model: fitted estimator exposing ``predict(X)``.
        X: feature matrix passed unchanged to ``model.predict``.
        Y: true target values (single-column DataFrame, Series or array-like).

    Returns:
        Tuple ``(mae, accuracy)`` where ``mae`` is the mean absolute error and
        ``accuracy`` is ``100 - mean absolute percentage error``. The
        percentage error divides by ``Y``, so targets equal to zero make the
        accuracy undefined.
    """
    # Flatten both sides to 1-D so the subtraction is element-wise whatever
    # container the targets arrive in.
    actual = numpy.asarray(Y).ravel()
    # Use the estimator that was passed in, not a notebook-level global.
    predictions = numpy.asarray(model.predict(X)).ravel()
    errors = numpy.abs(predictions - actual)
    mae = numpy.mean(errors)
    mape = 100 * (errors / actual)
    accuracy = 100 - numpy.mean(mape)
    return mae, accuracy

def train_model(model, X, Y, base_accuracy):
  """Cross-validate ``model`` over 10 KFold splits and print averaged metrics.

  For each fold the model is re-fitted on the training rows, then scored on
  the held-out rows with ``evaluate_model`` (MAE and accuracy) and with
  ``model.score`` (reported as R^2).

  Args:
    model: estimator exposing ``fit``, ``predict`` and ``score``.
    X: feature DataFrame, indexed positionally with ``.iloc``.
    Y: target DataFrame/Series with the same row order as ``X``.
    base_accuracy: reference accuracy; only used for the printed difference.

  Returns:
    The accuracy averaged over the folds.
  """
  print("RandomForestRegressor")
  n_folds = 10
  splitter = KFold(n_splits=n_folds)
  # Running means: each fold contributes its metric divided by the fold count.
  mean_mae, mean_accuracy, mean_r2 = 0, 0, 0
  for fold_number, (train_rows, test_rows) in enumerate(splitter.split(X), start=1):
    print("fold", fold_number)
    X_fit, Y_fit = X.iloc[train_rows, :], Y.iloc[train_rows]
    X_eval, Y_eval = X.iloc[test_rows, :], Y.iloc[test_rows]

    model.fit(X_fit, Y_fit.values.ravel())
    fold_mae, fold_accuracy = evaluate_model(model, X_eval, Y_eval)
    mean_mae += fold_mae/n_folds
    mean_accuracy += fold_accuracy/n_folds
    mean_r2 += model.score(X_eval, Y_eval)/n_folds
  print("MAE :", mean_mae)
  print("Accuracy :", mean_accuracy, "%")
  print("R^2 :", mean_r2)
  print("accuracy - base_accuracy :", mean_accuracy-base_accuracy, "%")
  return mean_accuracy

In [0]:
# One-hot encode "ocean_proximity": build the indicator columns, drop the
# original column, then attach one new column per class.
ocean_proximity = one_hot_encoder(dataset[["ocean_proximity"]])
dataset = dataset.drop('ocean_proximity', axis=1)
# The assignment aligns on the index; one_hot_encoder returns a default index,
# so this presumably relies on `dataset` still having its default index here.
dataset[ocean_proximity.columns] = ocean_proximity

In [0]:
# Fill the missing "total_bedrooms" values with a LinearRegression that
# predicts them from "total_rooms", fitted on the rows where both are known.
notna = dataset.total_bedrooms.notna()

model = linear_model.LinearRegression()
model.fit(dataset.total_rooms.values[notna].reshape(-1,1), dataset.total_bedrooms.values[notna].reshape(-1,1))

isna = dataset.total_bedrooms.isna()

# Skip prediction when nothing is missing: predicting on zero rows is an error.
if isna.any():
    missing_bedrooms = model.predict(dataset.total_rooms.values[isna].reshape(-1,1))
    # Write through a single .loc call on the frame itself. The previous
    # `dataset.total_bedrooms.loc[isna] = ...` was a chained assignment that
    # may write to a temporary copy (SettingWithCopyWarning).
    dataset.loc[isna, "total_bedrooms"] = numpy.ravel(missing_bedrooms)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [0]:
# First training: baseline on the encoded, imputed data before any further
# transformation.
# NOTE(review): neither shuffle() nor RandomForestRegressor() is given a
# random_state here, so results presumably differ from run to run.
dataset = shuffle(dataset)
Y = dataset[["median_house_value"]]
X = dataset.drop(['median_house_value'], axis=1)
rf = RandomForestRegressor()
# 50 is the reference for the printed "accuracy - base_accuracy" line of this
# run; the returned average accuracy becomes the baseline for later runs.
base_accuracy = train_model(rf, X, Y, 50)

RandomForestRegressor
fold 1




fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
MAE : 33047.393609496125
Accuracy : 81.6510877444245 %
R^2 : 0.8080048546619486
accuracy - base_accuracy : 31.6510877444245 %


In [0]:
# Box-Cox transformation to bring the numeric columns closer to a normal
# distribution. Each column is transformed independently and replaced in place.
boxcox_columns = [
    'median_house_value',
    'population',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'households',
    'median_income',
]

# Keep the fitted lambda and its confidence interval for every column.
# The lambda is needed to map values (e.g. predicted house values) back to the
# original scale, for instance with scipy.special.inv_boxcox.
boxcox_params = {}
for column in boxcox_columns:
    transformed, maxlog, interval = stats.boxcox(dataset[column], alpha=0.05)
    boxcox_params[column] = (maxlog, interval)
    dataset[column] = transformed

In [0]:
# Second training: same procedure as the first run, now on the Box-Cox
# transformed columns.
# NOTE(review): "median_house_value" was transformed too, so the MAE and the
# percentage-based accuracy are computed on the transformed scale, while
# base_accuracy was measured on the raw scale. The printed difference is
# therefore not a like-for-like comparison; R^2 is the more comparable figure.
dataset = shuffle(dataset)
Y = dataset[["median_house_value"]]
X = dataset.drop(['median_house_value'], axis=1)
rf = RandomForestRegressor()
train_model(rf, X, Y, base_accuracy)

RandomForestRegressor
fold 1




fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
MAE : 0.7405697085263883
Accuracy : 97.35486053065316 %
R^2 : 0.8261153263960028
accuracy - base_accuracy : 15.703772786228654 %


97.35486053065316

In [0]:
def forward_feature_selection(model, X, Y, evaluation_function, kfold=3):
  """Greedy forward feature selection driven by cross-validated accuracy.

  Starting from no features, each round tries every remaining feature together
  with those already selected, scores the combination with KFold
  cross-validation, and keeps the candidate with the highest mean accuracy.
  The best score is tracked across rounds, so a feature is only added when it
  beats the best score seen so far. The search stops when a round brings no
  improvement or no features remain.

  Args:
    model: estimator exposing ``fit``; it is re-fitted for every fold.
    X: feature DataFrame; its column names are the candidate features.
    Y: target DataFrame/Series with the same row order as ``X``.
    evaluation_function: callable ``(model, X_test, Y_test)`` returning a
      pair whose second element is the accuracy to maximise.
    kfold: number of cross-validation folds.

  Returns:
    List of selected feature names, in the order they were added.
  """
  print ("Forward feature selection")
  splitter = KFold(n_splits=kfold)
  remaining = list(X.columns.values)
  chosen = []
  top_score = 0
  improved = True
  while improved and remaining:
    improved = False
    for candidate in remaining:
      trial_columns = chosen + [candidate]
      score_sum = 0
      for train_rows, test_rows in splitter.split(X):
        X_fit = X.iloc[train_rows, :][trial_columns]
        Y_fit = Y.iloc[train_rows]
        X_eval = X.iloc[test_rows, :][trial_columns]
        Y_eval = Y.iloc[test_rows]
        model.fit(X_fit, Y_fit.values.ravel())
        _, fold_score = evaluation_function(model, X_eval, Y_eval)
        score_sum += fold_score
      mean_score = score_sum / kfold
      if mean_score > top_score:
        improved = True
        top_score = mean_score
        top_candidate = candidate
        print ("best_accuracy ", top_score)
        print ("best_feature", top_candidate)
    if improved:
      # Commit the winner of this round and stop offering it as a candidate.
      chosen.append(top_candidate)
      remaining.remove(top_candidate)

  return chosen

In [0]:
# Reduce the dataset dimension with forward feature selection, scored by
# evaluate_model under 10-fold cross-validation.
dataset = shuffle(dataset)
Y = dataset[["median_house_value"]]
X = dataset.drop(['median_house_value'], axis=1)

rf = RandomForestRegressor()

selected_features = forward_feature_selection(rf, X, Y, evaluate_model, kfold=10)
# Keep only the selected columns; later cells train on this reduced X.
X = X[selected_features]

Forward feature selection




best_accuracy  94.25402744690851
best_feature longitude
best_accuracy  97.33929506689034
best_feature latitude
best_accuracy  97.4192733819291
best_feature median_income
best_accuracy  97.42118958996991
best_feature <1H OCEAN
best_accuracy  97.4281837285329
best_feature ISLAND
best_accuracy  97.44890191867312
best_feature NEAR BAY


In [0]:
# Third training: default RandomForestRegressor on the X and Y left by the
# feature-selection cell (X holds only the selected features).
rf = RandomForestRegressor()
train_model(rf, X, Y, base_accuracy)

RandomForestRegressor
fold 1




fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
MAE : 0.7169430854035516
Accuracy : 97.43922683818718 %
R^2 : 0.8340595971168877
accuracy - base_accuracy : 15.78813909376268 %


97.43922683818718

## **Tuning**

In [0]:
def random_rf_tuning(X, Y):
    """Randomized hyperparameter search for a RandomForestRegressor.

    Hyperparameter combinations are drawn at random from the pools defined
    below. 100 combinations are evaluated with 3-fold cross-validation and the
    best parameter set is printed. Nothing is returned.

    Args:
        X: feature matrix passed to ``RandomizedSearchCV.fit``.
        Y: target values passed to ``RandomizedSearchCV.fit``.
    """
    # Depth pool: 10, 20, ..., 110, plus None for "no limit".
    depth_pool = [int(x) for x in numpy.linspace(10, 110, num = 11)] + [None]

    random_grid = {
        # Number of trees in the forest: 200 to 2000 in 10 steps
        'n_estimators': [int(x) for x in numpy.linspace(start = 200, stop = 2000, num = 10)],
        # Number of features to consider at every split
        'max_features': ['auto', 'sqrt'],
        # Maximum number of levels in a tree
        'max_depth': depth_pool,
        # Minimum number of samples required to split a node
        'min_samples_split': [2, 5, 10],
        # Minimum number of samples required at each leaf node
        'min_samples_leaf': [1, 2, 4],
        # Whether each tree is trained on a bootstrap sample
        'bootstrap': [True, False],
    }

    # n_jobs is left at its default here; the recorded run log shows a single
    # sequential worker. random_state fixes which combinations are drawn.
    search = RandomizedSearchCV(
        estimator = RandomForestRegressor(),
        param_distributions = random_grid,
        n_iter = 100,
        cv = 3,
        verbose = 2,
        random_state = 42,
    )
    search.fit(X, Y)
    print(search.best_params_)

In [0]:
# Run the randomized search on the selected features; the target is flattened
# to 1-D first. Long-running: the recorded run below took about 176 minutes.
random_rf_tuning(X, Y.values.ravel())

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, total=   7.9s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.0s remaining:    0.0s


[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, total=   7.7s
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True, total=   7.7s
[CV] n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=True 
[CV]  n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=True, total=  25.9s
[CV] n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=True 
[CV]  n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=True, total=  25.8s
[CV] n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=True 
[CV]  n_estimators=2000, min_sa

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed: 176.4min finished


{'n_estimators': 800, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'bootstrap': True}


In [0]:
# Fourth training: hyperparameters copied by hand from the best_params_
# printed by the randomized search above.
rf = RandomForestRegressor(n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_features='auto', max_depth=None, bootstrap=True)
train_model(rf, X, Y, base_accuracy)

RandomForestRegressor
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
MAE : 0.6788583333020967
Accuracy : 97.57508295634847 %
R^2 : 0.8492685162951141
accuracy - base_accuracy : 15.923995211923966 %


97.57508295634847

In [0]:
# Exhaustive grid search in the neighbourhood of the randomized-search result:
# {'n_estimators': 800, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'bootstrap': True}

param_grid = {
    'bootstrap': [True],
    'max_depth': [None],
    'max_features': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [400, 600, 800]
}

rf = RandomForestRegressor()
# 3 * 3 * 3 * 3 = 81 candidates, each evaluated with 3-fold cross-validation.
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, verbose = 3)#, n_jobs = -1)
grid_search.fit(X, Y.values.ravel())
# Only the last expression of a cell is displayed, so the best parameters
# must be printed explicitly; as a bare expression they were silently dropped.
print(grid_search.best_params_)
grid_search.best_estimator_

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=400 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=400, score=0.841038099404525, total=   8.9s
[CV] bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=400 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.1s remaining:    0.0s


[CV]  bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=400, score=0.8453647549439836, total=   8.5s
[CV] bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=400 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   19.8s remaining:    0.0s


[CV]  bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=400, score=0.8352616374601725, total=   8.4s
[CV] bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=600 
[CV]  bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=600, score=0.8406810776655015, total=  12.9s
[CV] bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=600 
[CV]  bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=600, score=0.8452562190130477, total=  12.7s
[CV] bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=600 
[CV]  bootstrap=True, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=3, n_estimators=600, score=0.835789547359642, total=  12.6s
[CV] bootstrap=True, max_depth=None, max_features=2, 

[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed: 66.2min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=2,
           min_samples_split=5, min_weight_fraction_leaf=0.0,
           n_estimators=600, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [0]:
# Fifth and last training: the constructor call is the repr of
# grid_search.best_estimator_ shown above, pasted verbatim.
# NOTE(review): the grid search only varied bootstrap, max_depth, max_features,
# min_samples_leaf, min_samples_split and n_estimators; the remaining keyword
# arguments are presumably the defaults of the scikit-learn version used for
# that run — confirm they are still accepted before running elsewhere.
rf = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=2,
           min_samples_split=5, min_weight_fraction_leaf=0.0,
           n_estimators=600, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)
train_model(rf, X, Y, base_accuracy)

RandomForestRegressor
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
fold 10
MAE : 0.6788553333107743
Accuracy : 97.57514401314135 %
R^2 : 0.849263820849534
accuracy - base_accuracy : 15.924056268716853 %


97.57514401314135