In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

# ___Random Forests___
--------------

In [1]:
# A widely used and effective class of ML models is called ensembles.
# Ensembles are made by combining multiple individual models.
# These aggregate models are more powerful than any of the constituent models alone.

In [2]:
# Their effectiveness comes from combining multiple models into an aggregate.
# Even though the individual models may show good performance, they are prone to specific shortcomings.
# They may overfit/underfit certain subsets/features of data.
# By combining multiple models and averaging them, we reduce the risk of such shortcomings.

In [3]:
# Random forests are an ensemble of decision trees.
# Can be used for classification and regression.

# Since decision trees are prone to overfitting data, combining multiple decision trees helps mitigate this problem.
# Multiple decision trees working together show better generalizing capacity.
# The trees that make up the ensemble must be diverse; this diversity is introduced through random variation during tree building.

In [4]:
# Random variation is introduced into tree building in two ways.
# 1) The data used to train each tree is chosen randomly.
# 2) The features considered at each split of each tree are also randomly selected.

In [5]:
# To create a random forest model, you first decide on how many trees to build.
# Each tree will be built from a different random sample of the data.
# These samples are called the bootstrap samples.

# If we have a training dataset with n records,
# A bootstrap sample is created by randomly selecting n number of records from the training set WITH REPLACEMENT.
# The size of the bootstrap sample is identical to the original training set (in contrast to k fold cross validation)
# These samples may be missing some rows from the original training set and may have multiple occurrences of a few records.

# In contrast to a standalone decision tree, where the best feature to split on at each node is chosen from all available features,
# trees in random forests choose the best split feature at each node from a random subset of the features.

# These randomizations virtually guarantee that decision trees in random forests won't be identical.
# Random forest models are quite sensitive to the number of randomly chosen features to be considered at nodes
# (called n below; this is the max_features parameter in scikit-learn, not the number of records).
# n = 1 -> each split can only use the single randomly chosen feature at that node -> diverse, more complex trees.
# n almost equal to the total number of features -> forests with similar and simpler trees (less diversity).

In [6]:
# Random forests make predictions by first having every tree make a prediction for a given data point.
# For regression, the overall prediction is the mean of the predictions made by all trees.
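# For classification, each tree predicts class probabilities; these are averaged across all trees
# and the class with the highest average probability is the overall prediction (a weighted, "soft" vote).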

In [None]:
# Random forests, like decision trees, do not require feature scaling or similar preprocessing.

In [8]:
# Load the breast cancer dataset bundled with scikit-learn (features in .data, labels in .target).
bcancer = load_breast_cancer()

# Hold out 20% of the records for testing.
# random_state pins the shuffle so a Restart & Run All reproduces the same split (and the same score);
# stratify keeps the class proportions the same in the train and test partitions.
train_x, test_x, train_y, test_y = train_test_split(
    bcancer.data,
    bcancer.target,
    train_size = 0.8,
    stratify = bcancer.target,
    random_state = 42,
)

In [9]:
# Sanity check: (number of training records, number of features) in the training split.
train_x.shape

(455, 30)

In [31]:
# Build and fit the random forest on the training split.
# fit() returns the estimator itself, so the fitted model is what gets bound to rfClassifier.
rfClassifier = RandomForestClassifier(
    n_estimators = 250,     # number of trees in the forest
    max_depth = 5,          # upper bound on the depth of each tree
    min_samples_leaf = 5,   # every leaf must contain at least 5 training samples
    max_features = 8,       # size of the random feature subset considered at each split
    n_jobs = -1,            # train the trees in parallel on all available cores
    random_state = 42,      # fix bootstrap sampling and feature selection so results are reproducible
).fit(train_x, train_y)

In [32]:
# Mean accuracy of the fitted forest on the held-out test split.
rfClassifier.score(test_x, test_y)

0.9473684210526315