# Random Forest

Random forests or random decision forests are an ensemble learning method for 
classification, regression and other tasks, that operate by constructing a 
multitude of decision trees at training time and outputting the class that 
is the mode of the classes (classification) or mean prediction (regression) of 
the individual trees. 
Random decision forests correct for decision trees' habit of overfitting to 
their training set.
    - From Wikipedia

## For Classification 

### Decision Tree

In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

# Show the built-in documentation for the decision tree classifier
# (constructor parameters, attributes, and methods)
help(DecisionTreeClassifier)

#### Load and Classify the Iris Data

In [None]:
RANDOM_SEED = 13  # fix the seed on each iteration

# Load data
iris = load_iris()
# Slice data into the feature matrix and the class labels
X = iris.data
y = iris.target

# Shuffle the rows. Seed NumPy's global generator first so that the
# permutation (and therefore every result below) is the same on each run
# of this cell.
idx = np.arange(X.shape[0])
np.random.seed(RANDOM_SEED)
np.random.shuffle(idx)
X = X[idx]
y = y[idx]

# Create and fit the model with full data
model = DecisionTreeClassifier(max_depth=None)
classifier = model.fit(X, y)  # Fit the model to this data
# Accuracy measured on the same samples the tree was trained on
scores = classifier.score(X, y)


print("Decision Tree Performance: {}".format(scores))


In [None]:
import sklearn.model_selection

# Evaluate the estimator with 5-fold cross-validation. The call is left as
# the final expression so the notebook displays the returned scores.
sklearn.model_selection.cross_val_score(estimator=model, X=X, y=y, cv=5)

#### Comparing the score on the full training data with the cross-validation scores, we expect that the model trained on the full data is overfitting.

---

### Random Forest


In [None]:
# Notice the Random Forest is from the ensemble set
from sklearn.ensemble import RandomForestClassifier
# Show the built-in documentation for the random forest classifier
help(RandomForestClassifier)

In [None]:
RANDOM_SEED = 13  # seed for NumPy's global random generator

# Fetch the Iris dataset and separate the features from the labels
iris = load_iris()
X, y = iris.data, iris.target

# Put the samples in a seeded (repeatable) random order by shuffling the
# row indices and reindexing both arrays with them
np.random.seed(RANDOM_SEED)
idx = np.arange(X.shape[0])
np.random.shuffle(idx)
X, y = X[idx], y[idx]

# Build a forest of 10 trees, train it on every sample, then measure its
# accuracy on that same training data
model = RandomForestClassifier(n_estimators=10)
classifier = model.fit(X, y)
scores = classifier.score(X, y)

print("Random Forest Performance: {}".format(scores))

#### Notice we have generalized a little better!  

Let's look at the cross-validation for comparison.

In [None]:
import sklearn.model_selection

# Evaluate the estimator with 5-fold cross-validation. The call is left as
# the final expression so the notebook displays the returned scores.
sklearn.model_selection.cross_val_score(estimator=model, X=X, y=y, cv=5)

---

# Look at some visuals of Decision Tree versus Random Forest decision boundaries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn import clone
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
# Notice the Random Forest is from the ensemble set
from sklearn.ensemble import RandomForestClassifier

# Parameters
n_classes = 3
n_estimators = 30
cmap = plt.cm.RdYlBu
plot_step = 0.02  # fine step width for decision surface contours
plot_step_coarser = 0.5  # step widths for coarse classifier guesses
RANDOM_SEED = 13  # fix the seed on each iteration

# One tall figure that holds the whole grid of subplots
fig = plt.figure(figsize=(10, 30))

# Load data
iris = load_iris()

plot_idx = 1

# Unfitted template estimators; a fresh copy is trained for every subplot
models = [DecisionTreeClassifier(max_depth=None),
          RandomForestClassifier(n_estimators=n_estimators)]

# Every pair of the four Iris features (recall data is actually 4-D).
# Restricting to two features is strictly for visualization.
feature_pairs = ([0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3])

for pair in feature_pairs:
    for model in models:
        # We only take the two corresponding features
        X = iris.data[:, pair]
        y = iris.target

        # Shuffle. The seed is reset right before each shuffle/fit so that
        # every model starts from the same global random state.
        idx = np.arange(X.shape[0])
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # Standardize the data to mean = 0 and standard deviation of 1
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        X = (X - mean) / std   # look, we are turning the data into Z-scores

        # Train an unfitted copy so the templates in `models` are never
        # modified; everything below uses this fitted copy.
        clf = clone(model)
        clf.fit(X, y)

        scores = clf.score(X, y)

        # Title for each column and the console: the class name without
        # its trailing "Classifier"
        model_title = type(clf).__name__[:-len("Classifier")]

        model_details = model_title
        if hasattr(clf, "estimators_"):
            model_details += " with {} estimators".format(
                len(clf.estimators_))
        print(model_details + " with features", pair,
              "has a score of", scores)
        print()

        # One row per feature pair and one column per model
        plt.subplot(len(feature_pairs), len(models), plot_idx)
        if plot_idx <= len(models):
            # Add a title at the top of each column
            plt.title(model_title)

        # Now plot the decision boundary using a fine mesh as input to a
        # filled contour plot
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                             np.arange(y_min, y_max, plot_step))
        # Flatten the mesh once into (x, y) rows; it is reused for every
        # prediction on the fine grid
        grid_points = np.c_[xx.ravel(), yy.ravel()]

        # Plot either a single DecisionTreeClassifier or alpha blend the
        # decision surfaces of the ensemble of classifiers
        if isinstance(clf, DecisionTreeClassifier):
            Z = clf.predict(grid_points).reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=cmap)
        else:
            # Choose the alpha blend level with respect to the number of
            # fitted trees so the overlaid surfaces add up to full opacity
            estimator_alpha = 1.0 / len(clf.estimators_)
            for tree in clf.estimators_:
                Z = tree.predict(grid_points).reshape(xx.shape)
                plt.contourf(xx, yy, Z, alpha=estimator_alpha, cmap=cmap)

        # Build a coarser grid to plot a set of ensemble classifications
        # to show how these are different to what we see in the decision
        # surfaces. These points are regularly spaced and do not have a
        # black outline
        xx_coarser, yy_coarser = np.meshgrid(
            np.arange(x_min, x_max, plot_step_coarser),
            np.arange(y_min, y_max, plot_step_coarser))
        Z_points_coarser = clf.predict(
            np.c_[xx_coarser.ravel(), yy_coarser.ravel()]
        ).reshape(xx_coarser.shape)
        plt.scatter(xx_coarser, yy_coarser, s=15,
                    c=Z_points_coarser, cmap=cmap,
                    edgecolors="none")

        # Plot the training points, these are clustered together and have a
        # black outline
        plt.scatter(X[:, 0], X[:, 1], c=y,
                    cmap=ListedColormap(['r', 'y', 'b']),
                    edgecolor='k', s=20)
        plot_idx += 1  # move on to the next plot in sequence

plt.suptitle("Classifiers on feature subsets of the Iris dataset")
plt.axis("tight")

plt.show()

# For Feature Selection

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification

# This is another type of Random Forest
from sklearn.ensemble import ExtraTreesClassifier
# Show the built-in documentation for the extra-trees classifier
help(ExtraTreesClassifier)

In [None]:

# Generate a synthetic two-class problem with 1000 samples in which only
# 3 of the 10 features are informative (consult the API for more details)
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=3,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    random_state=0,
    shuffle=False,
)

# Train an extra-trees forest of 250 trees on the full dataset
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(X, y)

# Importances reported by the forest as a whole, plus the spread
# (standard deviation) of each feature's importance across the single trees
importances = forest.feature_importances_
per_tree_importances = np.array(
    [tree.feature_importances_ for tree in forest.estimators_])
std = per_tree_importances.std(axis=0)

# Feature indices ordered from most to least important (argsort returns
# positions, not values; reversing makes the order descending)
indices = np.argsort(importances)[::-1]

# Report the ranking on the console, numbering from 1
print("Feature ranking:")
for rank, feature in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature, importances[feature]))

# Bar chart of the importances in ranked order, with the per-tree standard
# deviation drawn as error bars and the feature index as each tick label
n_features = X.shape[1]
bar_positions = range(n_features)
plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices],
        yerr=std[indices], color="r", align="center")
plt.xticks(bar_positions, indices)
plt.xlim([-1, n_features])
plt.show()

# Save your Notebook!