In [144]:
%matplotlib
import scipy
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris, load_breast_cancer, make_moons, make_blobs, make_circles
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
import pandas as pd

import mglearn


def plot_feature_importances(model, dataset):
    '''
    Draw a horizontal bar chart with one bar per feature of the dataset.
    :param model: Fitted estimator exposing ``feature_importances_``.
    :param dataset: Object with a ``data`` array (features along axis 1)
        and matching ``feature_names``.
    :return: Returns None.
    '''
    feature_count = dataset.data.shape[1]
    importances = model.feature_importances_
    plt.barh(range(feature_count), importances, align='center')
    # Label every bar with the name of the feature it represents.
    tick_positions = np.arange(feature_count)
    plt.yticks(tick_positions, dataset.feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')


def run_random_forest():
    '''
    Fit a 10-estimator random forest on the breast cancer data, print its
    score on the train and test splits, and plot its feature importances.
    :return: Returns None.
    '''
    dataset = load_breast_cancer()
    features_train, features_test, labels_train, labels_test = train_test_split(
        dataset.data, dataset.target, random_state=0)

    model = RandomForestClassifier(n_estimators=10, random_state=0).fit(
        features_train, labels_train)

    train_score = model.score(features_train, labels_train)
    print('Performance of train data: {:.3f}'.format(train_score))
    test_score = model.score(features_test, labels_test)
    print('Performance of test data: {:.3f}'.format(test_score))

    plot_feature_importances(model, dataset)
    

def run_gradient_boosting_classifier():
    '''
    Gradient boosting is an ensemble method that combines weak learners to
    improve results.

    Fits a GradientBoostingClassifier on a noisy two-circles dataset whose
    classes are renamed 'blue' and 'red', then prints the shape of the
    probability array predicted for the held-out split.
    :return: Returns None.
    '''
    X, y = make_circles(noise=.25, factor=.5, random_state=1)
    # rename the classes blue and red for visualization purposes
    y_named = np.array(['blue', 'red'])[y]
    # Only the training labels are needed below; the test labels are discarded.
    X_train, X_test, y_train_named, _ = train_test_split(
        X, y_named, random_state=0
    )
    # Build the gradient boosting model and fit it to the data
    gbrt = GradientBoostingClassifier(random_state=0)
    gbrt.fit(X_train, y_train_named)

    probabilities = gbrt.predict_proba(X_test)
    print('Shape of probabilities: {}'.format(probabilities.shape))
    
    
def run_multiclass_GBC():
    '''
    Fit a GradientBoostingClassifier on the iris data and print its decision
    function for the test split alongside the predictions, checking that each
    prediction equals the argmax of the decision function.
    :return: Returns None.
    '''
    iris = load_iris()
    # The test labels are not used by this demo, so they are discarded.
    X_train, X_test, y_train, _ = train_test_split(
        iris.data, iris.target, random_state=42)
    gbrt = GradientBoostingClassifier(learning_rate=.01, random_state=0)
    gbrt.fit(X_train, y_train)

    # Evaluate the model once and reuse the arrays for every report line
    # instead of re-running the ensemble for each print.
    decision = gbrt.decision_function(X_test)
    decision_argmax = np.argmax(decision, axis=1)
    predictions = gbrt.predict(X_test)

    print('Decision function shape: {}'.format(decision.shape))
    print('Decision function:\n{}'.format(decision[:6, :]))
    print('Argmax of decision function: \n{}'.format(decision_argmax))
    print('Predictions:\n{}'.format(predictions))
    # Comparing labels with column indices directly relies on the iris
    # labels being the integers 0..2, so a label equals its column index.
    print('Pred = gbrt: {}'.format(np.all(predictions == decision_argmax)))
          
def run_linear_SVC():
    '''
    Rescale the breast cancer features with the training split's per-feature
    minimum and range, fit an ``SVC`` with C=1000 and print its score on the
    train and test splits. Note that, despite the function name, the model is
    ``SVC`` and not ``LinearSVC``.
    :return: Returns None.
    '''
    dataset = load_breast_cancer()
    raw_train, raw_test, labels_train, labels_test = train_test_split(
        dataset.data, dataset.target, random_state=0)

    # Offset and span come from the training split only and are reused
    # unchanged for the test split.
    offset = raw_train.min(axis=0)
    span = (raw_train - offset).max(axis=0)

    def rescale(samples):
        return (samples - offset) / span

    scaled_train = rescale(raw_train)
    scaled_test = rescale(raw_test)

    classifier = SVC(C=1000)
    classifier.fit(scaled_train, labels_train)

    train_score = classifier.score(scaled_train, labels_train)
    print('Performance on train data: {:.3f}'.format(train_score))
    test_score = classifier.score(scaled_test, labels_test)
    print('Performance on test data: {:.3f}\n'.format(test_score))


def main():
    '''
    Entry point: runs the multiclass gradient boosting demo.
    :return: Returns None.
    '''
    run_multiclass_GBC()
    
    
# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Using matplotlib backend: Qt5Agg


Decision function shape: (38, 3)
Decision function:
[[-0.52931069  1.46560359 -0.50448467]
 [ 1.51154215 -0.49561142 -0.50310736]
 [-0.52379401 -0.4676268   1.51953786]
 [-0.52931069  1.46560359 -0.50448467]
 [-0.53107259  1.28190451  0.21510024]
 [ 1.51154215 -0.49561142 -0.50310736]]
Argmax of decision function: 
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
Predictions:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
Pred = gbrt: True
