In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Load the breast cancer dataset once; the returned Bunch carries both the
# feature matrix (.data) and the class labels (.target), so there is no need
# to call the loader a second time with return_X_y=True.
init_data = load_breast_cancer()
X, y = init_data.data, init_data.target

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Baseline model: a random forest fitted on every available feature.
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=200,
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train, y_train)

y_pred = forest.predict(X_test)

# Score once and reuse the value for both reports.
full_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % full_accuracy)
# Accuracy divided by the number of input feature columns (X.shape[1]).
print('Accuracy per feature: %.2f' % (full_accuracy / X.shape[1]))

Accuracy: 0.98
Accuracy per feature: 0.03


In [2]:
import numpy as np
from sklearn.feature_selection import SelectFromModel
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

In [3]:
importances = forest.feature_importances_

# How many of the highest-ranked features to keep.
n_top_features = 2

# Use the smallest importance among the top `n_top_features` as the cut-off.
# SelectFromModel keeps features whose importance is >= threshold, so this
# retains the top-ranked features (more than `n_top_features` if importances
# tie exactly at the cut-off).
threshold_val = np.sort(importances)[::-1][:n_top_features].min()

# Wrap the already-fitted "forest"; prefit=True means the selector reuses its
# importances and is not trained again here.
sfm = SelectFromModel(forest, threshold=threshold_val, prefit=True)

# Reduce both splits to the selected feature columns (no fitting happens here).
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

In [4]:
# Second forest, to be fitted only on the columns kept by the feature selector.
# Hyperparameters are the same values used for the full-feature forest.
clf_important = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    n_jobs=2,
    random_state=1,
)

# Fit on the reduced training matrix; as the last expression of the cell, the
# fitted estimator is also what the notebook displays.
clf_important.fit(X_important_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=2,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [5]:
# Apply the reduced-feature classifier to the reduced test data
y_important_pred = clf_important.predict(X_important_test)

# Score once and reuse the value for both reports.
important_accuracy = accuracy_score(y_test, y_important_pred)
print('Accuracy: %.2f' % important_accuracy)
# Accuracy divided by the number of selected feature columns.
print('Accuracy per feature: %.2f' % (important_accuracy / X_important_train.shape[1]))

Accuracy: 0.89
Accuracy per feature: 0.44
