In [13]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the breast cancer dataset that ships with scikit-learn.
cancer_data = load_breast_cancer()

# Tabulate the measurements with one labelled column per feature,
# then append the class labels as an extra 'target' column.
df = pd.DataFrame(data=cancer_data.data, columns=cancer_data.feature_names)
df['target'] = cancer_data.target

# Separate the feature matrix from the target vector (both as NumPy arrays).
X = df[cancer_data.feature_names].to_numpy()
y = df['target'].to_numpy()

# Expect 569 datapoints with 30 features each.
print('data dimensions:', X.shape)


# Hold out a test set; fixing 'random_state' makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=99,
)

print('training data dimensions:', X_train.shape)


data dimensions: (569, 30)
training data dimensions: (426, 30)


In [14]:
# Instantiate the random forest model.
# Default settings: 100 trees, each grown on a bootstrapped sample (drawn with
# replacement, same size as the training set -- 426 datapoints here), with the
# square root of the total number of features (sqrt(30) here) considered at
# each split. 'random_state' pins the randomness of both the bootstrapping and
# the per-split feature sampling.
rf = RandomForestClassifier(random_state=93)

# Train on the training features and targets.
rf.fit(X_train, y_train)

# Compare predictions against the actual labels for the first five test
# datapoints. Slicing the 2-D test array keeps it 2-D, which is the shape
# 'rf.predict' expects.
print("prediction:", rf.predict(X_test[:5]))
print("true value:", y_test[:5])
print()
# All five predictions are correct (0 = malignant, 1 = benign).


prediction: [1 1 1 1 1]
true value: [1 1 1 1 1]



In [15]:
# Evaluate the random forest: accuracy over all held-out test datapoints.
print("random forest accuracy:", rf.score(X_test, y_test))


# Baseline for comparison: a single decision tree trained on the same split.
# 'random_state' is fixed so the baseline score is reproducible from run to
# run, like the train/test split and the random forest. Without it, the tree's
# tie-breaking between equally good splits is random and the accuracy can
# change on every execution.
dt = DecisionTreeClassifier(random_state=93)
dt.fit(X_train, y_train)
print("decision tree accuracy:", dt.score(X_test, y_test))
print()
# The random forest is expected to score higher than the single decision tree.
# NOTE: the exact decision tree accuracy depends on the seed, so it may differ
# slightly from a value recorded in an earlier, unseeded run.


random forest accuracy: 0.965034965034965
decision tree accuracy: 0.9300699300699301



In [16]:
# Tune the random forest with a grid search.
# 'param_grid' lists the hyperparameters to tune and the candidate values to
# try for each one. (These are parameters of 'RandomForestClassifier()'.)
param_grid = {
    'n_estimators': [25, 50, 100],  # number of trees
    'max_features': [5, 15, 30],    # number of features considered at each split
}

# Grid search builds a model for every combination of parameters:
# 3 'n_estimators' values * 3 'max_features' values = 9 models.
# Each one is scored by 3-fold cross validation ('cv=3') to get its accuracy.
gs = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
gs.fit(X, y)

print("best params:", gs.best_params_)  # the winning combination
print("best score:", gs.best_score_)  # cross-validated accuracy of the winner
print()

# We should really test more values per parameter and use more than 3 folds for
# cross validation, but SoloLearn doesn't have enough computing power!

# Increasing the number of trees in the forest ('n_estimators') will not hurt
# performance: accuracy improves and then levels out. It does add complexity,
# which is more resource intensive. We look for the sweet spot: the minimum
# number of trees that optimizes performance without adding unnecessary
# complexity. For this we use an elbow graph.

# *elbow graph using matplotlib* (not shown)


best params: {'max_features': 5, 'n_estimators': 25}
best score: 0.9648751508400633



In [19]:
# Feature selection via importance.

# The breast cancer dataset has 30 features. Which subset of them should we use
# to build a model? One way to choose is mean decrease in impurity: for a single
# tree we can compute how much each feature decreases impurity, and for a forest
# we average that decrease over all trees. A higher mean impurity decrease means
# a more important feature.

# The 'feature_importances_' attribute of a fitted 'RandomForestClassifier'
# holds these scores, scaled so that the scores of all features sum to 1.

# Build a pandas Series (1-D labelled array) of the importances from 'rf', the
# random forest fitted earlier on all 30 features, labelled with the feature
# names from 'cancer_data'. 'sort_values(ascending=False)' ranks them from most
# to least important.
# NOTE: this assignment was missing, so the 'print' below raised a NameError
# on a fresh kernel.
feat_imp_rank = (
    pd.Series(rf.feature_importances_, index=cancer_data.feature_names)
    .sort_values(ascending=False)
)

print('most important features:')
print(feat_imp_rank.head(5))  # top 5 features with their importance scores
print()


most important features:
worst concave points    0.136366
mean concave points     0.116518
worst perimeter         0.101182
worst radius            0.095643
worst area              0.090810
dtype: float64



In [20]:
# Features with "worst" in their name seem to have higher importances, so let's
# build a new model using only the features whose names include the word 'worst'.

# Keep just the columns of 'df' whose label contains 'worst', and record their
# names as a plain list.
X_with_worst = df.filter(like='worst')
feat_with_worst = list(X_with_worst.columns)
print(feat_with_worst)  # 10 features
print()

# Same targets 'y' and same 'random_state' as the earlier split, so only the
# feature set changes. Note this rebinds the train/test names to the 10-feature
# subset.
X_train, X_test, y_train, y_test = train_test_split(
    X_with_worst,
    y,
    random_state=99,
)

# Refit the existing forest on the reduced feature set and score it.
rf.fit(X_train, y_train)
print('random forest w/ 10 features accuracy:', rf.score(X_test, y_test))

# We see improved accuracy from a simpler model using only a third of the total
# features! A likely reason is that we removed some noise and some highly
# correlated features.

# code and comments by github.com/alandavidgrunberg


['worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension']

random forest w/ 10 features accuracy: 0.972027972027972
