# Breast Cancer - Random Forest Classification

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd

## Importing Dataset

In [2]:
# Load the breast-cancer dataset from the CSV that sits next to the notebook
# and preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='breast_cancer.csv')
df.head(n=5)

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
# Feature matrix: every column except the first and the last.
# Target vector: the last column.
# NOTE(review): going by the preview of `df`, the first column is the sample
# code number (an identifier) and the last is `Class` — confirm against the CSV.
X, y = df.iloc[:, 1:-1].values, df.iloc[:, -1].values

## Split into Training Set and Test Set

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training Model

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Build a seeded random forest with otherwise default settings and fit it on
# the training split. `fit` returns the estimator itself, so construction and
# fitting can be chained.
classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Bare last expression: show the fitted estimator as the cell output.
classifier

RandomForestClassifier(random_state=0)

## Evaluate Model

In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict on the held-out split, then report the confusion matrix
# (rows: true labels, columns: predicted labels) and the accuracy in percent.
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(f'accuracy score: {accuracy_score(y_test, y_pred) * 100:.2f} %')

[[84  3]
 [ 1 49]]
accuracy score: 97.08 %


### Evaluate with k-Fold Cross Validation

In [7]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split only; the test split stays
# untouched. Report the mean and spread of the per-fold accuracies in percent.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
print(f'accuracy: {accuracies.mean() * 100:.2f} %')
print(f'standard deviation: {accuracies.std() * 100:.2f} %')

accuracy: 96.70 %
standard deviation: 2.58 %


##### Expected accuracy range (mean ± 1 standard deviation): 94.12 - 99.28 %

## Tuning Model

In [12]:
# List the estimator's current hyperparameters to see what can be tuned.
# `deep=True` is the default, spelled out here for clarity.
classifier.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [13]:
# Candidate grid 0.0, 0.1, ..., 1.0. Each element is rounded to one decimal so
# floating-point noise from the fractional step (e.g. 0.30000000000000004)
# does not leak into the grid.
values = [round(step, 1) for step in np.arange(0.0, 1.1, 0.1)]
print(values)

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


In [15]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search space: 14 forest sizes x 2 split criteria x 11 impurity
# thresholds x 21 seeds = 6,468 candidates, each scored with 10-fold CV.
# NOTE(review): 'random_state' is searched as if it were a hyperparameter.
# Picking the best-scoring seed likely fits cross-validation noise and makes
# `best_score_` optimistic — consider fixing the seed and dropping it from the
# grid (this would also cut the search cost by 21x).
parameters = [
    {
        'n_estimators': np.arange(10, 150, 10),
        'criterion': ['gini', 'entropy'],
        'min_impurity_decrease': values,
        'random_state': np.arange(0, 21),
    },
]

# Score by accuracy with 10-fold CV, using every available core. `fit`
# returns the search object itself, so construction and fitting are chained.
grid_search = GridSearchCV(
    classifier, parameters, scoring='accuracy', cv=10, n_jobs=-1
).fit(X_train, y_train)

# Best mean cross-validated accuracy and the parameter combination behind it.
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print(f'best accuracy: {best_accuracy * 100:.2f} %')
print(f'best parameters: {best_parameters}')

best accuracy: 97.62 %
best parameters: {'criterion': 'entropy', 'min_impurity_decrease': 0.0, 'n_estimators': 80, 'random_state': 3}


In [16]:
# Tabulate the per-candidate cross-validation results and show the five
# best-ranked parameter combinations.
# The table gets its own name instead of rebinding `df`: `df` holds the
# dataset loaded at the top of the notebook, and overwriting it would make the
# feature-extraction cell slice grid-search results if it were re-run.
cv_results_df = pd.DataFrame(grid_search.cv_results_)
cv_results_df.sort_values('rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_min_impurity_decrease,param_n_estimators,param_random_state,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
3384,0.150863,0.012353,0.008704,0.000787,entropy,0.0,80,3,"{'criterion': 'entropy', 'min_impurity_decreas...",0.945455,...,1.0,0.945455,1.0,0.981481,0.962963,1.0,0.981481,0.976229,0.020026,1
3363,0.11602,0.011989,0.007603,0.00067,entropy,0.0,70,3,"{'criterion': 'entropy', 'min_impurity_decreas...",0.945455,...,1.0,0.945455,1.0,0.981481,0.962963,1.0,0.962963,0.974377,0.020309,2
148,0.1116,0.004903,0.0073,0.00064,gini,0.0,80,1,"{'criterion': 'gini', 'min_impurity_decrease':...",0.945455,...,1.0,0.945455,1.0,0.981481,0.962963,1.0,0.962963,0.974377,0.020309,2
127,0.093362,0.003063,0.006652,0.000553,gini,0.0,70,1,"{'criterion': 'gini', 'min_impurity_decrease':...",0.945455,...,1.0,0.945455,1.0,0.981481,0.962963,1.0,0.962963,0.974377,0.020309,2
3342,0.094005,0.006942,0.0065,0.000671,entropy,0.0,60,3,"{'criterion': 'entropy', 'min_impurity_decreas...",0.945455,...,1.0,0.945455,1.0,0.981481,0.962963,1.0,0.962963,0.974377,0.020309,2
