In [1]:
# importing necessary libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [2]:
# Load the breast cancer dataset once and reuse the returned Bunch for both
# the feature matrix (X) and the label vector (y), instead of loading the
# dataset a second time with return_X_y=True.
init_data = load_breast_cancer()
X, y = init_data.data, init_data.target

In [3]:
# show the dimensions of the feature matrix as (rows, columns)
print(np.shape(X))

(569, 30)


In [4]:
# hold out 20% of the samples for testing (80:20 split) with a fixed seed
split_options = {'test_size': 0.2, 'random_state': 1234}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
# Correlation coefficient of every training feature column with the label.
# The signed value is printed; its absolute value is stored so that features
# can be compared by correlation strength regardless of sign.
feature_to_label_correlations = []
for i in range(X_train.shape[1]):
    # compute the coefficient once and reuse it for both printing and storing
    correlation = np.corrcoef(X_train[:, i], y_train)[0, 1]
    print('Correlation of Feature ' + str(i + 1) + ' with label: ', correlation)
    feature_to_label_correlations.append(abs(correlation))

Correlation of Feature 1 with label:  -0.7256059801968108
Correlation of Feature 2 with label:  -0.4065894050560832
Correlation of Feature 3 with label:  -0.741853614062286
Correlation of Feature 4 with label:  -0.7083848675544729
Correlation of Feature 5 with label:  -0.39318582116345113
Correlation of Feature 6 with label:  -0.6309319771528702
Correlation of Feature 7 with label:  -0.7122789266531995
Correlation of Feature 8 with label:  -0.7882615846959873
Correlation of Feature 9 with label:  -0.3324446795512198
Correlation of Feature 10 with label:  -0.025924332654916445
Correlation of Feature 11 with label:  -0.5707563422154452
Correlation of Feature 12 with label:  0.015363784845475995
Correlation of Feature 13 with label:  -0.5675702605108826
Correlation of Feature 14 with label:  -0.5774554450398077
Correlation of Feature 15 with label:  0.04522064381388578
Correlation of Feature 16 with label:  -0.3127983050044024
Correlation of Feature 17 with label:  -0.25243924865243195
Co

In [6]:
# rank feature indices by absolute correlation (strongest first) and keep
# the two strongest; then restrict both splits to those columns
ranking = np.argsort(feature_to_label_correlations)[::-1]
selected_features = ranking[:2]
X_train_selected, X_test_selected = X_train[:, selected_features], X_test[:, selected_features]

In [7]:
# 5-fold grid-search cross validation over forest size and tree depth,
# scored by accuracy on the selected training features
params = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [3, 4, 5],
}
clf = RandomForestClassifier(random_state=1234)
model_cv = GridSearchCV(clf, params, scoring='accuracy', cv=5, verbose=3)

model_cv.fit(X_train_selected, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END ..................max_depth=3, n_estimators=100; total time=   0.1s
[CV 2/5] END ..................max_depth=3, n_estimators=100; total time=   0.1s
[CV 3/5] END ..................max_depth=3, n_estimators=100; total time=   0.1s
[CV 4/5] END ..................max_depth=3, n_estimators=100; total time=   0.1s
[CV 5/5] END ..................max_depth=3, n_estimators=100; total time=   0.1s
[CV 1/5] END ..................max_depth=3, n_estimators=150; total time=   0.3s
[CV 2/5] END ..................max_depth=3, n_estimators=150; total time=   0.2s
[CV 3/5] END ..................max_depth=3, n_estimators=150; total time=   0.2s
[CV 4/5] END ..................max_depth=3, n_estimators=150; total time=   0.2s
[CV 5/5] END ..................max_depth=3, n_estimators=150; total time=   0.2s
[CV 1/5] END ..................max_depth=3, n_estimators=200; total time=   0.3s
[CV 2/5] END ..................max_depth=3, n_es

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=1234),
             param_grid={'max_depth': [3, 4, 5],
                         'n_estimators': [100, 150, 200, 250, 300]},
             scoring='accuracy', verbose=3)

In [8]:
# display the hyper-parameter combination selected by the grid search
# (model_cv was fitted with scoring='accuracy' and cv=5)
model_cv.best_params_

{'max_depth': 3, 'n_estimators': 100}

In [9]:
# Train the final Random Forest on the selected features using the
# hyper-parameters chosen by the grid search rather than hard-coded copies
# of them, so this cell stays in sync if the search result changes.
rf_model = RandomForestClassifier(random_state = 1234, **model_cv.best_params_).fit(X_train_selected, y_train)

In [10]:
# Accuracy of the trained model on the held-out test set, and that accuracy
# divided by the number of selected features. Predict and score once, then
# reuse the value for both printed lines.
test_accuracy = accuracy_score(y_test, rf_model.predict(X_test_selected))
print('Accuracy', test_accuracy)
print('Accuracy per feature', test_accuracy / X_test_selected.shape[1])

Accuracy 0.868421052631579
Accuracy per feature 0.4342105263157895
