In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
# import the data
df = pd.read_csv('data.csv')

# assign input variables
x = df.loc[:, ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']]

# assign target variable
y = df['diagnosis']

In [3]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

## 1) Accuracy of a Decision Tree

In [8]:
# Accuracy of a decision tree with max_depth being 3 and the criterion for splitting is to use entropy.
tree1 = DecisionTreeClassifier(criterion='entropy', max_depth=3)
tree1.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [10]:
tree1.score(x_train, y_train)

0.9538461538461539

In [11]:
y_pred = tree1.predict(x_test)
e1 = metrics.accuracy_score(y_pred, y_test)
e1

0.9385964912280702

### the accuracy of the decision tree with max_depth = 3 and the criterion being entropy is 0.9386

## 2) The Best Parameters for Decision Tree

In [12]:
param_grid = {'max_depth': range(2,7), 'criterion':['gini', 'entropy']}

from sklearn.model_selection import GridSearchCV
tree2 = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 5)
tree2.fit(x_train, y_train)

tree2.best_params_

{'criterion': 'gini', 'max_depth': 6}

### the best parameters are 'gini' and a max depth of 6

## 3) Accuracy of Random Forest

In [15]:
rf1 = RandomForestClassifier(n_estimators=15, max_features=4)
rf1.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
rf1.score(x_train, y_train)

1.0

In [17]:
y_pred1 = rf1.predict(x_test)
metrics.accuracy_score(y_pred1, y_test)

0.9385964912280702

### the accuracy of the random forest is 0.9386

## 4) Best Parameters and Accuracy of Tuned Random Forest

In [18]:
param_grid = {'n_estimators': range(10, 25), 'max_features': range(3, 10)}

rf2 = GridSearchCV(RandomForestClassifier(), param_grid, cv = 5)
rf2.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': range(10, 25), 'max_features': range(3, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
rf2.best_params_

{'max_features': 7, 'n_estimators': 18}

In [21]:
y_pred2 = rf2.predict(x_test)
metrics.accuracy_score(y_pred2, y_test)

0.9473684210526315

### the best parameters are 7 max features and 18 n_estimators
### the accuracy of the tuned random forest is 0.9473
### tuning the random forest did imporve the model compared to the previous random forest

# worked on with Neil Manderson