In [14]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [2]:
#Import iris dataset
wdbc = fetch_openml(name='wdbc')

In [3]:
# Have a look at the dataset
print(wdbc)

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]]), 'target': array(['2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2',
       '2', '2', '2', '2', '2', '2', '1', '1', '1', '2', '2', '2', '2',
       '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '1', '2',
       '2', '2', '2', '2', '2', '2', '2', '1', '2', '1', '1', '1', '1',
       '1', '2', '2', '1', '2', '2', '1', '1', '1', '1', '2', '1', '2',
       '2', '1', '1', '1', '1', '2', '1', '2', '2', '1', '2', '1', '2',
       

In [4]:
# Dataset URL
wdbc.url

'https://www.openml.org/d/1510'

In [5]:
# Non graphical EDA
wdbc.data.shape

(569, 30)

In [6]:
# Non graphical EDA
wdbc.target.shape

(569,)

In [7]:
# Non graphical EDA
np.unique(wdbc.target)

array(['1', '2'], dtype=object)

In [8]:
# Non graphical EDA
wdbc.DESCR

'**Author**: William H. Wolberg, W. Nick Street, Olvi L. Mangasarian    \n**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)), [University of Wisconsin](http://pages.cs.wisc.edu/~olvi/uwmp/cancer.html) - 1995  \n**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)     \n\n**Breast Cancer Wisconsin (Diagnostic) Data Set (WDBC).** Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. The target feature records the prognosis (benign (1) or malignant (2)). [Original data available here](ftp://ftp.cs.wisc.edu/math-prog/cpo-dataset/machine-learn/cancer/) \n\nCurrent dataset was adapted to ARFF format from the UCI version. Sample code ID\'s were removed.  \n\n! Note that there is also a related Breast Cancer Wisconsin (Original) Data Set with a different set of features, better known as [breast-w](https://www.openml.or

In [9]:
# Non graphical EDA
wdbc.details

{'id': '1510',
 'name': 'wdbc',
 'version': '1',
 'format': 'ARFF',
 'upload_date': '2015-05-26T16:24:07',
 'licence': 'Public',
 'url': 'https://www.openml.org/data/v1/download/1592318/wdbc.arff',
 'file_id': '1592318',
 'default_target_attribute': 'Class',
 'tag': ['cancer',
  'medical',
  'OpenML-CC18',
  'OpenML100',
  'study_123',
  'study_135',
  'study_14',
  'study_52',
  'study_7',
  'study_98',
  'study_99',
  'uci'],
 'visibility': 'public',
 'status': 'active',
 'processing_date': '2018-10-03 21:41:34',
 'md5_checksum': '7aa183d3657e364911ced0cbd6b272bd'}

In [10]:
wdbc.feature_names

['V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30']

In [11]:
x = pd.DataFrame(wdbc.data, columns = wdbc.feature_names)
x.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [12]:
y = pd.DataFrame(wdbc.target, columns = ['Class'])
y.head()

Unnamed: 0,Class
0,2
1,2
2,2
3,2
4,2


In [17]:
model = AdaBoostClassifier(DecisionTreeClassifier
                           (max_depth = 1), 
                           n_estimators=200)

accs = []
pres = []
recs = []
f1_scores = []

# Training model with Repeated stratified K fold cross validation
rskf = RepeatedStratifiedKFold(n_splits=10,
                               n_repeats = 10, 
                               random_state=36851234)

for train_index, test_index in rskf.split(x, y):
    model.fit(x.iloc[train_index], y.iloc[train_index])
    y_pred = model.predict(x.iloc[test_index])
    acc_score = accuracy_score(y.iloc[test_index], y_pred)
    prec_score = precision_score(y.iloc[test_index], y_pred, average = 'micro')
    rec_score = recall_score(y.iloc[test_index], y_pred, average = 'micro')
    f1s = f1_score(y.iloc[test_index], y_pred, average = 'micro')
    
    accs.append(acc_score)
    pres.append(prec_score)
    recs.append(rec_score)
    f1_scores.append(f1s)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [18]:
print("Accuracy :", np.mean(accs), "\nPrecision :", 
      np.mean(pres), "\nRecall :", np.mean(rec_score), 
      "\nF1 score :", np.mean(f1_scores) )

Accuracy : 0.9702814147437561 
Precision : 0.9702814147437561 
Recall : 1.0 
F1 score : 0.9702814147437561
