# Workshop 11

Starter code for workshop 11 is just like workshop 4 at the beginning. 

Modified by Hu Wang

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To plot even prettier figures
import seaborn as sn

# General data handling (pure numerics are better in numpy)
import pandas as pd

In [2]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [3]:
xarray = data.data
yarray = data.target
print(xarray.shape)
print(yarray.shape)
fullarray = np.concatenate((xarray,np.reshape(yarray,(-1,1))),axis=1)
print(fullarray.shape)

(569, 30)
(569,)
(569, 31)


In [4]:
fullarray[:,-1] = 1 - fullarray[:,-1]   # now invert the labels (so that malignant=1)
df = pd.DataFrame(fullarray,columns = list(data.feature_names) + ['target'])

In [5]:
from sklearn.model_selection import train_test_split

bigtrain_set, test_set = train_test_split(fullarray, test_size=0.2, random_state=42)
train_set, val_set = train_test_split(bigtrain_set, test_size=0.2, random_state=42)

In [6]:
X_bigtrain = bigtrain_set[:,:-1]
y_bigtrain = bigtrain_set[:,-1]
X_train = train_set[:,:-1]
y_train = train_set[:,-1]
X_test = test_set[:,:-1]
y_test = test_set[:,-1]
X_val = val_set[:,:-1]
y_val = val_set[:,-1]
print([X_train.shape,y_train.shape,X_test.shape,y_test.shape,X_val.shape,y_val.shape])

[(364, 30), (364,), (114, 30), (114,), (91, 30), (91,)]


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

preproc_pl = Pipeline([ ('imputer', SimpleImputer(strategy="median")), 
                        ('std_scaler', StandardScaler()) ])

# New part - statistical tests for classifier output

In [8]:
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

In [9]:
rs = np.random.randint(100,size=1)[0]

# run three classifiers
#  - the first two should be equally good on average (but different each time)
#  - the last one should be different on average (last one gives same result each time)

y_val_pred = []
pipes = []

pipes += [ Pipeline([ ('preproc',preproc_pl), ('sgd',SGDClassifier(loss='log',random_state=rs)) ]) ]
pipes[0].fit(X_train,y_train)
y_val_pred +=  [ pipes[0].predict(X_val) ]

pipes += [ Pipeline([ ('preproc',preproc_pl), ('sgd',SGDClassifier(loss='log',random_state=rs+1)) ]) ]
pipes[1].fit(X_train,y_train)
y_val_pred += [ pipes[1].predict(X_val) ]

# this next one is deliberately rather poor, to show a difference
pipes += [ Pipeline([ ('preproc',preproc_pl), ('knn',KNeighborsClassifier(n_neighbors=100)) ]) ]
pipes[2].fit(X_train,y_train)
y_val_pred += [ pipes[2].predict(X_val) ]

In [10]:
from sklearn.metrics import f1_score

print('Which score is better?')

for n in range(3):
    print(f'Method {n}: {f1_score(y_val,y_val_pred[n]):.3f}')


Which score is better?
Method 0: 0.946
Method 1: 0.944
Method 2: 0.912


In [11]:
from mlxtend.evaluate import permutation_test   # pip install mlxtend

In [13]:
ntests = 0
for n1 in range(2):
    for n2 in range(n1+1,3):
        p_value = permutation_test(
            y_val_pred[n1], y_val_pred[n2], #paired=True,  
            func=lambda x, y: np.abs(f1_score(y_val,x) - f1_score(y_val,y)),
            method="approximate", seed=0, num_rounds=1000
        )
        ntests += 1

        print(f'P value comparing methods {n1} and {n2}: {p_value:.3f} %')
print(f'\nThreshold is {0.05/ntests:.4f}, where P value needs to be *below* this for significance')

P value comparing methods 0 and 1: 0.976 %
P value comparing methods 0 and 2: 0.730 %
P value comparing methods 1 and 2: 0.754 %

Threshold is 0.0167, where P value needs to be *below* this for significance


In [14]:
# Cross validation results for comparison
#  - the P values above are the best test, but this is a useful comparison

from sklearn.model_selection import cross_validate
from sklearn.metrics import fbeta_score, make_scorer

f1_scorer = make_scorer(fbeta_score, beta=1)

for n in range(3):
    print(f'*** Results for method {n} ***')
    cv_results = cross_validate(pipes[n], X_bigtrain, y_bigtrain, cv=5, return_train_score=True, scoring=f1_scorer)
    # the following is a 95% confidence interval calculation
    # - this is better than standard deviation as overlapping intervals => no stat significance (approx) 
    # Note that this is approximate as it assumes a normal distribution 
    #  it also does not take into account multiple tests
    #  therefore it implies significance more easily than the P values above (which are better)
    CI = np.std(cv_results['test_score'])*1.96/np.sqrt(cv_results['test_score'].shape[0])
    print(f"  Validation results are {np.mean(cv_results['test_score']):.3f} +/- {CI:.3f}")

*** Results for method 0 ***
  Validation results are 0.943 +/- 0.024
*** Results for method 1 ***
  Validation results are 0.950 +/- 0.026
*** Results for method 2 ***
  Validation results are 0.873 +/- 0.039
