# Analysis of the Wisconsin Breast Cancer Data

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Import own modules
from regression_analysis.fit_model import logistic_regression, apply_logistic_regression

# Import other packages
import pandas as pd
import ipywidgets as widget
import seaborn as sns
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV

## Data

In [10]:
# Load data
data = pd.read_csv('data_logistic_regression/data.csv', sep=',')
# Display data
data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [12]:
data.info()
data.max()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

id                         911320502
diagnosis                          M
radius_mean                    28.11
texture_mean                   39.28
perimeter_mean                 188.5
area_mean                     2501.0
smoothness_mean               0.1634
compactness_mean              0.3454
concavity_mean                0.4268
concave points_mean           0.2012
symmetry_mean                  0.304
fractal_dimension_mean       0.09744
radius_se                      2.873
texture_se                     4.885
perimeter_se                   21.98
area_se                        542.2
smoothness_se                0.03113
compactness_se                0.1354
concavity_se                   0.396
concave points_se            0.05279
symmetry_se                  0.07895
fractal_dimension_se         0.02984
radius_worst                   36.04
texture_worst                  49.54
perimeter_worst                251.2
area_worst                    4254.0
smoothness_worst              0.2226
c

For the design matrix we drop the column id and diagnosis from the data. The id is not important for making predictions and the diagnosis is what we want to predict.

In [4]:
# Transform diagnosis to 0 and 1
data.diagnosis = [1 if each == "M" else 0 for each in data.diagnosis]

# Create design matrix and normalise it
X = logistic_regression.design_matrix(data)
X_norm = logistic_regression.normalise_data(X)

# Define to be predicted values
y = data.diagnosis.values

## Fit a Model to the Data

### Find the right Kernel for the Support Vector Machine Algorithm

In [9]:
# Split data on test and train datasets
x_train, x_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.1)

# Use grid search to find best fit
classifier = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10, 100]}
grid_svm = GridSearchCV(estimator=classifier, param_grid=parameters, scoring="accuracy")
grid_svm.fit(x_train, y_train)

# Print output
print(grid_svm.cv_results_)
print("Best estimator: " + str(grid_svm.best_estimator_))
print("Best Score: " + str(grid_svm.best_score_))

{'mean_fit_time': array([0.00185699, 0.00214262, 0.00130119, 0.00147777, 0.0016541 ,
       0.00160599]), 'std_fit_time': array([0.00034285, 0.00026644, 0.00010769, 0.00013741, 0.00010135,
       0.00014379]), 'mean_score_time': array([0.00056229, 0.00063677, 0.00027623, 0.00038619, 0.00025535,
       0.00037198]), 'std_score_time': array([1.45904893e-04, 1.21058653e-04, 2.74314433e-05, 1.47422133e-05,
       2.29516738e-05, 2.44340777e-05]), 'param_C': masked_array(data=[1, 1, 10, 10, 100, 100],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 1, 'kernel': 'linear'}, {'C': 1, 'kernel': 'rbf'}, {'C': 10, 'kernel': 'linear'}, {'C': 10, 'kernel': 'rbf'}, {'C': 100, 'kernel': 'linear'}, {'C': 100, 'kernel': 'rbf'}]

Therefore, using a linear or r

### Compare Methods

In [4]:
# For all methods we apply the following parameters
test_ratio = np.round(np.arange(1,5)*0.1, 2)
k_folds = np.array([5, 10], dtype=int)
l2_lambda = np.array([0.001, 0.01, 0.1, 1.0]) #L2 regularization parameter

# Parameters for stochastic gradient descent
learn_rate = np.array([0.001, 0.01, 0.1, 1.0])
num_min_batch = np.array([1, 2, 5, 10, 32])
epochs = np.array([10, 100, 500, 1000])

In [5]:
methods = ["logistic_sgd", "logistic_sgd_crossvalidation", "logistic_scikit",  "logistic_scikit_crossvalidation", 
           "svm", "svm_crossvalidation"]

widget.interact(apply_logistic_regression.plot_heatmap_conf_matrix, test_ratio=test_ratio.tolist(), 
                reg_type=methods, k_fold=k_folds.tolist(), l2_lambda=l2_lambda.tolist(), 
                learn_rate=learn_rate.tolist(), num_min_batch=num_min_batch.tolist(), epoch=epochs.tolist())

interactive(children=(Dropdown(description='reg_type', options=('logistic_sgd', 'logistic_sgd_crossvalidation'…

<function regression_analysis.fit_model.apply_logistic_regression.plot_heatmap_conf_matrix(reg_type, l2_lambda, learn_rate, num_min_batch, epoch, test_ratio, k_fold)>

### Study Dependence of SGD on learning rate

In [8]:
widget.interact(apply_logistic_regression.plot_accuracy, test_ratio=test_ratio.tolist(), 
                reg_type=widget.fixed("svm"), k_fold=k_folds.tolist(), l2_lambda=l2_lambda.tolist(), 
                learn_rate=widget.fixed(learn_rate.tolist()), num_min_batch=num_min_batch.tolist(), epoch=epochs.tolist())

interactive(children=(Dropdown(description='l2_lambda', options=(0.001, 0.01, 0.1, 1.0), value=0.001), Dropdow…

<function regression_analysis.fit_model.apply_logistic_regression.plot_accuracy(reg_type, l2_lambda, learn_rate, num_min_batch, epoch, test_ratio, k_fold)>

In [7]:
widget.interact(apply_logistic_regression.plot_accuracy, test_ratio=test_ratio.tolist(), 
                reg_type=widget.fixed("logistic_sgd_crossvalidation"), k_fold=k_folds.tolist(), l2_lambda=l2_lambda.tolist(), 
                learn_rate=widget.fixed(learn_rate.tolist()), num_min_batch=num_min_batch.tolist(), epoch=epochs.tolist())

interactive(children=(Dropdown(description='l2_lambda', options=(0.001, 0.01, 0.1, 1.0), value=0.001), Dropdow…

<function regression_analysis.fit_model.apply_logistic_regression.plot_accuracy(reg_type, l2_lambda, learn_rate, num_min_batch, epoch, test_ratio, k_fold)>