# Analysis of the Wisconsin Breast Cancer Data

In [26]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
# Import own modules
from regression_analysis.fit_model import logistic_regression, apply_logistic_regression

# Import other packages
import pandas as pd
import ipywidgets as widget
import seaborn as sns
import numpy as np

## Data

In [28]:
# Load data
data = pd.read_csv('data_logistic_regression/data.csv', sep=',')
# Display data
data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99000,10.38000,122.80000,1001.00000,0.11840,0.27760,0.30010,0.14710,...,25.38000,17.33000,184.60000,2019.00000,0.16220,0.66560,0.71190,0.26540,0.46010,0.11890
1,842517,M,20.57000,17.77000,132.90000,1326.00000,0.08474,0.07864,0.08690,0.07017,...,24.99000,23.41000,158.80000,1956.00000,0.12380,0.18660,0.24160,0.18600,0.27500,0.08902
2,84300903,M,19.69000,21.25000,130.00000,1203.00000,0.10960,0.15990,0.19740,0.12790,...,23.57000,25.53000,152.50000,1709.00000,0.14440,0.42450,0.45040,0.24300,0.36130,0.08758
3,84348301,M,11.42000,20.38000,77.58000,386.10000,0.14250,0.28390,0.24140,0.10520,...,14.91000,26.50000,98.87000,567.70000,0.20980,0.86630,0.68690,0.25750,0.66380,0.17300
4,84358402,M,20.29000,14.34000,135.10000,1297.00000,0.10030,0.13280,0.19800,0.10430,...,22.54000,16.67000,152.20000,1575.00000,0.13740,0.20500,0.40000,0.16250,0.23640,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56000,22.39000,142.00000,1479.00000,0.11100,0.11590,0.24390,0.13890,...,25.45000,26.40000,166.10000,2027.00000,0.14100,0.21130,0.41070,0.22160,0.20600,0.07115
565,926682,M,20.13000,28.25000,131.20000,1261.00000,0.09780,0.10340,0.14400,0.09791,...,23.69000,38.25000,155.00000,1731.00000,0.11660,0.19220,0.32150,0.16280,0.25720,0.06637
566,926954,M,16.60000,28.08000,108.30000,858.10000,0.08455,0.10230,0.09251,0.05302,...,18.98000,34.12000,126.70000,1124.00000,0.11390,0.30940,0.34030,0.14180,0.22180,0.07820
567,927241,M,20.60000,29.33000,140.10000,1265.00000,0.11780,0.27700,0.35140,0.15200,...,25.74000,39.42000,184.60000,1821.00000,0.16500,0.86810,0.93870,0.26500,0.40870,0.12400


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

For the design matrix we drop the column id and diagnosis from the data. The id is not important for making predictions and the diagnosis is what we want to predict.

In [30]:
# Transform diagnosis to 0 and 1
data.diagnosis = [1 if each == "M" else 0 for each in data.diagnosis]

# Create design matrix and normalise it
X = logistic_regression.design_matrix(data)
X_norm = logistic_regression.normalise_data(X)

# Define to be predicted values
y = data.diagnosis.values

## Perform Logistic regression

In [31]:
def perform_simple_logistic_regression(reg_method, sample_method, data_set):
    
    logistic_regression_object = logistic_regression.LogisticRegression(X_norm, y)
    
    if sample_method == "crossvalidation":
        logistic_regression_object.apply_logistic_regression_crossvalidation(reg_method=reg_method)
    else:
        logistic_regression_object.apply_logistic_regression(reg_method=reg_method)

    if data_set == 'train':
        sns.heatmap(logistic_regression_object.train_confusion_matrix, annot=True, cmap="mako", yticklabels=['is_malignant', 'is_benign'],
                    xticklabels=['predicted_malignent', 'predicted_benign'])
    else:
        sns.heatmap(logistic_regression_object.test_confusion_matrix, annot=True, cmap="mako", yticklabels=['is_malignant', 'is_benign'],
                    xticklabels=['predicted_malignent', 'predicted_benign'])
    
widget.interact(perform_simple_logistic_regression, reg_method=["logistic_sgd", "logistic_scikit", "svm"], 
                sample_method=[None, "crossvalidation"], data_set=['train', 'test'])


interactive(children=(Dropdown(description='reg_method', options=('logistic_sgd', 'logistic_scikit', 'svm'), v…

<function __main__.perform_simple_logistic_regression(reg_method, sample_method, data_set)>

## Compare Methods

In [32]:
# For all methods we apply the following parameters
num_points = np.array([25, 50, 75, 100])
test_ratio = np.round(np.arange(1,5)*0.1, 2)

k_folds = np.array([5, 10], dtype=int)

l2_lambda = np.array([0.001, 0.01, 0.1, 1.0])

# Parameters for stochastic gradient descent
learn_rate = np.array([0.001, 0.01, 0.1, 1.0])
num_min_batch = np.array([2, 5, 10, 50])
epochs = np.array([10, 50, 100])

In [33]:
# Save values in numpy file format
np.save("data_logistic_regression/num_points.npy", num_points)
np.save("data_logistic_regression/test_ratio.npy", test_ratio)
np.save("data_logistic_regression/k_folds.npy", k_folds)
np.save("data_logistic_regression/l2_lambda.npy", l2_lambda)
np.save("data_logistic_regression/learn_rates.npy", learn_rate)
np.save("data_logistic_regression/num_min_batches.npy", num_min_batch)
np.save("data_logistic_regression/epochs.npy", epochs)

In [34]:
# Define for which methods and resampling techniques the statistical indicators should be calculated

methods = ["logistic_sgd", "logistic_sgd_crossvalidation", "logistic_scikit",  "logistic_scikit_crossvalidation", 
           "svm", "svm_crossvalidation"]
# Loop over specified methods
for method in methods:
    if "crossvalidation" in method:
        test_ratio_array = np.ones(1)*0.1
    else:
        k_folds = np.ones(1, dtype=int)
    if "ridge" not in method:
        ridge_lambda = np.ones(1)
    if "sgd" not in method:
        l2_lambda = np.ones(1)
    # Calculate statistical indicators
    train_accuracy, test_accuracy = apply_logistic_regression.apply_regression(num_points,
                                                                                   test_ratios=test_ratio,
                                                                                   k_folds=k_folds, 
                                                                                   l2_lambda=l2_lambda,
                                                                                   reg_type=method, 
                                                                                   learn_rate=learn_rate,
                                                                                   num_min_batch=num_min_batch,
                                                                                   epochs=epochs)
    # Save output
    np.save("data_logistic_regression/train_accuracy"+str(method)+".npy", train_accuracy)
    np.save("data_logistic_regression/test_accuracy"+str(method)+".npy", test_accuracy)
    # np.save("data_logistic_regression/train_confusion_matrix"+str(method)+".npy", train_confusion_matrix)
    # np.save("data_logistic_regression/test_confusion_matrix"+str(method)+".npy", test_confusion_matrix)
    # To track loop progress print size of MSE output
    print(train_MSE.shape)


  return 1 / (1 + np.exp(-z))
  gradient = (-1) * X.T @ (y - sigmoid_func(X @ beta)) - lmbda*beta
  vector -= descend
  gradient = (-1) * X.T @ (y - sigmoid_func(X @ beta)) - lmbda*beta


(4, 4, 4, 1, 4, 4, 3)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


(4, 4, 4, 1, 4, 4, 3)
(4, 4, 4, 1, 4, 4, 3)


ValueError: Found array with 0 sample(s) (shape=(0, 31)) while a minimum of 1 is required.

In [38]:
"""methods = ["logistic_sgd", "logistic_sgd_crossvalidation", "logistic_scikit",  "logistic_scikit_crossvalidation", 
           "svm", "svm_crossvalidation"]"""

methods = ["logistic_sgd", "logistic_sgd_crossvalidation"]
stats = ["train accuracy", "test accuracy"]

widget.interact(apply_logistic_regression.plot_stat, ratio=test_ratio.tolist(), num=num_points.tolist(), stat=stats, 
                method=methods, k_fold=k_folds.tolist(), l2_lambda=l2_lambda.tolist(), learn_rate=learn_rate.tolist(),
                batch=num_min_batch.tolist(), epoch=epochs.tolist())

interactive(children=(Dropdown(description='ratio', options=(0.1, 0.2, 0.3, 0.4), value=0.1), Dropdown(descrip…

<function regression_analysis.fit_model.apply_logistic_regression.plot_stat(ratio=0.1, num=100, stat='test accuracy', method='logistic_sgd', k_fold=1000, l2_lambda=1, learn_rate=0.1, batch=5, epoch=50)>