In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import scipy as sp
import sklearn 
import mglearn

In [3]:
# Load scikit-learn's bundled breast cancer dataset. Later cells read
# cancer.data, cancer.target and cancer.target_names from this object.
from sklearn.datasets import load_breast_cancer 
cancer = load_breast_cancer()

In [16]:
# Count how many samples fall into each target class.
# np.bincount tallies the integer labels in cancer.target, and the counts are
# paired positionally with cancer.target_names.
# Keys and values are cast to built-in str/int so the printed dict looks the
# same on every NumPy version (NumPy 2 would otherwise print np.str_(...) and
# np.int64(...) wrappers around each entry).
print({
    str(class_name): int(class_count)
    for class_name, class_count in zip(cancer.target_names, np.bincount(cancer.target))
})

{'malignant': 212, 'benign': 357}


In [17]:
# The target classes are imbalanced, so split with stratification to keep the
# same class proportions in both the training and the testing partition.
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=66,
)

In [19]:
# Baseline model: scikit-learn's default regularization (C=1, penalty='l2').
# max_iter is raised well above the default, presumably so the solver
# converges on these unscaled features.
from sklearn.linear_model import LogisticRegression
logreg = (
    LogisticRegression(max_iter=10000)
    .fit(X_train, y_train)
)

In [22]:
# Accuracy of the baseline (C=1) model on each split, rounded to two decimals.
logreg_train_acc = logreg.score(X_train, y_train)
logreg_test_acc = logreg.score(X_test, y_test)
print('training set accuracy: {:.2f}'.format(logreg_train_acc))
print('testing set accuracy: {:.2f}'.format(logreg_test_acc))

training set accuracy: 0.98
testing set accuracy: 0.94


In [25]:
# Raise C to 100: weaker regularization, so the model is free to become more
# complex and fit the training data more closely.
logreg100 = (
    LogisticRegression(C=100, max_iter=10000)
    .fit(X_train, y_train)
)

In [26]:
# Accuracy of the C=100 model on each split, rounded to two decimals.
logreg100_train_acc = logreg100.score(X_train, y_train)
logreg100_test_acc = logreg100.score(X_test, y_test)
print('training set accuracy: {:.2f}'.format(logreg100_train_acc))
print('testing set accuracy: {:.2f}'.format(logreg100_test_acc))

training set accuracy: 0.99
testing set accuracy: 0.97


In [27]:
# Lower C to 0.01: stronger regularization, pushing the model towards a
# simpler solution that should generalize more.
logreg001 = (
    LogisticRegression(C=0.01, max_iter=10000)
    .fit(X_train, y_train)
)

In [28]:
# Accuracy of the C=0.01 model on each split, rounded to two decimals.
logreg001_train_acc = logreg001.score(X_train, y_train)
logreg001_test_acc = logreg001.score(X_test, y_test)
print('training set accuracy: {:.2f}'.format(logreg001_train_acc))
print('testing set accuracy: {:.2f}'.format(logreg001_test_acc))

training set accuracy: 0.96
testing set accuracy: 0.94


In [None]:
# slightly lower training and testing accuracies suggest underfitting

In [31]:
# Repeat the regularization sweep with an L1 penalty, using the liblinear
# solver, which supports L1.
# liblinear shuffles the data internally, so random_state is pinned to keep the
# reported accuracies reproducible across kernel restarts (66 is the same seed
# used for the train/test split above).
for c in [0.001, 1, 100]:
    logreg_l1 = LogisticRegression(
        penalty='l1',
        C=c,
        solver='liblinear',
        max_iter=10000,
        random_state=66,
    )
    logreg_l1.fit(X_train, y_train)
    # Scores are printed at full precision, unlike the two-decimal L2 reports.
    print('training accuracy with C={}: {}'.format(c, logreg_l1.score(X_train, y_train)))
    print('testing accuracy with C={}: {}'.format(c, logreg_l1.score(X_test, y_test)))

training accuracy with C=0.001: 0.9178403755868545
testing accuracy with C=0.001: 0.9020979020979021
training accuracy with C=1: 0.9741784037558685
testing accuracy with C=1: 0.9440559440559441
training accuracy with C=100: 0.9953051643192489
testing accuracy with C=100: 0.972027972027972
