In [1]:
from sklearn.datasets import load_breast_cancer
breast_cancer_data = load_breast_cancer()
print(type(breast_cancer_data))

<class 'sklearn.utils.Bunch'>


In [2]:
print(breast_cancer_data.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [3]:
print(breast_cancer_data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [4]:
print(type(breast_cancer_data.data))
print(breast_cancer_data.data.shape)
print(breast_cancer_data.target.shape)

<class 'numpy.ndarray'>
(569, 30)
(569,)


In [7]:
print(breast_cancer_data.data[0])
print(breast_cancer_data.target[0:5])

[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]
[0 0 0 0 0]


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data.data, breast_cancer_data.target, test_size=0.30)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)


(398, 30) (398,) (171, 30) (171,)


In [20]:
from sklearn import linear_model

logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=5000)

In [21]:
logistic = logistic.fit(X_train, y_train)
score = logistic.score(X_test, y_test)
print('LogisticRegression score:{}'.format(score))

LogisticRegression score:0.9532163742690059


In [22]:
ypred_test= logistic.predict(X_test)
print(ypred_test)

[1 1 1 1 0 1 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 1 0 1 0 1 1 1 0 1
 1 0 1 1 1 0 0 1 1 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 1 1 1 1 0 0 1 0 1 1 1 1 0
 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 1 1 0 1
 0 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 1 0 0 1 1 0
 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1]


In [23]:
print(y_test)

[1 1 1 1 0 1 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 1 0 1 0 1 1 1 0 1
 1 0 1 1 1 0 0 1 1 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 1 1 1 1 0 0 1 0 1 1 1 1 0
 1 0 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 1 1 0 1
 0 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 1 1 1 0
 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0 0 1]


In [24]:
for i in range(len(y_test)):
    if (y_test[i] != ypred_test[i]):
        print(i, y_test[i], ypred_test[i])

22 0 1
75 0 1
121 0 1
129 1 0
139 1 0
144 1 0
167 0 1
169 0 1


In [25]:
print("classes", logistic.classes_)
print("Total feature weights", logistic.coef_.shape)
print("Total Bias weights", logistic.intercept_.shape) 
print("Total iterations", logistic.n_iter_) 

classes [0 1]
Total feature weights (1, 30)
Total Bias weights (1,)
Total iterations [2234]


In [26]:
ypred_logproba= logistic.predict_log_proba(X_test[0:2])
print(ypred_logproba)
ypred_proba= logistic.predict_proba(X_test[0:2])
print(ypred_proba)

[[-6.11969553e+00 -2.20154706e-03]
 [-8.85242171e+00 -1.43045158e-04]]
[[2.19912543e-03 9.97800875e-01]
 [1.43034928e-04 9.99856965e-01]]


In [27]:
help (logistic)

Help on LogisticRegression in module sklearn.linear_model._logistic object:

class LogisticRegression(sklearn.base.BaseEstimator, sklearn.linear_model._base.LinearClassifierMixin, sklearn.linear_model._base.SparseCoefMixin)
 |  Logistic Regression (aka logit, MaxEnt) classifier.
 |  
 |  In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
 |  scheme if the 'multi_class' option is set to 'ovr', and uses the
 |  cross-entropy loss if the 'multi_class' option is set to 'multinomial'.
 |  (Currently the 'multinomial' option is supported only by the 'lbfgs',
 |  'sag', 'saga' and 'newton-cg' solvers.)
 |  
 |  This class implements regularized logistic regression using the
 |  'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note
 |  that regularization is applied by default**. It can handle both dense
 |  and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit
 |  floats for optimal performance; any other input format will be con

In [None]:
#try increasing accuracy
# 1. tune parameters for logistic api - Solver, C, penalty, class_weight
# 2. data normalization/standradization
# 3. feature selection - correlations