In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Загрузим готовый датасет Breast Cancer из библиотеки Sklearn

In [2]:
from sklearn.datasets import load_breast_cancer

# Load the Breast Cancer dataset object bundled with scikit-learn.
cancer = load_breast_cancer()

# Unpack the feature matrix, the target vector and the feature names.
X, y, features = cancer.data, cancer.target, cancer.feature_names

# Sanity check: report the dimensions of both arrays and list the feature names.
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)
print("Features:\n", features)

Shape of X:  (569, 30)
Shape of y:  (569,)
Features:
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


## Разделим данные на обучающую (train) и тестовую (test) выборки в пропорции 80% к 20%

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test split. The rows are shuffled
# before splitting, and the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

## Запустим на тренировочных данных логистическую регрессию и сделаем предсказания для проверочной выборки

In [4]:
from sklearn.linear_model import LogisticRegression

# Build the classifier with a generous iteration cap and a tight stopping
# tolerance, then fit it on the training split (fit() returns the estimator).
# NOTE(review): the large max_iter is presumably needed because the features
# are not scaled before fitting - confirm.
log_regr = LogisticRegression(max_iter=10000, tol=1e-5).fit(X_train, y_train)

# Predict class labels for the held-out test split.
y_pred = log_regr.predict(X_test)

### Чтобы оценить качество работы модели, рассчитаем такие метрики, как матрица ошибок (confusion matrix), доля правильных ответов (accuracy), точность (precision), полнота/чувствительность (recall) и F1-мера (F1 score).

In [5]:
# All evaluation metrics live in sklearn.metrics. Import the ones used here
# by name so it is clear where each function comes from and the notebook
# namespace is not flooded with unrelated names.
# NOTE(review): if other cells use further sklearn.metrics functions, add
# them to this import list.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix: rows are the true classes, columns the predicted classes.
cmat = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n" , cmat)

# accuracy: (tp + tn) / (tp + tn + fp + fn)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")
# precision: tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.3f}")
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.3f}")
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)
print(f"F1 score: {f1:.3f}")

Confusion matrix:
 [[46  1]
 [ 5 62]]
Accuracy: 0.947
Precision: 0.984
Recall: 0.925
F1 score: 0.954
