## Задача 1: Загрузка и изучение данных

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
data = load_breast_cancer()

In [None]:
df = pd.DataFrame(data.data, columns=data.feature_names)

In [None]:
df['target'] = data.target

In [None]:
print(df.shape)

In [None]:
print(df.columns)

In [None]:
df.isnull().sum()

## Задача 2: Первичный анализ данных

In [None]:
sns.countplot(x='target', data=df)

In [None]:
df.describe()

In [None]:
sns.scatterplot(x=df.columns[0], y=df.columns[1], hue='target', data=df)

## Задача 3: Предобработка данных

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
X = df.drop('target', axis=1)

In [None]:
y = df['target']

In [None]:
scaler = StandardScaler()

In [None]:
X_scaled = scaler.fit_transform(X)

In [None]:
# Split the raw features first, then fit the scaler on the training rows only,
# so that statistics of the held-out rows never leak into preprocessing.
# The test rows are transformed with the statistics learned from training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Задача 4: Обучение модели (логистическая регрессия)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train, y_train)

In [None]:
y_pred = lr.predict(X_test)

In [None]:
print('Accuracy:', accuracy_score(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

## Задача 5: Сравнение с другими моделями

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.svm import SVC

In [None]:
# Candidate models to compare. The random forest is seeded (same seed as the
# train/test split) so that its accuracy is reproducible between runs.
models = {
    'KNN': KNeighborsClassifier(),
    'RF': RandomForestClassifier(random_state=42),
    'SVC': SVC(),
}

In [None]:
accuracies = {}

In [None]:
for name, model in models.items():
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    accuracies[name] = acc

In [None]:
sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()))

## Задача 6: Кросс-валидация

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Local import: make_pipeline is only needed in this cell.
from sklearn.pipeline import make_pipeline

# Cross-validate on scaled features. The scaler sits inside a pipeline, so it
# is re-fitted on the training part of every fold and never sees that fold's
# validation rows. Passing raw X directly would evaluate the models on
# unscaled data, unlike the hold-out comparison, which trains on scaled data.
for name, model in models.items():
    pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(pipeline, X, y, cv=5)
    print(f'{name}: Mean CV accuracy = {scores.mean():.4f}')

## Задача 7: ROC-кривая и AUC

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# Fixed seed so that results derived from this forest are reproducible
# between runs.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

In [None]:
probs = model.predict_proba(X_test)[:, 1]

In [None]:
fpr, tpr, _ = roc_curve(y_test, probs)

In [None]:
roc_auc = auc(fpr, tpr)

In [None]:
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()

## Задача 8: Важность признаков

In [None]:
importances = model.feature_importances_

In [None]:
indices = np.argsort(importances)[-10:]

In [None]:
plt.barh(range(10), importances[indices], align='center')
plt.yticks(range(10), [data.feature_names[i] for i in indices])

## Задача 9: Настройка гиперпараметров

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
params = {'n_neighbors': range(1, 10)}

In [None]:
grid = GridSearchCV(KNeighborsClassifier(), param_grid=params, cv=5)

In [None]:
grid.fit(X_train, y_train)

In [None]:
print('Best params:', grid.best_params_)

In [None]:
print('Best score:', grid.best_score_)

## Задача 10: Выводы

In [None]:
# Ответьте на следующие вопросы:

In [None]:
# 1. Какая модель показала наибольшую точность?

In [None]:
# 2. Какие признаки оказались наиболее значимыми?

In [None]:
# 3. Как масштабирование повлияло на модели?

In [None]:
# 4. Что вы узнали из анализа ROC и AUC?