# Импорты

In [None]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from linear_regression_matrix import RidgeRegressionMatrix
from linear_regression import LinearRegression
from svm import SVMClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import classification_report, roc_curve, mean_squared_error, f1_score

warnings.filterwarnings("ignore")

# Функции для отрисовки

In [None]:
def show_histplot(data: pd.DataFrame):
    """Draw the 20-bin histograms of ``data`` on a 12x12 inch figure."""
    data.hist(figsize=(12, 12), bins=20)


def get_boxplot(df_column, column_name):
    """Show a box plot (with mean marker, whiskers at 1.0 * IQR) for one column.

    ``column_name`` is accepted for call-site symmetry but is not used here.
    """
    frame = pd.DataFrame(df_column)
    frame.boxplot(showmeans=True, whis=1.0, sym='o')
    plt.show()


def get_3d(param1: list[int], param2: list[int], result: list[int], name_param1: str, name_param2: str):
    """Plot ``result`` against two parameters as a green 3-D line and show it.

    The two parameter names are only used in the plot title.
    """
    plt.figure()
    axes_3d = plt.axes(projection='3d')
    title = f'Зависимость метрики R² от {name_param1} и {name_param2}'
    axes_3d.plot3D(param1, param2, result, 'green')
    axes_3d.set_title(title)
    plt.show()

def get_2d(param1: list[int], result: list[int], name_param1: str):
    """Plot ``result`` against ``param1`` on the current axes and title the plot."""
    plt.plot(param1, result)
    plt.title(f'Зависимость метрики/лосса  от {name_param1}')

def plot_variance(pca, width=8, dpi=100):
    """Plot per-component and cumulative explained variance of a fitted PCA.

    Args:
        pca: Fitted estimator exposing ``n_components_`` and
            ``explained_variance_ratio_``.
        width: Figure width in inches.
        dpi: Figure resolution in dots per inch.

    Returns:
        The pair of Axes: ``[0]`` holds the explained-variance bars,
        ``[1]`` the cumulative-variance curve.
    """
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    evr = pca.explained_variance_ratio_

    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )

    cv = np.cumsum(evr)
    # Prepend the origin so the cumulative curve starts at (0, 0).
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )

    # Apply the caller-supplied sizing instead of fixed constants.
    fig.set(figwidth=width, dpi=dpi)
    return axs

def distribution_plots(h, w, X_train):
    """Plot the distribution of every column of ``X_train`` on an h x w grid.

    Args:
        h: Number of grid rows.
        w: Number of grid columns.
        X_train: DataFrame whose columns are plotted, one per grid cell.

    Raises:
        ValueError: If the grid has fewer cells than ``X_train`` has columns.
    """
    columns = list(X_train.columns)
    if len(columns) > h * w:
        raise ValueError(
            f"A {h}x{w} grid cannot hold {len(columns)} columns"
        )

    # squeeze=False keeps the axes array 2-D even for a single row or column,
    # so flattening works for every grid shape.
    _, grid = plt.subplots(h, w, sharex=True, figsize=(8, 8), squeeze=False)
    axes_list = list(grid.ravel())

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) once the plots can be re-checked.
    for col, ax in zip(columns, axes_list):
        sns.distplot(X_train[col], ax=ax)

    # Remove the grid cells that received no plot.
    for ax in axes_list[len(columns):]:
        ax.remove()

# Метрики классификации

In [None]:

def output_metrics_classification(y_test: pd.Series, preds: pd.Series):
    """Return the classification report for ``preds`` vs ``y_test`` as a dict."""
    return classification_report(y_test, preds, output_dict=True)

def output_roc_auc(y_test: pd.Series, preds: pd.Series):
    """Plot the ROC curve of ``preds`` against ``y_test`` (positive label 1).

    The figure is written to ``ROC.png`` in the working directory and shown.
    """
    sns.set(font_scale=1.5)
    sns.set_color_codes("muted")

    plt.figure(figsize=(5, 4))
    # Only the two rates are drawn; the thresholds are not needed.
    fpr, tpr, _ = roc_curve(y_test, preds, pos_label=1)
    plt.plot(fpr, tpr, lw=2, label='ROC curve ')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1])

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')

    plt.savefig("ROC.png")
    plt.show()

# EDA

In [None]:
data = pd.read_csv('data/breast-cancer.csv')

# Encode the target as +1 (malignant, 'M') / -1 (benign, 'B'). A 0 label is
# mapped to -1 as well, so an already 0/1-encoded file gives the same target.
# The explicit cast keeps the column integer-typed regardless of how
# `replace` infers the resulting dtype.
data['diagnosis'] = data['diagnosis'].replace({'M': 1, 'B': -1, 0: -1}).astype(int)

# Independent copies: the target vector and the feature matrix without it.
y = data['diagnosis'].copy()
X = data.drop('diagnosis', axis=1)
data.describe()

## Проверяем пропущенные значения

In [None]:
# Total number of NaN cells across the whole frame.
missing_total = data.isna().sum().sum()
print(f"Number of missing value:{missing_total}")

## Смотрим распределение

In [None]:
# Draw the histograms of the loaded dataset.
show_histplot(data)

## Смотрим на выбросы

### Ящик с усами

In [None]:
# One box plot per column, skipping the target. A plain loop is used because
# the calls are made for their side effect only (no list of None is echoed).
for column in data.columns:
    if column != 'diagnosis':
        get_boxplot(data[column], column)

### Смотрим на выбросы в процентах

In [None]:
def find_outliers(df):
    """Report the share of IQR outliers for every numeric column of ``df``.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. Non-numeric columns are skipped because quantiles
    are undefined for them. One summary line per column is printed.

    Args:
        df: DataFrame to inspect.

    Returns:
        dict mapping column name to ``(percent_of_rows, outlier_count)``.
    """
    outliers = {}
    n_rows = len(df)
    for col in df.select_dtypes(include='number').columns:
        v = df[col]
        q1 = v.quantile(0.25)
        q3 = v.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers_count = ((v < lower_bound) | (v > upper_bound)).sum()
        # An empty frame has no outliers; avoid dividing by zero rows.
        perc = outliers_count * 100.0 / n_rows if n_rows else 0.0
        outliers[col] = (perc, outliers_count)
        print(f"Column {col} outliers = {perc:.2f}%")

    return outliers

# Print the per-column outlier share and keep the {column: (percent, count)} mapping.
outliers = find_outliers(data)

In [None]:
# Remove four feature columns from the working frame.
# NOTE(review): presumably chosen for their high outlier share reported above — confirm.
data = data.drop(['area_se', 'perimeter_se', 'radius_se', 'area_mean'], axis=1)

## Тепловая карта

In [None]:
# Spearman correlation between all columns, drawn on a color scale fixed to
# [-1, 1] and centered at 0.
spearman_corr = data.corr(method='spearman')
sns.heatmap(spearman_corr, cmap='coolwarm', center=0, vmin=-1, vmax=1)

**Вывод из тепловой карты:**

Такое ощущение, что B-признаки просто не влияют на таргет, но давайте посмотрим взаимную информацию, хотя интуитивно они должны влиять.

## Mutual Information

In [None]:
# Row-wise summary features. The base columns are captured once, so every
# statistic is computed over the original measurements only: aggregates added
# earlier in this cell (or by a previous run of it) are not fed into later
# ones, and the 'id' column is left out because it is not a measurement.
summary_names = ['mean', 'std', 'max', 'median']
base_cols = [c for c in X.columns if c != 'id' and c not in summary_names]
base_features = X[base_cols]

X['mean'] = base_features.mean(axis=1)
X['std'] = base_features.std(axis=1)
X['max'] = base_features.max(axis=1)
X['median'] = base_features.median(axis=1)

In [None]:
def make_mi_scores(X, y, discrete_features):
    """Return mutual-information scores of the features, highest first.

    Args:
        X: Feature DataFrame.
        y: Target values.
        discrete_features: Passed through to ``mutual_info_regression`` to
            mark which features are discrete.

    Returns:
        pd.Series named "MI Scores", indexed by column name and sorted in
        descending order.
    """
    # NOTE(review): the target in this notebook is a class label; consider
    # mutual_info_classif instead of the regression estimator — confirm.
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores, largest bar at the top."""
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

# Flag integer-typed columns as discrete. is_integer_dtype matches every
# integer width, whereas comparing a dtype with `int` only matches the
# platform-default one.
discrete_features = pd.Series(
    [pd.api.types.is_integer_dtype(dtype) for dtype in X.dtypes],
    index=X.columns,
)
mi_scores = make_mi_scores(X, y, discrete_features)
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)

In [None]:
# Show every third entry of the mutual-information ranking.
mi_scores[::3]

In [None]:
# Remove four more columns from the working frame.
# NOTE(review): presumably the low mutual-information features plus the identifier — confirm.
data = data.drop(['fractal_dimension_mean', 'fractal_dimension_se', 'compactness_se', 'id'], axis=1)

**Итог из взаимной информации:**

Вывод из тепловой карты оказался неправильным: как мы видим, взаимосвязь между таргетом и B-признаками всё-таки есть.

А еще нет смысла добавлять mean/max/std/median.

# Feature Engineering

## Scaling

Можно по-разному масштабировать признаки:

1) логарифмирование

2) MinMax

3) Standard scaling

In [None]:
def feature_transform(type_scaling, data):
    """Scale every column of ``data`` with the requested transformation.

    Args:
        type_scaling: ``'standard'`` (StandardScaler), ``'minmax'``
            (MinMaxScaler) or ``'log'`` (element-wise ``log1p``).
        data: DataFrame of values convertible to float.

    Returns:
        DataFrame with the same columns and index as ``data`` holding the
        transformed values.

    Raises:
        ValueError: If ``type_scaling`` is not one of the supported names.
    """
    # Factories are used so that only the requested transformer is built.
    factories = {
        'standard': StandardScaler,
        'minmax': MinMaxScaler,
        'log': lambda: FunctionTransformer(np.log1p, validate=True),
    }
    if type_scaling not in factories:
        raise ValueError(
            f"Unknown type_scaling {type_scaling!r}; expected one of {sorted(factories)}"
        )

    tran_fn = factories[type_scaling]()
    transfx_data = tran_fn.fit_transform(data.astype(float))
    # Keep the input index so the result stays aligned with other frames.
    return pd.DataFrame(transfx_data, columns=data.columns, index=data.index)

# Build the three scaled copies of the feature matrix in one pass.
X_minmax, X_standard, X_log = (
    feature_transform(kind, X) for kind in ('minmax', 'standard', 'log')
)

In [None]:
# Distributions of the first 11 log-transformed columns on a 4x3 grid.
distribution_plots(4,3, X_log.iloc[:,:11])

In [None]:
# Distributions of the first 11 standard-scaled columns on a 4x3 grid.
distribution_plots(4,3, X_standard.iloc[:,:11])

In [None]:
# Distributions of the first 11 min-max-scaled columns on a 4x3 grid.
distribution_plots(4,3, X_minmax.iloc[:,:11])

В таком случае лучше просто использовать MinMax

## PCA

In [None]:
# Standardize each column, then fit a PCA that keeps every component.
feature_mean = X.mean(axis=0)
feature_std = X.std(axis=0)
X_scaled = (X - feature_mean) / feature_std

pca = PCA()
scores = pca.fit_transform(X_scaled)
component_names = [f"PC{k}" for k in range(1, scores.shape[1] + 1)]
X_pca = pd.DataFrame(scores, columns=component_names)

# Loadings: one row per original feature, one column per component.
new_pca_df = pd.DataFrame(pca.components_.T, index=X.columns, columns=component_names)
plot_variance(pca)

После 6-й компоненты уровень объяснённой дисперсии почти не меняется.

In [None]:
# Standardize each column, then project onto the first 7 principal components.
feature_mean = X.mean(axis=0)
feature_std = X.std(axis=0)
X_scaled = (X - feature_mean) / feature_std

pca = PCA(n_components=7)
scores = pca.fit_transform(X_scaled)
component_names = [f"PC{k}" for k in range(1, scores.shape[1] + 1)]
X_pca = pd.DataFrame(scores, columns=component_names)

# Loadings: one row per original feature, one column per component.
pca_df = pd.DataFrame(pca.components_.T, index=X.columns, columns=component_names)
X_pca

# Матричное решение

## Обучение на MinMax данных

In [None]:
# 70/30 split of the MinMax-scaled features (this section trains on MinMax
# data, the scaling chosen in the Scaling section) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_minmax, y, test_size=0.3, random_state=42)

# Fit the closed-form ridge model and evaluate it on the held-out part.
rid_reg = RidgeRegressionMatrix(lambda_=0.1)
rid_reg.fit(X_train, y_train)
prob_predictions = rid_reg.predict_proba(X_test)
preds = rid_reg.predict(X_test)
print(classification_report(y_test, preds))
output_roc_auc(y_test, prob_predictions)

## Обучение на данных PCA

In [None]:
# 70/30 split of the PCA components with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

# Fit the ridge model on the components, then report the classification
# metrics and the ROC curve on the held-out part.
rid_reg = RidgeRegressionMatrix(lambda_=0.1)
rid_reg.fit(X_train, y_train)
prob_predictions = rid_reg.predict_proba(X_test)
preds = rid_reg.predict(X_test)
print(classification_report(y_test, preds))
output_roc_auc(y_test, prob_predictions)

# Линейная регрессия

In [None]:
# Keep the pandas split for models that take frames ...
X_train_pd, X_test_pd, y_train_pd, y_test_pd = train_test_split(
    X_pca, y, random_state=42, test_size=0.3
)
# ... and a NumPy copy of the same split for the array-based models.
X_train, X_test, y_train, y_test = (
    part.to_numpy() for part in (X_train_pd, X_test_pd, y_train_pd, y_test_pd)
)

## Ищем лучшие гиперпараметры

In [None]:
import itertools

# Hyperparameter grid for the custom linear regression.
rates = [0.1, 0.01]
epoches = [50, 100, 150, 200, 300]
batches = [50, 100, 150, 200, 300]

# NOTE(review): candidates are ranked by macro F1 on the test split, so the
# reported best score is optimistic — consider a separate validation split.
best_f1, best_params = -1, {}
for rate, epoche, batch in itertools.product(rates, epoches, batches):
    model_reg = LinearRegression(learning_rate=rate, max_epoches=epoche, size_batch=batch, eps=1e-7)
    model_reg.fit(X_train, y_train)
    linreg_preds = model_reg.predict(X_test)
    report = output_metrics_classification(y_test, linreg_preds)
    macro_f1 = report['macro avg']['f1-score']
    if macro_f1 > best_f1:
        best_f1 = macro_f1
        best_params.update(learning_rate=rate, max_epoches=epoche, size_batch=batch)



In [None]:
# Report the best hyperparameter combination found and its macro F1.
print(best_params)
print(f"best_f1 = {best_f1}")

# SVM

## Ищем лучшие гиперпараметры

In [None]:
# SVM: grid search over the regularization constant, learning rate and epochs.
c_arr = [0.1, 0.01]
lr_arr = [0.1, 0.01]
arr_epoches = [100, 150]
sigma = 1  # fixed for every candidate

# NOTE(review): candidates are ranked by macro F1 on the test split, so the
# reported best score is optimistic — consider a separate validation split.
best_f1 = -1
best_params = {}
for c, lr, epochs in itertools.product(c_arr, lr_arr, arr_epoches):
    classifier = SVMClassifier(c, lr, epochs, sigma)
    classifier.fit(X_train_pd, y_train_pd)
    # Predict from the test features (not from the test labels).
    svm_predicts = classifier.predict(X_test_pd)
    mean_f1 = output_metrics_classification(y_test_pd, svm_predicts)
    if mean_f1['macro avg']['f1-score'] > best_f1:
        best_f1 = mean_f1['macro avg']['f1-score']
        # Record this loop's own hyperparameters.
        best_params = {'c': c, 'learning_rate': lr, 'epochs': epochs, 'sigma': sigma}

print(best_params)
print(f"best_f1 = {best_f1}")

# Кривые обучения моих моделей

In [None]:
# Linear regression: train with fixed hyperparameters via fit_test, which
# also receives the test split.
# NOTE(review): presumably fit_test records the train_losses/test_losses that
# are plotted later; the meaning of the last argument (150) is not visible
# here — confirm.
model_reg = LinearRegression(learning_rate=0.1, max_epoches=150, size_batch=50, eps=0.000000001)
model_reg.fit_test(X_train, y_train, X_test, y_test, 150)

In [None]:
# SVM: train with fixed hyperparameters via fit_test on the pandas split.
# NOTE(review): presumably fit_test records the train_losses/test_losses that
# are plotted later — confirm.
model_svm = SVMClassifier(c=1, learning_rate=0.1, epochs=150, sigma=1)
model_svm.fit_test(X_train_pd, y_train_pd, X_test_pd, y_test_pd)

In [None]:
# Closed-form (matrix) linear regression: it is fitted once, so its test F1
# is repeated 150 times to draw a flat reference line on the epoch plot.
rid_reg = RidgeRegressionMatrix(lambda_=0.1)
rid_reg.fit(X_train, y_train)
preds = rid_reg.predict(X_test)
matrix_result = np.full(150, f1_score(y_test, preds))

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 2, figsize=(12, 6))
train_ax, test_ax = ax
custom_models = (('LinReg', model_reg), ('SVM', model_svm))

# Left panel: training loss curves of the two iterative models.
train_ax.set(title='Трейн', xlabel='Кол-во эпох', ylabel='Эмпирический риск')
for label, model in custom_models:
    train_ax.plot(model.train_losses, label=label)
train_ax.legend()

# Right panel: test curves, plus the flat line of the matrix solution.
test_ax.set(title='Тест', xlabel='Кол-во эпох', ylabel='F-1')
for label, model in custom_models:
    test_ax.plot(model.test_losses, label=label)
test_ax.plot(matrix_result, label='Матричный LinReg')
test_ax.legend()

# Библиотечные алгоритмы

In [None]:
# Logistic regression (lbfgs). With max_iter=1 and warm_start=True every fit
# call continues from the previous solution for at most one more iteration,
# so the test F1 can be tracked over 150 steps.
logreg = LogisticRegression(max_iter=1, warm_start=True, solver='lbfgs', random_state=42)
logreg_test_losses = []
for _ in range(150):
    logreg.fit(X_train_pd, y_train_pd)
    test_predictions = logreg.predict(X_test_pd)
    step_f1 = f1_score(y_test_pd, test_predictions)
    logreg_test_losses.append(step_f1)

In [None]:
# SGDClassifier: a linear classifier trained with stochastic gradient descent.
# It is fitted once, so its test F1 is repeated 150 times to draw a flat line.
# The seed is fixed, like in the other steps, to make the score reproducible.
matrix_reg = SGDClassifier(max_iter=150, random_state=42)
matrix_reg.fit(X_train_pd, y_train_pd)
test_predictions = matrix_reg.predict(X_test_pd)
sgd_test_losses = np.full(150, f1_score(y_test_pd, test_predictions))

In [None]:
from sklearn.svm import SVC

# Support vector machine (SVM).
# Refitting SVC on the same data yields the same model every time, so it is
# trained once and its test F1 is repeated for the 150 plotted epochs.
svm_sklearn = SVC()
svm_sklearn.fit(X_train_pd, y_train_pd)
y_test_pred = svm_sklearn.predict(X_test_pd)
svc_test_losses = [f1_score(y_test_pd, y_test_pred)] * 150

In [None]:
fig, ax = plt.subplots(figsize=(6, 6))

# Test F1 of the library models; each legend entry names the estimator that
# produced the curve.
ax.set_title('Тест')
ax.set_xlabel('Количество эпох')
ax.set_ylabel('F-1')
ax.plot(logreg_test_losses, label='LogisticRegression')
ax.plot(svc_test_losses, label='SVC')
ax.plot(sgd_test_losses, label='SGDClassifier')
ax.legend()

# Сравнение линейных уравнений регрессий и коэффициентов опорных векторов

In [None]:
# Compare the learned linear equations: MSE between the custom model's
# weights and the flattened sklearn logistic-regression coefficients.
# NOTE(review): assumes model_reg.w has the same length and feature order as
# logreg.coef_ (i.e. no bias term inside w) — confirm.
print('MSE: ', mean_squared_error(model_reg.w, logreg.coef_.flatten()))
