In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# 1. Logistic Regression

Source: <https://realpython.com/logistic-regression-python/#classification>

<!--
### $\hat{y}^{(i)}=\beta_{0}+\beta_{1}x^{(i)}_{1}+\ldots+\beta_{p}x^{(i)}_{p}$

### $ P(y^{(i)}=1)=\frac{1}{1+e^{-(\beta_{0}+\beta_{1}x^{(i)}_{1}+\ldots+\beta_{p}x^{(i)}_{p})}} $

### $ p(\mathbf{x}) = \frac{1}{1 + e^{-f(\mathbf{x})}} $

### $ f(\mathbf{x}) = \log \left( \frac{p(\mathbf{x})}{1 - p(\mathbf{x})} \right) $
//-->

## 1.1 scikit-learn: Logistic Regression
LogisticRegression(**C=1.0**, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
- 'liblinear' solver doesn’t work without regularization.
- 'newton-cg', 'sag', and 'lbfgs' don’t support L1 regularization; only 'liblinear' and 'saga' do.
- 'saga' is the only solver that supports elastic-net regularization.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

def plot_confusion_matrix(y, y_pred):
    """Print the confusion matrix of ``y`` vs ``y_pred`` and draw it as a heat map.

    Parameters
    ----------
    y : array-like
        Actual class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y``.

    Returns
    -------
    None
        The matrix is printed and a matplotlib figure is created as a side
        effect.
    """
    cm = confusion_matrix(y, y_pred)
    print(cm)

    # Size everything from the matrix itself so the function does not index
    # out of bounds when fewer (or more) than two classes are present.
    n_classes = cm.shape[0]
    ticks = list(range(n_classes))

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm)
    ax.grid(False)
    ax.xaxis.set(ticks=ticks,
                 ticklabels=[f'Predicted {k}s' for k in ticks])
    ax.yaxis.set(ticks=ticks,
                 ticklabels=[f'Actual {k}s' for k in ticks])
    # Explicit, inverted y-limits: row 0 on top and every row fully visible.
    ax.set_ylim(n_classes - 0.5, -0.5)
    for i in ticks:
        for j in ticks:
            ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
    return

def eval_model(model, x, y):
    """Print an evaluation summary of a fitted classifier on ``(x, y)``.

    Printed in order, separated by newlines: ``model.predict_proba(x)``,
    ``model.predict(x)``, ``model.score(x, y)`` and the text produced by
    ``classification_report(y, predictions)``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``, ``predict`` and ``score``
    x : feature matrix passed to the model
    y : true labels for ``x``

    Returns
    -------
    None
    """
    probabilities = model.predict_proba(x)
    predictions = model.predict(x)
    model_score = model.score(x, y)
    print(probabilities, '\n', predictions, '\n', model_score, '\n',
          classification_report(y, predictions))

In [None]:
# Toy data set: ten samples with one feature each (the integers 0..9 as a column)
x = np.arange(10)[:, np.newaxis]
print(x)

# Binary targets; sample 1 is labelled 1, so the classes are not separable
# by a single threshold on x.
# Separable alternative: y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
y = np.array([0, 1, 0, 0, 1, 1, 1, 1, 1, 1])
print(y)

In [None]:
# Build the liblinear logistic-regression classifier and fit it in one step
model = LogisticRegression(random_state=0, solver='liblinear').fit(x, y)

# Print probabilities, predicted labels, score and classification report
eval_model(model, x, y)
# plot_confusion_matrix(y, y_pred)  # disabled: y_pred is not defined in this cell

**Hyperparameter tuning: does C=10.0 (weaker regularization than the default C=1.0, since C is the inverse regularization strength) give better predictions?**

In [None]:
# Same classifier as before, but with C raised from 1.0 to 10.0
model = LogisticRegression(C=10.0, random_state=0, solver='liblinear').fit(x, y)

# Print probabilities, predicted labels, score and classification report
eval_model(model, x, y)
# plot_confusion_matrix(y, y_pred)  # disabled: y_pred is not defined in this cell

## 1.2 StatsModels: Logistic Regression

In [None]:
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import classification_report

# Same toy data as in section 1.1; the feature column gets a constant
# (intercept) column added via sm.add_constant.
y = np.array([0, 1, 0, 0, 1, 1, 1, 1, 1, 1])
x = sm.add_constant(np.arange(10).reshape(-1, 1))

# Set up the Logit model and fit it with the 'newton' optimizer
model = sm.Logit(y, x)
result = model.fit(method='newton')

# Evaluate the fitted model
result.predict(x)  # return value is not stored or displayed here
print(result.pred_table())

**Report with StatsModels**

In [None]:
# Show the fit summary of the StatsModels result; as the last expression in
# the cell it is rendered as the cell output.
result.summary()
# result.summary2()

**Report with scikit-learn**

In [None]:
# Turn the StatsModels probabilities into hard 0/1 labels (cut-off 0.5)
# and summarise them with scikit-learn's classification report.
probabilities = result.predict(x)
y_pred = (probabilities >= 0.5).astype(int)
report = classification_report(y, y_pred)
print(report)

## 1.3 ROC curve and AUC for binary classification with the Breast Cancer dataset

In [None]:
import pandas as pd
from sklearn import svm, datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

# Load the breast-cancer data set (features in .data, 0/1 labels in .target).
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

# Hold out a third of the samples for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=44)
clf = LogisticRegression(penalty='l2',
                         C=0.1,
                         max_iter=5000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Accuracy', metrics.accuracy_score(y_test, y_pred))

# Rows/columns of the confusion matrix follow the sorted class labels, and in
# this data set label 0 is 'malignant' and label 1 is 'benign'. Take the names
# from the data set itself so the table headers cannot get out of sync.
class_names = [str(name).capitalize() for name in breast_cancer.target_names]
cm = metrics.confusion_matrix(y_test, y_pred, labels=clf.classes_)
print(pd.DataFrame(cm,
                   columns=[f'Predicted {name}' for name in class_names],
                   index=[f'Actual {name}' for name in class_names]), '\n')

# For the binary case ravel() yields tn, fp, fn, tp, where the "positive"
# class is label 1 (benign here).
tn, fp, fn, tp = cm.ravel()
print('True Positives: ', tp,
      'False Positives: ', fp,
      'True Negatives: ', tn,
      'False Negatives: ', fn, '\n')

# Probability of the positive class (column 1) drives the ROC curve.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_proba)

auc = metrics.roc_auc_score(y_test, y_pred_proba)
print('FPR', fpr, '\n\n',
      'TPR', tpr, '\n\n',
      'threshold', threshold, '\n\n',
      'ROC-AUC', auc, '\n\n')

plt.plot(fpr, tpr, label="data 1, auc="+str(auc) )
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc=4)
plt.show()

# 2. Multiclass classification

## 2.1 One-vs-Rest

In [None]:
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
# Six 2-D samples, two per class label (0, 1, 2)
X = np.array([[10, 10], [8, 10],
              [-5, 5.5], [-5.4, 5.5],
              [-20, -20], [-15, -20]])
y = np.repeat([0, 1, 2], 2)

# Wrap an SVC in the one-vs-rest meta-estimator and fit it
clf = OneVsRestClassifier(SVC())
clf.fit(X, y)

# Predicted labels for three new points (shown as the cell output)
clf.predict([[-19, -20], [9, 9], [-5, 5]])

## 2.2 One-vs-One

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC

# Iris data with a shuffled, seeded split that holds out 33% for testing
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, shuffle=True, random_state=0)

# Wrap a LinearSVC in the one-vs-one meta-estimator and fit on the training part
clf = OneVsOneClassifier(LinearSVC(max_iter=10000, random_state=0))
clf.fit(X_train, y_train)

# Predicted labels for the first ten test samples (shown as the cell output)
clf.predict(X_test[:10])

## Notes: Feature selection - Embedded

Skombinovať výhody filtrov a wrapperov
- Model, ktorý sa trénuje, si bude priamo vyberať atribúty, ktoré sú pre neho najlepšie

Len málo modelov to podporuje
* Lineárne modely penalizované L1 (Lasso) alebo L1+L2 (Elastic Net) regularizáciou: SVM, Lineárna regresia, Logistická regresia ...

- Regularizácia zavádza do modelu penalizáciu za počet / veľkosť váh atribútov modelu. Nie je tam len chyba predikcie. Prirodzene sa tak vyberá jednoduchší model.