# Imbalanced data

In [None]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Notebook-wide plotting defaults: 16pt text and 12x8 inch figures.
plt.rcParams.update({
    'font.size': 16,
    'figure.figsize': [12, 8],
})

# Show estimators as diagrams when they are echoed by a cell.
sklearn.set_config(display='diagram')

## Load Mammography Data

In [None]:
from sklearn.datasets import fetch_openml

In [None]:
# Download the mammography dataset (OpenML data id 310).
# as_frame=False pins array outputs: later cells index the features
# positionally (X_train[rows, col]), which does not work on the pandas
# objects that newer scikit-learn versions return by default.
mammography = fetch_openml(data_id=310, as_frame=False)
X, y = mammography.data, mammography.target

In [None]:
# Binarize the target: 1 where the raw label equals the string '1', else 0.
y = (y == '1').astype(int)

In [None]:
# Sample count per class (position 0 -> label 0, position 1 -> label 1).
np.bincount(y)

## Split data into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Seeded split; stratifying on y keeps the class ratio the same in the
# training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    stratify=y,
)

### Base models
#### Linear model

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline: logistic regression with no imbalance handling.
# cv=10 matches the fold count used for the resampling pipelines further
# down, so baseline and resampled scores come from the same CV scheme.
base_log_reg = LogisticRegression(random_state=42)
log_reg_scores = cross_validate(
    base_log_reg, X_train, y_train, cv=10,
    scoring=['roc_auc', 'average_precision'])

In [None]:
# Show the raw cross_validate result for the baseline logistic regression.
log_reg_scores

In [None]:
# Baseline logistic regression: ROC AUC averaged over the CV folds.
log_reg_base_auc = np.mean(log_reg_scores['test_roc_auc'])
log_reg_base_auc

In [None]:
# Baseline logistic regression: average precision averaged over the CV folds.
log_reg_base_ap = np.mean(log_reg_scores['test_average_precision'])
log_reg_base_ap

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline: random forest with no imbalance handling.
# cv=10 matches the fold count used for the resampling pipelines further
# down, so baseline and resampled scores come from the same CV scheme.
base_rf = RandomForestClassifier(random_state=42)
rf_scores = cross_validate(
    base_rf, X_train, y_train, cv=10,
    scoring=['roc_auc', 'average_precision'])

In [None]:
# Baseline random forest: ROC AUC averaged over the CV folds.
rf_base_auc = np.mean(rf_scores['test_roc_auc'])
rf_base_auc

In [None]:
# Baseline random forest: average precision averaged over the CV folds.
rf_base_ap = np.mean(rf_scores['test_average_precision'])
rf_base_ap

### Imbalanced-learn samplers

#### Under sampler

In [None]:
# Class counts in the training split before any resampling.
np.bincount(y_train)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Random under-sampler that draws without replacement. The fixed seed keeps
# the selected subsample identical between runs, like the other seeded
# steps in this notebook.
under_sampler = RandomUnderSampler(replacement=False, random_state=42)

In [None]:
# fit_resample is the supported resampling entry point; the older
# fit_sample alias was deprecated and then removed from imbalanced-learn.
X_train_subsample, y_train_subsample = under_sampler.fit_resample(
    X_train, y_train)

In [None]:
# Shape of the original training features, for comparison with the resampled set.
X_train.shape

In [None]:
# Shape of the training features after random under-sampling.
X_train_subsample.shape

In [None]:
# Class counts after random under-sampling.
np.bincount(y_train_subsample)

#### Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Random over-sampler. The fixed seed keeps the duplicated samples
# identical between runs, like the other seeded steps in this notebook.
over_sampler = RandomOverSampler(random_state=42)

In [None]:
# fit_resample is the supported resampling entry point; the older
# fit_sample alias was deprecated and then removed from imbalanced-learn.
# The *_subsample names are reused here even though this set is over-sampled.
X_train_subsample, y_train_subsample = over_sampler.fit_resample(
    X_train, y_train
)

In [None]:
# Shape of the training features after random over-sampling.
X_train_subsample.shape

In [None]:
# Class counts after random over-sampling.
np.bincount(y_train_subsample)

## Pipelines with imblearn

### Linear model with under sampling

In [None]:
from imblearn.pipeline import make_pipeline as make_imb_pipeline

In [None]:
# Pipeline: random under-sampling followed by logistic regression.
# The sampler is seeded so repeated runs draw the same subsample; the
# classifier was already seeded.
under_log_reg = make_imb_pipeline(
    RandomUnderSampler(random_state=42),
    LogisticRegression(random_state=42),
)

In [None]:
# 10-fold cross-validation of the under-sampling + logistic regression
# pipeline, scored with ROC AUC and average precision.
under_log_reg_scores = cross_validate(
    under_log_reg,
    X_train,
    y_train,
    scoring=['roc_auc', 'average_precision'],
    cv=10,
)

In [None]:
# Re-display the baseline (AUC, AP) pair for comparison with the under-sampled scores.
log_reg_base_auc, log_reg_base_ap

In [None]:
# Under-sampled logistic regression: ROC AUC averaged over the CV folds.
under_log_reg_auc = np.mean(under_log_reg_scores['test_roc_auc'])
under_log_reg_auc

In [None]:
# Under-sampled logistic regression: average precision averaged over the CV folds.
under_log_reg_ap = np.mean(under_log_reg_scores['test_average_precision'])
under_log_reg_ap

### Random Forest with under sampling

In [None]:
# Pipeline: random under-sampling followed by a random forest.
# The sampler is seeded so repeated runs draw the same subsample; the
# classifier was already seeded.
under_rf = make_imb_pipeline(
    RandomUnderSampler(random_state=42),
    RandomForestClassifier(random_state=42),
)

In [None]:
# 10-fold cross-validation of the under-sampling + random forest pipeline,
# scored with ROC AUC and average precision.
under_rf_reg_scores = cross_validate(
    under_rf,
    X_train,
    y_train,
    scoring=['roc_auc', 'average_precision'],
    cv=10,
)

In [None]:
# Re-display the baseline random forest (AUC, AP) pair for comparison.
rf_base_auc, rf_base_ap

In [None]:
# Under-sampled random forest: ROC AUC averaged over the CV folds.
under_rf_auc = np.mean(under_rf_reg_scores['test_roc_auc'])
under_rf_auc

In [None]:
# Under-sampled random forest: average precision averaged over the CV folds.
under_rf_ap = np.mean(under_rf_reg_scores['test_average_precision'])
under_rf_ap

### Linear model with over sampling

In [None]:
# Pipeline: random over-sampling followed by logistic regression.
# The sampler is seeded so repeated runs duplicate the same samples; the
# classifier was already seeded.
over_log_reg = make_imb_pipeline(
    RandomOverSampler(random_state=42),
    LogisticRegression(random_state=42),
)

In [None]:
# 10-fold cross-validation of the over-sampling + logistic regression
# pipeline, scored with ROC AUC and average precision.
over_log_reg_scores = cross_validate(
    over_log_reg,
    X_train,
    y_train,
    scoring=['roc_auc', 'average_precision'],
    cv=10,
)

In [None]:
# Re-display the baseline (AUC, AP) pair for comparison with the over-sampled scores.
log_reg_base_auc, log_reg_base_ap

In [None]:
# Over-sampled logistic regression: ROC AUC averaged over the CV folds.
over_log_reg_auc = np.mean(over_log_reg_scores['test_roc_auc'])
over_log_reg_auc

In [None]:
# Over-sampled logistic regression: average precision averaged over the CV folds.
over_log_reg_ap = np.mean(over_log_reg_scores['test_average_precision'])
over_log_reg_ap

## Exercise 1

1. Use `make_imb_pipeline` with `RandomOverSampler` to create a pipeline with a random forest called `over_rf`.
1. Use `cross_validate` to compute `over_rf_auc` and `over_rf_ap`.

In [None]:
# %load solutions/02-ex01-solutions.py

## Plotting curves for logistic regression

In [None]:
# Fit the baseline model and both resampling pipelines on the training split.
base_log_reg.fit(X_train, y_train)
under_log_reg.fit(X_train, y_train)
# The trailing semicolon keeps the notebook from echoing the fitted estimator.
over_log_reg.fit(X_train, y_train);

In [None]:
# Default score of the baseline model on the held-out test split
# (for scikit-learn classifiers, score() reports mean accuracy).
base_log_reg.score(X_test, y_test)

### Plotting

In [None]:
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve

In [None]:
# Left axis: ROC curves; right axis: precision-recall curves. All three
# logistic regression variants are evaluated on the test split.
# NOTE(review): plot_roc_curve / plot_precision_recall_curve were removed in
# scikit-learn 1.2; RocCurveDisplay.from_estimator and
# PrecisionRecallDisplay.from_estimator are the replacements there.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_log_reg, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(under_log_reg, X_test, y_test, ax=ax1, name="undersampling")
plot_roc_curve(over_log_reg, X_test, y_test, ax=ax1, name="oversampling")

plot_precision_recall_curve(base_log_reg, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(under_log_reg, X_test, y_test, ax=ax2, name="undersampling")
plot_precision_recall_curve(over_log_reg, X_test, y_test, ax=ax2, name="oversampling");

## Exercise 2

1. Train the three random forest models, `base_rf`, `under_rf`, `over_rf`.
1. Plot the ROC and precision-recall curves for the three random forest models.

In [None]:
# %load solutions/02-ex02-solutions.py

## Class-Weights

#### Linear model with class weights

In [None]:
# Logistic regression that handles imbalance through class_weight='balanced'
# instead of resampling; fitted on the untouched training split.
class_weight_log_reg = LogisticRegression(
    random_state=42,
    class_weight='balanced',
)
class_weight_log_reg.fit(X_train, y_train)

In [None]:
# Baseline vs class-weighted logistic regression on the test split:
# ROC curves on the left axis, precision-recall curves on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_log_reg, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(class_weight_log_reg, X_test, y_test, ax=ax1, name="class-weighted")

plot_precision_recall_curve(base_log_reg, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(class_weight_log_reg, X_test, y_test, ax=ax2, name="class-weighted")

#### Random forest with class weights 

In [None]:
# Random forest that handles imbalance through class_weight='balanced'
# instead of resampling; fitted on the untouched training split.
class_weight_rf = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
)
class_weight_rf.fit(X_train, y_train)

In [None]:
# Baseline vs class-weighted random forest on the test split:
# ROC curves on the left axis, precision-recall curves on the right.
# NOTE(review): base_rf is not fitted in any cell shown here; presumably the
# Exercise 2 solution fits it - confirm before running this cell on its own.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_rf, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(class_weight_rf, X_test, y_test, ax=ax1, name="class-weighted")

plot_precision_recall_curve(base_rf, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(class_weight_rf, X_test, y_test, ax=ax2, name="class-weighted")

## Ensemble Resampling

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [None]:
# Seeded balanced random forest from imbalanced-learn, trained on the
# untouched training split.
balanced_rf = BalancedRandomForestClassifier(
    random_state=0,
)
balanced_rf.fit(X_train, y_train)

In [None]:
# Compare four random forest variants on the test split:
# ROC curves on the left axis, precision-recall curves on the right.
# NOTE(review): over_rf is not defined in any cell shown here; presumably it
# comes from the Exercise 1 solution, and the fitted base_rf / under_rf /
# over_rf from the Exercise 2 solution - confirm.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_rf, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(under_rf, X_test, y_test, ax=ax1, name="undersampling")
plot_roc_curve(over_rf, X_test, y_test, ax=ax1, name="oversampling")
plot_roc_curve(balanced_rf, X_test, y_test, ax=ax1, name="balanced bagging")

plot_precision_recall_curve(base_rf, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(under_rf, X_test, y_test, ax=ax2, name="undersampling")
plot_precision_recall_curve(over_rf, X_test, y_test, ax=ax2, name="oversampling");
plot_precision_recall_curve(balanced_rf, X_test, y_test, ax=ax2, name="balanced bagging")

### Comparing classification reports

In [None]:
from sklearn.metrics import classification_report

# Classification report on the test split for the under-sampled random forest.
print(classification_report(y_test, under_rf.predict(X_test)))

In [None]:
# Classification report on the test split for the over-sampled random forest.
print(classification_report(y_test, over_rf.predict(X_test)))

In [None]:
# Classification report on the test split for the baseline random forest.
print(classification_report(y_test, base_rf.predict(X_test)))

In [None]:
# Classification report on the test split for the balanced random forest.
print(classification_report(y_test, balanced_rf.predict(X_test)))

## SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Seeded SMOTE so the generated samples are the same on every run.
# fit_resample is the supported entry point; the older fit_sample alias was
# deprecated and then removed from imbalanced-learn.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Shape of the SMOTE-resampled training features.
X_train_smote.shape

In [None]:
# Class counts after SMOTE resampling.
np.bincount(y_train_smote)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Convert to plain NumPy arrays first: the [rows, column] positional indexing
# below is not valid on a pandas DataFrame, and a pandas Series would look
# `sorting` up by label rather than by position.
X_train_arr, y_train_arr = np.asarray(X_train), np.asarray(y_train)
X_smote_arr, y_smote_arr = np.asarray(X_train_smote), np.asarray(y_train_smote)

# Order the samples by class label so class 1 is handed to scatter last.
sorting = np.argsort(y_train_arr)

# Features at column positions 3 and 4, coloured by class label.
axes[0].set_title("Original")
axes[0].scatter(X_train_arr[sorting, 3], X_train_arr[sorting, 4],
                c=plt.cm.tab10(y_train_arr[sorting]), alpha=.3, s=2)

axes[1].set_title("SMOTE")
axes[1].scatter(X_smote_arr[:, 3], X_smote_arr[:, 4],
                c=plt.cm.tab10(y_smote_arr), alpha=.1, s=2)

In [None]:
# Re-display the baseline (AUC, AP) pair for comparison with the SMOTE scores.
log_reg_base_auc, log_reg_base_ap

In [None]:
# Pipeline: SMOTE followed by logistic regression. SMOTE is seeded so the
# generated samples, and therefore the CV scores, are the same on every run.
smote_log_reg = make_imb_pipeline(
    SMOTE(random_state=42), LogisticRegression(random_state=42))

# 10-fold cross-validation, then the per-metric mean over the folds.
smote_log_reg_scores = cross_validate(smote_log_reg, X_train, y_train, cv=10,
                        scoring=('roc_auc', 'average_precision'))
smote_log_reg_auc = smote_log_reg_scores['test_roc_auc'].mean()
smote_log_reg_ap = smote_log_reg_scores['test_average_precision'].mean()

In [None]:
# Mean CV (AUC, AP) for the SMOTE + logistic regression pipeline.
smote_log_reg_auc, smote_log_reg_ap

In [None]:
# Pipeline: SMOTE followed by a random forest. SMOTE is seeded so the
# generated samples, and therefore the CV scores, are the same on every run.
smote_rf = make_imb_pipeline(
    SMOTE(random_state=42), RandomForestClassifier(random_state=42))

# 10-fold cross-validation, then the per-metric mean over the folds.
smote_rf_scores = cross_validate(smote_rf, X_train, y_train, cv=10,
                        scoring=('roc_auc', 'average_precision'))
smote_rf_auc = smote_rf_scores['test_roc_auc'].mean()
smote_rf_ap = smote_rf_scores['test_average_precision'].mean()

In [None]:
# Mean CV (AUC, AP) for the SMOTE + random forest pipeline.
smote_rf_auc, smote_rf_ap

## Plotting all the versions of random forest

In [None]:
# Fit the SMOTE + random forest pipeline on the training split so it can be
# evaluated on the test split.
smote_rf.fit(X_train, y_train)

In [None]:
# Compare all five random forest variants on the test split:
# ROC curves on the left axis, precision-recall curves on the right.
# NOTE(review): over_rf and the fitted base_rf / under_rf are not produced by
# any cell shown here; presumably they come from the exercise solutions.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8))
plot_roc_curve(base_rf, X_test, y_test, ax=ax1, name="original")
plot_roc_curve(under_rf, X_test, y_test, ax=ax1, name="undersampling")
plot_roc_curve(over_rf, X_test, y_test, ax=ax1, name="oversampling")
plot_roc_curve(balanced_rf, X_test, y_test, ax=ax1, name="balanced bagging")
plot_roc_curve(smote_rf, X_test, y_test, ax=ax1, name="smote")

plot_precision_recall_curve(base_rf, X_test, y_test, ax=ax2, name="original")
plot_precision_recall_curve(under_rf, X_test, y_test, ax=ax2, name="undersampling")
plot_precision_recall_curve(over_rf, X_test, y_test, ax=ax2, name="oversampling");
plot_precision_recall_curve(balanced_rf, X_test, y_test, ax=ax2, name="balanced bagging")
plot_precision_recall_curve(smote_rf, X_test, y_test, ax=ax2, name="smote")

## Exercise 3

1. Train a `HistGradientBoostingClassifier` on the training set.

```py
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
```

1. Construct a pipeline with `SMOTE` and `HistGradientBoostingClassifier` and fit it on the training set.
1. Plot the ROC and PR curves between the two models.

In [None]:
# %load solutions/02-ex03-solutions.py