# Learning from Imbalanced Data
<br><img align="left" src="http://drive.google.com/uc?export=view&id=1DzKZ9ufrjHd9JtsoSvy8ayCsib54FFgr" width=800 height=600>

### Approaches
- **Sampling**
  - Random Undersampling
  - Random Oversampling
  - SMOTE
  - Tomek Links
  - SMOTE + Tomek Links
  - Using GANs (Generative Adversarial Networks)
- **Algorithms**
  - Cost-sensitive learning methods
  - Kernel-based methods

### Sampling heuristics
- Consider testing under-sampling when you have a lot of data (tens or hundreds of thousands of instances or more)
- Consider testing over-sampling when you don’t have a lot of data (tens of thousands of records or less)
- Consider testing random and non-random (e.g. stratified) sampling schemes.
- Consider testing different resampled ratios (e.g. you don’t have to target a 1:1 ratio in a binary classification problem, try other ratios)

### Imbalanced-learn
- imbalanced data 문제를 해결하기 위한 다양한 샘플링 방법을 구현한 파이썬 패키지
- 설치방법: `pip install -U imbalanced-learn`

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from collections import Counter
from sklearn.svm import SVC

# Synthetic two-feature binary problem; the requested class proportions are
# 0.99 / 0.01, so the second class is rare.
X00, y00 = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=1,
    class_sep=0.8,
    weights=[0.99, 0.01],
    random_state=0,
)
# Hold out 25% of the samples; X0 / y0 are the training portion.
X0, X_test, y0, y_test = train_test_split(
    X00, y00, test_size=0.25, random_state=12345
)

# Plot-grid bounds: the full range of each feature over all samples, padded by
# 2 on both sides. X00 has exactly two columns, so each unpack yields one value
# per feature.
x1_min, x2_min = X00.min(axis=0) - 2
x1_max, x2_max = X00.max(axis=0) + 2

def _draw_decision_regions(X, y, XX, YY, params):
    """Fit an SVC on (X, y) and draw it on the current matplotlib axes.

    Shades the class predicted at every point of the (XX, YY) grid, overlays
    the samples coloured by their label, and sets the axes title to the class
    counts of ``y``.

    Returns:
        The fitted ``SVC`` model.
    """
    model = SVC(**params).fit(X, y)
    Z = model.predict(np.c_[XX.ravel(), YY.ravel()]).reshape(XX.shape)
    plt.contourf(XX, YY, Z, alpha=0.6)
    plt.scatter(X[:, 0], X[:, 1], marker='o', c=y, s=40,
                linewidth=1, edgecolor='gray', alpha=0.7)
    plt.title(Counter(y))
    return model


def plot_samples(X=None, y=None):
    """Plot SVC decision regions for the original and, optionally, resampled data.

    Reads the module-level training split ``X0`` / ``y0`` and the grid bounds
    ``x1_min``, ``x1_max``, ``x2_min``, ``x2_max``.

    Args:
        X: Resampled feature matrix. When ``None``, only the original training
            split is fitted and plotted, in a single figure.
        y: Labels matching ``X``. Only used when ``X`` is given.

    Returns:
        Predictions of the last fitted model on the data it was fitted on:
        ``X0`` when ``X`` is ``None``, otherwise ``X``.
    """
    XX, YY = np.mgrid[x1_min:x1_max:300j, x2_min:x2_max:300j]
    # Swap in {'kernel': 'rbf', 'gamma': 1} to try a non-linear boundary.
    params = {'kernel': 'linear'}
    if X is None:
        plt.figure(figsize=(7, 7))
        X = X0
        model = _draw_decision_regions(X0, y0, XX, YY, params)
    else:
        plt.figure(figsize=(14, 7))
        # Left panel: model fitted on the original training split.
        plt.subplot(121)
        _draw_decision_regions(X0, y0, XX, YY, params)
        plt.xlim(-2, 4)
        plt.ylim(-3, 4)
        # Right panel: model fitted on the resampled data; both panels share
        # the same fixed axis limits so they can be compared side by side.
        plt.subplot(122)
        model = _draw_decision_regions(X, y, XX, YY, params)
        plt.xlim(-2, 4)
        plt.ylim(-3, 4)
        plt.tight_layout()
    plt.show()

    return model.predict(X)
    
# Baseline: fit and plot on the original (non-resampled) training split.
# plot_samples() returns predictions for the same samples the model was fitted
# on, so this report describes fit to the training data, not held-out accuracy.
y_pred = plot_samples()
print(classification_report(y0, y_pred))

<font color = "#CC3D3D">
### SMOTE: an over-sampling class method
<br><img align="left" src="http://drive.google.com/uc?export=view&id=1K_oqSphPKgP7uWsjjbp9Gs6jEbYwzuzt" width=400 height=300>

In [None]:
from imblearn.over_sampling import *

# Resample the training split with SMOTE, then fit, plot and report on the
# resampled data. `fit_resample` is the current imbalanced-learn API; the old
# `fit_sample` alias has been removed from the library.
X, y = SMOTE(random_state=0, k_neighbors=5).fit_resample(X0, y0)
y_pred = plot_samples(X, y)
print(classification_report(y, y_pred))

<font color = "#CC3D3D">
### Tomek links: an under-sampling class method
<br><img align="left" src="http://drive.google.com/uc?export=view&id=1ZbHsYSK1_SjXPM-rxgWy3PB2a1I-sfEz" width=500 height=300>    

In [None]:
from imblearn.under_sampling import *

# Remove Tomek-link samples from every class of the training split, then fit,
# plot and report on the cleaned data.
# TomekLinks no longer accepts `random_state`, so passing it raises TypeError
# on current imbalanced-learn. `fit_resample` replaces the removed `fit_sample`.
X, y = TomekLinks(sampling_strategy='all').fit_resample(X0, y0)
y_pred = plot_samples(X, y)
print(classification_report(y, y_pred))

<font color = "#CC3D3D">
### SMOTE + Tomek

In [None]:
from imblearn.combine import *

# Combine SMOTE over-sampling with Tomek-link cleaning on the training split,
# then fit, plot and report on the result. `fit_resample` replaces the removed
# `fit_sample` alias.
X, y = SMOTETomek(random_state=0, sampling_strategy='all').fit_resample(X0, y0)
y_pred = plot_samples(X, y)
print(classification_report(y, y_pred))

##### Exercise

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits

In [None]:
# Build an imbalanced binary problem from the digits dataset
digits = load_digits()
y = digits.target == 9  # treat the digit 9 as the positive class

In [None]:
# Split into training and evaluation sets.
# Note: this rebinds X_test / y_test, which earlier held the synthetic hold-out split.
X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state=0)
X_train.shape, X_test.shape

In [None]:
# Add synthetic samples to the training set with SMOTE + Tomek links.
# `fit_resample` replaces the `fit_sample` alias removed from imbalanced-learn.
sm = SMOTETomek(random_state=0)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
X_resampled.shape

In [None]:
# Model performance when training on the original data.
# ROC AUC is computed from the predicted probability of the positive class:
# feeding it hard 0/1 predictions reduces the ROC curve to a single threshold
# and misreports the ranking quality. Accuracy and F1 still use hard labels.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)[:, 1]  # column 1 is the positive (True) class
accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_score), f1_score(y_test, y_pred)

In [None]:
# Model performance when training on the resampled (synthetic-augmented) data.
# ROC AUC is computed from the predicted probability of the positive class:
# feeding it hard 0/1 predictions reduces the ROC curve to a single threshold
# and misreports the ranking quality. Accuracy and F1 still use hard labels.
clf = RandomForestClassifier(random_state=0).fit(X_resampled, y_resampled)
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)[:, 1]  # column 1 is the positive (True) class
accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_score), f1_score(y_test, y_pred)

In [None]:
# Imbalanced learning 방법을 변경해보자. 

In [None]:
# Classification 알고리즘을 변경해보자.

In [None]:
# imbalance의 정도를 바꿔보자.