In [1]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.combine import *
from imblearn.under_sampling import TomekLinks


In [2]:
def count_and_plot(y):
    """Print the class distribution of ``y`` and display it as a bar chart.

    Parameters
    ----------
    y : sized iterable of hashable labels
        Target labels; every distinct value is counted as one class.

    The ``Counter`` of labels is printed first, followed by one line per
    class giving its count and its share of ``len(y)`` as a percentage.
    A bar chart of the counts is then drawn and shown with ``plt.show()``.
    Nothing is returned.
    """
    class_counts = Counter(y)
    print(class_counts)

    for label, count in class_counts.items():
        # Share of this class among all samples, in percent.
        share = count / len(y) * 100
        print('Class=%s, n=%d (%.3f%%)' % (label, count, share))

    labels, heights = class_counts.keys(), class_counts.values()
    plt.bar(labels, heights)
    plt.show()

In [3]:
# Random over-sampling method
def random_oversampling1(path):
    """Balance the classes of a CSV dataset by random over-sampling.

    The CSV at ``path`` is read with pandas. The column at index 1 is used as
    the single feature and the last column as the class label. The class
    distribution is printed and plotted before and after resampling.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    list
        ``[X_resampled, y_resampled]`` as produced by
        ``RandomOverSampler(random_state=0).fit_resample``.
    """
    df = pd.read_csv(path)
    data = df.values
    # The sampler is given a single-column 2-D feature matrix. Let numpy infer
    # the row count (-1) instead of hard-coding 4277, so the function works for
    # a CSV of any length.
    X = data[:, 1].reshape(-1, 1)
    y = data[:, -1]
    count_and_plot(y)

    # Start over-sampling (fixed seed keeps the result reproducible).
    X_resampled, y_resampled = RandomOverSampler(random_state=0).fit_resample(X, y)
    count_and_plot(y_resampled)
    return [X_resampled, y_resampled]

In [4]:
# SMOTE algorithm method
def smote_oversampling(path):
    """Balance the classes of a CSV dataset with SMOTE.

    The CSV at ``path`` is read with pandas. The column at index 1 is used as
    the single feature: each value is cut to its first five characters and
    parsed as a float. The last column is the class label. The class
    distribution is printed and plotted before and after resampling.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    list
        ``[X_resampled, y_resampled]`` as produced by
        ``SMOTE(random_state=2, k_neighbors=3).fit_resample``.

    Raises
    ------
    TypeError, ValueError
        If a feature value cannot be sliced or its first five characters do
        not parse as a float.
    """
    df = pd.read_csv(path)
    data = df.values
    raw_feature, y = data[:, 1], data[:, -1]

    # Truncate every raw value to five characters and parse it. Building a new
    # array avoids writing the truncated values back into `data`.
    X = np.array([float(value[:5]) for value in raw_feature], dtype=np.float64)
    # Single-column 2-D feature matrix; -1 lets numpy infer the row count
    # instead of the previously hard-coded 4277.
    X = X.reshape(-1, 1)

    count_and_plot(y)

    # Start over-sampling.
    X_resampled, y_resampled = SMOTE(random_state=2, k_neighbors=3).fit_resample(X, y)
    count_and_plot(y_resampled)
    return [X_resampled, y_resampled]

In [1]:
def smotetomek_oversampling(path):
    """Balance the classes of a CSV dataset with SMOTE followed by Tomek links.

    The CSV at ``path`` is read with pandas. The column at index 1 is used as
    the single feature: each value is cut to its first five characters and
    parsed as a float. The last column is the class label. The class
    distribution is printed and plotted before and after resampling.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    list
        ``[X_resampled, y_resampled]`` as produced by
        ``SMOTETomek(...).fit_resample``.

    Raises
    ------
    TypeError, ValueError
        If a feature value cannot be sliced or its first five characters do
        not parse as a float.
    """
    df = pd.read_csv(path)
    data = df.values
    raw_feature, y = data[:, 1], data[:, -1]

    # Truncate every raw value to five characters and parse it. Building a new
    # array avoids writing the truncated values back into `data`.
    X = np.array([float(value[:5]) for value in raw_feature], dtype=np.float64)
    # Single-column 2-D feature matrix; -1 lets numpy infer the row count
    # instead of the previously hard-coded 4277.
    X = X.reshape(-1, 1)

    count_and_plot(y)

    # Start over-sampling.
    # The combined sampler uses the supplied SMOTE instance as configured and
    # does not pass its own random_state on to it, so the seed is set on the
    # SMOTE object as well to make the result reproducible.
    sampler = SMOTETomek(random_state=0, smote=SMOTE(random_state=0, k_neighbors=3))
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    count_and_plot(y_resampled)
    return [X_resampled, y_resampled]

In [6]:
def smoteenn_oversampling(path):
    """Balance the classes of a CSV dataset with SMOTE followed by ENN cleaning.

    The CSV at ``path`` is read with pandas. The column at index 1 is used as
    the single feature: each value is cut to its first five characters and
    parsed as a float. The last column is the class label. The class
    distribution is printed and plotted before and after resampling.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    list
        ``[X_resampled, y_resampled]`` as produced by
        ``SMOTEENN(...).fit_resample``.

    Raises
    ------
    TypeError, ValueError
        If a feature value cannot be sliced or its first five characters do
        not parse as a float.
    """
    df = pd.read_csv(path)
    data = df.values
    raw_feature, y = data[:, 1], data[:, -1]

    # Truncate every raw value to five characters and parse it. Building a new
    # array avoids writing the truncated values back into `data`.
    X = np.array([float(value[:5]) for value in raw_feature], dtype=np.float64)
    # Single-column 2-D feature matrix; -1 lets numpy infer the row count
    # instead of the previously hard-coded 4277.
    X = X.reshape(-1, 1)

    count_and_plot(y)

    # Start over-sampling.
    # The combined sampler uses the supplied SMOTE instance as configured and
    # does not pass its own random_state on to it, so the seed is set on the
    # SMOTE object as well to make the result reproducible.
    sampler = SMOTEENN(random_state=0, smote=SMOTE(random_state=0, k_neighbors=3))
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    count_and_plot(y_resampled)
    return [X_resampled, y_resampled]