## OneHotEncoder

In [1]:
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Toy dataset: two categorical columns with four rows each.
data = dict(
    Feature1=['A', 'B', 'A', 'C'],
    Feature2=['High', 'Low', 'Medium', 'Low'],
)

In [3]:
import pandas as pd

In [4]:
# Build a DataFrame from the dict: each key becomes a column name.
df_data = pd.DataFrame(data)
# Bare last expression so the notebook renders the frame.
df_data

Unnamed: 0,Feature1,Feature2
0,A,High
1,B,Low
2,A,Medium
3,C,Low


In [5]:
# Show the three parts of the frame: row index, column labels, and the underlying values.
df_data.index, df_data.columns, df_data.values

(RangeIndex(start=0, stop=4, step=1),
 Index(['Feature1', 'Feature2'], dtype='object'),
 array([['A', 'High'],
        ['B', 'Low'],
        ['A', 'Medium'],
        ['C', 'Low']], dtype=object))

In [6]:
# Confirm that .values hands back a NumPy ndarray.
type(df_data.values)

numpy.ndarray

In [7]:
oneHotEncoder = OneHotEncoder() # instantiate the encoder with its default arguments

In [8]:
oneHotEncoder.fit(df_data[['Feature1']]) # fit: learn the categories present in Feature1

In [9]:
# Inspect the categories the encoder learned during fit (one array per input column).
oneHotEncoder.categories_

[array(['A', 'B', 'C'], dtype=object)]

In [10]:
# Encode Feature1 and convert the transform() result to a dense array with .toarray().
encoder_array = oneHotEncoder.transform(df_data[['Feature1']]).toarray()

# 'A', 'B', 'A', 'C' are now represented as one-hot vectors

In [11]:
# Optional check (uncomment to run): lists the column names the encoder
# generates for Feature1, which the next cell uses as DataFrame columns.
# oneHotEncoder.get_feature_names_out(['Feature1'])

In [12]:
# Wrap the dense one-hot array in a DataFrame whose column names come from the
# fitted encoder (Feature1_A, Feature1_B, Feature1_C).
# Reusing df_data's index keeps the rows aligned with df_data when the two
# frames are concatenated column-wise, even if df_data's index is not 0..n-1.
df_encoder = pd.DataFrame(
    encoder_array,
    index=df_data.index,
    columns=oneHotEncoder.get_feature_names_out(['Feature1']),
)
df_encoder

Unnamed: 0,Feature1_A,Feature1_B,Feature1_C
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0


In [13]:
# Place the one-hot columns next to the original columns (axis=1 concatenates column-wise).
pd.concat([df_data, df_encoder], axis=1)

Unnamed: 0,Feature1,Feature2,Feature1_A,Feature1_B,Feature1_C
0,A,High,1.0,0.0,0.0
1,B,Low,0.0,1.0,0.0
2,A,Medium,1.0,0.0,0.0
3,C,Low,0.0,0.0,1.0


## Imbalanced Data sampling

### Under-sampling: Tomek links

In [16]:
# Installation (uncomment and run if imbalanced-learn is not installed yet)
# %conda install imbalanced-learn

In [17]:
# TomekLinks: the under-sampling method used below
from imblearn.under_sampling import TomekLinks

In [18]:
from sklearn.datasets import make_classification

In [19]:
# Build a synthetic, imbalanced binary dataset to under-sample.
features, target = make_classification(
    n_samples=1000,            # total number of rows
    n_features=20,             # total number of columns
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],        # class proportions: ~10% class 0, ~90% class 1
    class_sep=2,
    flip_y=0,
    random_state=10,           # fixed seed so the generated data is reproducible
)

In [21]:
# Check the dimensions of the generated arrays.
features.shape, target.shape

# features (explanatory variables): 1000 samples, 20 variables

((1000, 20), (1000,))

In [23]:
# Counter (from the standard-library collections module) tallies how often each value occurs;
# it is used below to count the class labels in a NumPy array.
from collections import Counter

In [25]:
# Check how imbalanced the target is
Counter(target)
# class 0: 100 samples, class 1: 900 samples -> imbalanced data

Counter({0: 100, 1: 900})

In [26]:
tomekLinks = TomekLinks() # instantiate the under-sampler
features_resample, target_resample = tomekLinks.fit_resample(features, target) # fit and resample in one call

In [27]:
# Dimensions after under-sampling: 997 rows remain of the original 1000.
features_resample.shape, target_resample.shape

((997, 20), (997,))

In [28]:
# Class counts after under-sampling: class 1 drops from 900 to 897, class 0 stays at 100.
Counter(target_resample)

Counter({0: 100, 1: 897})