In [1]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

In [3]:
# Four observations of one nominal feature, arranged as a single column (shape (4, 1)).
feature = np.array(['가나다라', '가나다라', '아바하자', '카나다사']).reshape(-1, 1)

# Fit and binarize in one step; the encoded matrix itself is not kept here,
# only the list of classes the binarizer learned is printed.
one_hot = LabelBinarizer()
one_hot.fit_transform(feature)
print(one_hot.classes_)

['가나다라' '아바하자' '카나다사']


In [5]:
# Five observations, each tagged with a pair of labels.
multiclass_feat = [
    ('가나다라마', '아자바하나'),
    ('아바다카다', '사바하마사'),
    ('가나다라마', '아자바하나'),
    ('가나다라마', '라마타카나'),
    ('가나다라마', '아자바하나'),
]

# Fit and binarize in one step; the indicator matrix is discarded and only the
# learned label vocabulary is printed.
one_hot_mult = MultiLabelBinarizer()
one_hot_mult.fit_transform(multiclass_feat)
print(one_hot_mult.classes_)

['가나다라마' '라마타카나' '사바하마사' '아바다카다' '아자바하나']


String Target data one-hot

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Two categorical columns per row: a string label and an integer code.
str_feat = [
    ['안녕', 1],
    ['저녁', 2],
    ['안녕', 1],
    ['만남', 3],
]

# Request dense output. scikit-learn 1.2 renamed the `sparse` keyword to
# `sparse_output` and 1.4 removed the old spelling, so `sparse=False` raises
# TypeError on current releases. Try the current name first and fall back only
# on releases that predate it.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

# The encoded matrix is not kept; only the categories learned per column are printed.
one_hot_encoder.fit_transform(str_feat)
print(one_hot_encoder.categories_)

[array(['만남', '안녕', '저녁'], dtype=object), array([1, 2, 3], dtype=object)]




Ordinal categorical data

In [8]:
import pandas as pd

# A single ordinal column whose levels have a natural order: Low < Medium < High.
df = pd.DataFrame({'Score': ['Low', 'Low', 'High', 'High', 'Low', 'Medium']})

# Integer rank for each level, counting up from 1 in ascending order.
scale_mapper = {
    level: rank
    for rank, level in enumerate(("Low", "Medium", "High"), start=1)
}

In [9]:
# Encode the ordinal levels as integers. `Series.map` does the lookup directly
# and yields an integer column; `Series.replace` only reached int64 through an
# object-to-int downcast that pandas 2.2 deprecated.
# Caveat: `map` turns a level missing from `scale_mapper` into NaN, whereas
# `replace` would have left the original string in place.
data = df["Score"].map(scale_mapper)
print(data)

0    1
1    1
2    3
3    3
4    1
5    2
Name: Score, dtype: int64


In [10]:
from sklearn.feature_extraction import DictVectorizer

In [19]:
# One dict per sample; a colour that does not occur in a sample simply has no key.
data_dict = [
    {"Red": 2, "Blue": 4},
    {"Red": 4, "Blue": 3},
    {"Red": 1, "Yellow": 2},
    {"Red": 1, "Yellow": 2},
]

# Dense feature matrix built from the dicts.
dictVec = DictVectorizer(sparse=False)
feat_dict = dictVec.fit_transform(data_dict)
print(feat_dict)

# Names of the generated feature columns.
feat_dict_name = dictVec.get_feature_names_out()
print(feat_dict_name)

# Same matrix wrapped in a DataFrame with labelled columns.
dict_data = pd.DataFrame(data=feat_dict, columns=feat_dict_name)
print(dict_data)

[[4. 2. 0.]
 [3. 4. 0.]
 [0. 1. 2.]
 [0. 1. 2.]]
['Blue' 'Red' 'Yellow']
   Blue  Red  Yellow
0   4.0  2.0     0.0
1   3.0  4.0     0.0
2   0.0  1.0     2.0
3   0.0  1.0     2.0


Categorical Data - 누락된 class 값

In [26]:
from sklearn.neighbors import KNeighborsClassifier

# Fully observed rows: column 0 holds the class (0 or 1), columns 1-2 hold numeric features.
x = np.array([
    [0, 2.10, 1.48],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.2, -1.15],
])

# Rows whose class value (column 0) is missing.
x_with_nan = np.array([
    [np.nan, 0.87, 1.33],
    [np.nan, -0.67, -0.22],
])

# Three-neighbour classifier using distance-based vote weighting.
clf = KNeighborsClassifier(n_neighbors=3, weights='distance')


In [27]:
# Fit on the complete rows: columns 1 onward are the features, column 0 is the target.
train_model = clf.fit(x[:, 1:], x[:, 0])

# Predict the missing class for the rows that lack one.
imputed_values = train_model.predict(x_with_nan[:, 1:])

# Put the predicted class back in front of its features, then place those rows
# above the complete rows.
x_with_imputed = np.column_stack((imputed_values, x_with_nan[:, 1:]))
data = np.concatenate((x_with_imputed, x), axis=0)

In [29]:
# Show the combined array: the rows with predicted classes first, then the complete rows.
print(data)

[[ 0.    0.87  1.33]
 [ 1.   -0.67 -0.22]
 [ 0.    2.1   1.48]
 [ 1.    1.18  1.33]
 [ 0.    1.22  1.27]
 [ 1.   -0.2  -1.15]]


In [33]:
from sklearn.impute import SimpleImputer

# Rows with a missing class first, followed by the complete rows.
x_complete = np.concatenate((x_with_nan, x), axis=0)
print(x_complete)

# Fill each missing entry with the most frequent value of its column.
impute = SimpleImputer(strategy='most_frequent')
data_impute = impute.fit_transform(x_complete)
print(data_impute)

[[  nan  0.87  1.33]
 [  nan -0.67 -0.22]
 [ 0.    2.1   1.48]
 [ 1.    1.18  1.33]
 [ 0.    1.22  1.27]
 [ 1.   -0.2  -1.15]]
[[ 0.    0.87  1.33]
 [ 0.   -0.67 -0.22]
 [ 0.    2.1   1.48]
 [ 1.    1.18  1.33]
 [ 0.    1.22  1.27]
 [ 1.   -0.2  -1.15]]
