## **OneHotEncoding 01**

In [1]:
### 01. 데이터 준비
import pandas as pd
data = { "eng": ["b", "c", "a", "d"] }
df = pd.DataFrame(data)
print(type(df))
df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,eng
0,b
1,c
2,a
3,d


In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
en_x = LabelEncoder()
df['라벨인코딩'] = en_x.fit_transform(df['eng'])
df

Unnamed: 0,eng,라벨인코딩
0,b,1
1,c,2
2,a,0
3,d,3


In [4]:
df['라벨인코딩'].values

array([1, 2, 0, 3])

In [5]:
onehot = OneHotEncoder()
val = df['라벨인코딩'].values.reshape(-1,1) # OneHotEncoder()를 사용을 위한 적합한 값으로 변경.
y = onehot.fit_transform( val ).toarray()    # 값을 변경후, 배열로 만들어준다.
y

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]])

In [6]:
onehot_val = pd.DataFrame(y, dtype=int)
onehot_val

Unnamed: 0,0,1,2,3
0,0,1,0,0
1,0,0,1,0
2,1,0,0,0
3,0,0,0,1


In [7]:
df_new = pd.concat([df, onehot_val], axis=1)
df_new

Unnamed: 0,eng,라벨인코딩,0,1,2,3
0,b,1,0,1,0,0
1,c,2,0,0,1,0
2,a,0,1,0,0,0
3,d,3,0,0,0,1


In [8]:
data = { "회사명": ["MS","Apple", "Google", "Google"]}
df1 = pd.DataFrame(data)
df2 = df1.copy()
df2

Unnamed: 0,회사명
0,MS
1,Apple
2,Google
3,Google


In [9]:
### OneHotEncoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

df1['회사명']

0        MS
1     Apple
2    Google
3    Google
Name: 회사명, dtype: object

In [10]:
### LabelEncoder
encoder_x = LabelEncoder()
df1['lbl_en'] = encoder_x.fit_transform(df1['회사명'])  # 
df1

Unnamed: 0,회사명,lbl_en
0,MS,2
1,Apple,0
2,Google,1
3,Google,1


In [11]:
df1['lbl_en'].values

array([2, 0, 1, 1])

In [12]:
onehot = OneHotEncoder()
y = onehot.fit_transform(df1['lbl_en'].values.reshape(-1,1) ).toarray()
print(y)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [13]:
# 변경된 값을 DataFrame형태로 변경
dx = pd.DataFrame(y, dtype=int)
dx

Unnamed: 0,0,1,2
0,0,0,1
1,1,0,0
2,0,1,0
3,0,1,0


In [14]:
df1_new = pd.concat([df1, dx], axis=1)
df1_new

Unnamed: 0,회사명,lbl_en,0,1,2
0,MS,2,0,0,1
1,Apple,0,1,0,0
2,Google,1,0,1,0
3,Google,1,0,1,0


In [15]:
from tensorflow.keras.utils import to_categorical
import numpy as np

# define example
data = [15,17,5,10,0]
dat = np.array(data)
print(dat)

# one hot encode
encoded = to_categorical(dat)
print(encoded)

[15 17  5 10  0]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [16]:
# invert encoding
inverted = np.argmax(encoded[1])
print(inverted)

17


In [17]:
import pandas as pd
import os

demo_df = pd.DataFrame({"범주형_feature":['양말', '여우', '양말', '상자']})
display(demo_df)

Unnamed: 0,범주형_feature
0,양말
1,여우
2,양말
3,상자


In [18]:
onehot = pd.get_dummies(demo_df)
onehot

Unnamed: 0,범주형_feature_상자,범주형_feature_양말,범주형_feature_여우
0,0,1,0
1,0,0,1
2,0,1,0
3,1,0,0


In [19]:
df = pd.concat([demo_df, onehot], axis=1)
df

Unnamed: 0,범주형_feature,범주형_feature_상자,범주형_feature_양말,범주형_feature_여우
0,양말,0,1,0
1,여우,0,0,1
2,양말,0,1,0
3,상자,1,0,0


## **OneHotEncoding 02**

In [20]:
import mglearn
import pandas as pd
import os

demo_df = pd.DataFrame({"Product":['양말', '여우', '양말', '상자']})
display(demo_df)

Unnamed: 0,Product
0,양말
1,여우
2,양말
3,상자


In [21]:
onehot = pd.get_dummies(demo_df)
onehot

Unnamed: 0,Product_상자,Product_양말,Product_여우
0,0,1,0
1,0,0,1
2,0,1,0
3,1,0,0


In [22]:
df = pd.concat([demo_df, onehot], axis=1)
df

Unnamed: 0,Product,Product_상자,Product_양말,Product_여우
0,양말,0,1,0
1,여우,0,0,1
2,양말,0,1,0
3,상자,1,0,0


In [23]:
path = os.path.join(mglearn.datasets.DATA_PATH, 'adult.data')
print(path)

D:\Anaconda\lib\site-packages\mglearn\data\adult.data


In [24]:
data = pd.read_csv(path,
               header=None, 
               index_col=False,
     names=['age', 'workclass', 'fnlwgt', 'education', 
            'education-num', 'marital-status', 'occupation', 'relationship', 
            'race', 'gender', 'capital-gain', 'capital-loss', 
            'hours-per-week', 'native-country', 'income'])

In [25]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [26]:
sel = ['age', 'workclass','education','gender','hours-per-week',
       'occupation','income']
data = data[sel]
data.head()

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


In [27]:
print(data.gender.value_counts())

 Male      21790
 Female    10771
Name: gender, dtype: int64


In [28]:
print("원본 특성 :\n", list(data.columns), "\n")
data_dummies = pd.get_dummies(data)
print("get_dummies 후 특성 : \n", list(data_dummies.columns))

원본 특성 :
 ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income'] 

get_dummies 후 특성 : 
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation

In [29]:
features = data_dummies.loc[:, "age":"occupation_ Transport-moving"]
X = features.values
y = data_dummies['income_ >50K'].values


print("X.shape : {}, y.shape : {}".format(X.shape, y.shape))

X.shape : (32561, 44), y.shape : (32561,)


In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("테스트 점수 {:.2f}".format(logreg.score(X_test, y_test)))

테스트 점수 0.81


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## **OneHotEncoding 03**

In [31]:
### 01. 데이터 준비
import pandas as pd
data = { "feature1":[2,3,8,4],
         "feature2":[22,32,82,42],
         "target": ["b","c", "a", "d"]
       }
df = pd.DataFrame(data)
df

Unnamed: 0,feature1,feature2,target
0,2,22,b
1,3,32,c
2,8,82,a
3,4,42,d


In [32]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
df['lbl_en'] = label_encoder.fit_transform(df['target'])
df

Unnamed: 0,feature1,feature2,target,lbl_en
0,2,22,b,1
1,3,32,c,2
2,8,82,a,0
3,4,42,d,3


In [33]:
print( len(df) )
print( df['lbl_en'].values.shape )

4
(4,)


In [34]:
train_y = df['lbl_en'].values.reshape(len(df), 1)
print(train_y.shape)
train_y

(4, 1)


array([[1],
       [2],
       [0],
       [3]])

In [35]:
# 원핫 인코딩 수행
onehot_encoder = preprocessing.OneHotEncoder(sparse=False)
train_y_onehot = onehot_encoder.fit_transform(train_y)
print(train_y_onehot)
print(train_y_onehot.shape)

[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]]
(4, 4)


In [36]:
# 원래 df에 원핫 인코딩 한 내용을 열기준으로 붙이기
onehot_val = pd.DataFrame(train_y_onehot, dtype=int)
df_new = pd.concat([df, onehot_val], axis=1)
df_new

Unnamed: 0,feature1,feature2,target,lbl_en,0,1,2,3
0,2,22,b,1,0,1,0,0
1,3,32,c,2,0,0,1,0
2,8,82,a,0,1,0,0,0
3,4,42,d,3,0,0,0,1


In [37]:
# 라이브러리 불러오기 
import numpy as np
from numpy import argmax   # 가장 값이 큰 인덱스 반환
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [38]:
data = ['spring', 'spring', 'summer', 'spring', 'autumn', 
        'autumn', 'winter', 'spring', 'summer', 'autumn']
values = np.array(data)
print(values)

# 라벨 인코딩 수행 - 범주형 문자를 정수로 바꾸기
label_encoder = LabelEncoder()
label_encoded = label_encoder.fit_transform(values)
print(label_encoded)

# 원핫 인코딩 수행 - 범주형 문자를 0,1로 이루어진 벡터로 변경
print( label_encoded.shape ) # 1차원
onehot_encoder = OneHotEncoder(sparse=False)
lbl_encoded = label_encoded.reshape(len(label_encoded), 1)   
print( lbl_encoded.shape ) # 2차원

onehot_encoded = onehot_encoder.fit_transform(lbl_encoded)
print(onehot_encoded)

['spring' 'spring' 'summer' 'spring' 'autumn' 'autumn' 'winter' 'spring'
 'summer' 'autumn']
[1 1 2 1 0 0 3 1 2 0]
(10,)
(10, 1)
[[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]


In [39]:
print( np.unique(values) )
print( onehot_encoded )
print( onehot_encoded[4] )    # 5번째 값 [1,0,0,0]

# 5번째 값 중에 가장 높은 값을 갖는 인덱스 확인 
argmax(onehot_encoded[4, :])  # 5번째 값중에 [1]이 가장 크므로 인덱스 0 반환

['autumn' 'spring' 'summer' 'winter']
[[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]
[1. 0. 0. 0.]


0

In [40]:
# LabelEncoder에 입력하여 역변환 4번째 행의 값을 되돌리기
max_idx = [argmax(onehot_encoded[4, :])]
inverted = label_encoder.inverse_transform(max_idx)    # 만약 max_idx가 1이면 spring이 될 것이다.
print(inverted)

['autumn']


In [41]:
df = pd.DataFrame({"season":data, "lbl_season":label_encoded }, dtype=int)
onehot_val = pd.DataFrame(onehot_encoded, dtype=int)
onehot_val
df_new = pd.concat([df, onehot_val], axis=1)
df_new

Unnamed: 0,season,lbl_season,0,1,2,3
0,spring,1,0,1,0,0
1,spring,1,0,1,0,0
2,summer,2,0,0,1,0
3,spring,1,0,1,0,0
4,autumn,0,1,0,0,0
5,autumn,0,1,0,0,0
6,winter,3,0,0,0,1
7,spring,1,0,1,0,0
8,summer,2,0,0,1,0
9,autumn,0,1,0,0,0


In [42]:
import numpy as np
from numpy import argmax
# define input string
data = 'hello world'
print(data)

hello world


In [43]:
# define universe of possible input values
alphabet = 'abcdefghijklmnopqrstuvwxyz '
# define a mapping of chars to integers
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
int_to_char = dict((i, c) for i, c in enumerate(alphabet))

print("char_to_int : ", char_to_int)
print()
print("int_to_char : ", char_to_int)

char_to_int :  {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, ' ': 26}

int_to_char :  {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, ' ': 26}


In [44]:
# integer encode input data
integer_encoded = [char_to_int[char] for char in data]
print(integer_encoded)

[7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]


In [45]:
# one hot encode
onehot_encoded = list()
for value in integer_encoded:
    letter = [0 for _ in range(len(alphabet))]
    letter[value] = 1
    onehot_encoded.append(letter)
    
print(onehot_encoded)

[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [46]:
# invert encoding
inverted = int_to_char[argmax(onehot_encoded[0])]
print(inverted)

h
