#  Import libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from IPython.display import display, HTML

# 表示用の設定を変更します
%matplotlib inline
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 130

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Label Encoding

In [74]:
from sklearn.preprocessing import LabelEncoder 

peng = sns.load_dataset("penguins")

label_encoder = LabelEncoder()

cols = ["sex", "species"]

for col in cols:
    peng.loc[:, col] = label_encoder.fit_transform(peng.loc[:, col])


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,Torgersen,39.1,18.7,181.0,3750.0,1
1,0,Torgersen,39.5,17.4,186.0,3800.0,0
2,0,Torgersen,40.3,18.0,195.0,3250.0,0
3,0,Torgersen,,,,,2
4,0,Torgersen,36.7,19.3,193.0,3450.0,0
...,...,...,...,...,...,...,...
339,2,Biscoe,,,,,2
340,2,Biscoe,46.8,14.3,215.0,4850.0,0
341,2,Biscoe,50.4,15.7,222.0,5750.0,1
342,2,Biscoe,45.2,14.8,212.0,5200.0,0


# One hot encoding

## One hot encoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

peng = sns.load_dataset("penguins")

cat_cols = peng.select_dtypes(include='object').columns.values

# Divide df with categories and df with numerical columns
cat_peng = peng[cat_cols].copy()
num_peng = peng[[col for col in peng.columns.values if col not in cat_cols]]

# Conduct one hot encoding
encoder = OneHotEncoder(handle_unknown="ignore")
cat_peng = encoder.fit_transform(cat_peng)

onehot = pd.DataFrame(cat_peng.toarray(), columns=encoder.get_feature_names(['species', 'island', 'sex']))

# Concatenate categoral columns and numerical columns
peng = pd.concat([onehot, num_peng], axis=1)


## Pandas dummies

In [8]:
peng = sns.load_dataset("penguins")


train, test = peng[10:], peng[:10]

train = pd.get_dummies(train, drop_first=True, dummy_na=True)
test = pd.get_dummies(test, drop_first=True, dummy_na=True)

# align = 整列する。axis=1 is column. Align columns by left join.
test, train = test.align(train, join='left', axis=1)

train.columns.values
test.columns.values

array(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm',
       'body_mass_g', 'species_nan', 'island_nan', 'sex_Male', 'sex_nan'],
      dtype=object)

array(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm',
       'body_mass_g', 'species_nan', 'island_nan', 'sex_Male', 'sex_nan'],
      dtype=object)

# Count encodings

In [34]:
peng = sns.load_dataset("penguins")

train = peng[100:]
test = peng[:100]


# Count encoding Way1 -------------------------------------------------------------------------------------
count_features = ['species', 'island']

for feature in count_features:
    train['count_' + feature] = train[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False))
    test['count_' + feature] = test[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False))


Adelie       152
Gentoo       124
Chinstrap     68
Name: species, dtype: int64

Biscoe       168
Dream        124
Torgersen     52
Name: island, dtype: int64

# Target mean encodings

In [71]:
# Target Mean Encoding ---------------------------------------------------------------------------
train = pd.DataFrame({
    'feature1':[1,2,3], 
    'category1':["A","A","B"],
    'target':[1,0,1]
})
test = pd.DataFrame({'feature1':[1,2,3], 'category1':["A","B","B"],'target':[1,0,1]})


# http://nami3373.hatenablog.com/entry/2018/07/26/230655
train['target_enc'] = train.groupby('category1')['target'].transform('mean')

# Create table to map category to encoded target mean
table = train[["category1", "target_enc"]].drop_duplicates().set_index('category1')["target_enc"]

test['target_enc'] = test['category1'].map(table)

train
test

Unnamed: 0,feature1,category1,target,target_enc
0,1,A,1,0.5
1,2,A,0,0.5
2,3,B,1,1.0


Unnamed: 0,feature1,category1,target,target_enc
0,1,A,1,0.5
1,2,B,0,1.0
2,3,B,1,1.0


# Target encoding with smooth

# Cat encodings