# 第５章 カテゴリ変数に関する特徴量生成

## 初期処理

### Google Driveのマウント

In [1]:
# Mount Google Drive inside the Colab runtime so that later cells can
# reach files stored under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### ライブラリのロード

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_boston

### ファイルの読込

#### 作業用のフォルダへ移動

In [3]:
cd /content/drive/My Drive/Colab Notebooks

/content/drive/My Drive/Colab Notebooks


#### ファイルの読込

In [0]:
# Load the three datasets used in this chapter.
dfTips = sns.load_dataset("tips")
dfTitanic = pd.read_csv("./data/train.csv")

# NOTE(review): load_boston was removed in scikit-learn 1.2, so this line
# needs an older scikit-learn -- confirm the runtime version.
boston = load_boston()

# Build the Boston frame from the feature matrix and append the target as "MEDV".
dfBoston = pd.DataFrame(boston.data, columns=boston.feature_names).assign(MEDV=boston.target)

# Keep untouched copies so each later section can restart from the raw data.
dfTips_back, dfTitanic_back, dfBoston_back = dfTips.copy(), dfTitanic.copy(), dfBoston.copy()

### Category Encodersのインストール・インポート

In [5]:
pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/a0/52/c54191ad3782de633ea3d6ee3bb2837bda0cf3bc97644bb6375cf14150a0/category_encoders-2.1.0-py2.py3-none-any.whl (100kB)
[K     |████████████████████████████████| 102kB 2.2MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.1.0


In [0]:
import category_encoders as ce


## 特徴量生成

### Label Encoding

In [7]:
# Start from the untouched backup (as the later sections do) so this cell can
# be re-run without a KeyError, then drop the columns that are not used in
# the encoding examples.
dfTitanic = dfTitanic_back.copy().drop(columns=["PassengerId","Name","Ticket"])
print( dfTitanic.head(3) )

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
0         0       3    male  22.0      1      0   7.2500   NaN        S
1         1       1  female  38.0      1      0  71.2833   C85        C
2         1       3  female  26.0      0      0   7.9250   NaN        S


In [8]:
ce_oe = ce.OrdinalEncoder(cols=["Sex", "Embarked"],handle_unknown='impute')
dfTitanic = ce_oe.fit_transform( dfTitanic )
print( dfTitanic.head(3) )

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare Cabin  Embarked
0         0       3    1  22.0      1      0   7.2500   NaN         1
1         1       1    2  38.0      1      0  71.2833   C85         2
2         1       3    2  26.0      0      0   7.9250   NaN         1


### Target Encoding

In [9]:
# Restore the raw Titanic data and remove the columns that are not used
# in the target-encoding example.
dfTitanic = (
    dfTitanic_back
    .copy()
    .drop(columns=["PassengerId", "Name", "Ticket"])
)
print(dfTitanic.head(3))

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
0         0       3    male  22.0      1      0   7.2500   NaN        S
1         1       1  female  38.0      1      0  71.2833   C85        C
2         1       3  female  26.0      0      0   7.9250   NaN        S


In [10]:
# Per-category means of the numeric columns, shown for "Embarked" and "Sex";
# the "Survived" column gives the survival rate of each category, which is
# the quantity target encoding is based on.
# numeric_only=True keeps the text columns ("Sex", "Cabin", "Embarked") out of
# the aggregation; without it pandas 2.x raises a TypeError on them.
print( dfTitanic.groupby("Embarked").mean(numeric_only=True) )
print( "-----" )
print( dfTitanic.groupby("Sex").mean(numeric_only=True) )

          Survived    Pclass        Age     SibSp     Parch       Fare
Embarked                                                              
C         0.553571  1.886905  30.814769  0.386905  0.363095  59.954144
Q         0.389610  2.909091  28.089286  0.428571  0.168831  13.276030
S         0.336957  2.350932  29.445397  0.571429  0.413043  27.079812
-----
        Survived    Pclass        Age     SibSp     Parch       Fare
Sex                                                                 
female  0.742038  2.159236  27.915709  0.694268  0.649682  44.479818
male    0.188908  2.389948  30.726645  0.429809  0.235702  25.523893


In [11]:
# Target encoding: "Sex" and "Embarked" are replaced by a per-category
# statistic of the target column "Survived".
# handle_unknown='value' is the category_encoders 2.x option name; the older
# 'impute' spelling is not among the options documented for 2.x.
# NOTE: the encoder is fitted on the same rows it transforms, so the encoded
# values contain information from each row's own target.
ce_te = ce.TargetEncoder(cols=["Sex", "Embarked"], handle_unknown='value')
dfTitanic = ce_te.fit_transform( dfTitanic, dfTitanic["Survived"] )
print( dfTitanic.head(3) )

   Survived  Pclass       Sex   Age  SibSp  Parch     Fare Cabin  Embarked
0         0       3  0.188908  22.0      1      0   7.2500   NaN  0.336957
1         1       1  0.742038  38.0      1      0  71.2833   C85  0.553571
2         1       3  0.742038  26.0      0      0   7.9250   NaN  0.336957


### One-Hot Encoding

In [12]:
# Restore the raw Titanic data and remove the columns that are not used
# in the one-hot-encoding example.
dfTitanic = (
    dfTitanic_back
    .copy()
    .drop(columns=["PassengerId", "Name", "Ticket"])
)
print(dfTitanic.head(3))

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
0         0       3    male  22.0      1      0   7.2500   NaN        S
1         1       1  female  38.0      1      0  71.2833   C85        C
2         1       3  female  26.0      0      0   7.9250   NaN        S


In [13]:
# One-hot encoding: "Sex" and "Embarked" are expanded into one 0/1 indicator
# column per category (Sex_1, Sex_2, Embarked_1, ...).
# handle_unknown='value' is the category_encoders 2.x option name; the older
# 'impute' spelling is not among the options documented for 2.x.
# NOTE(review): if an explicit extra column for unseen categories is wanted,
# 'indicator' is the 2.x option for that -- confirm the intent.
ce_ohe = ce.OneHotEncoder(cols=["Sex", "Embarked"], handle_unknown='value')
dfTitanic = ce_ohe.fit_transform( dfTitanic )
print( dfTitanic.head(3) )

   Survived  Pclass  Sex_1  ...  Embarked_2  Embarked_3  Embarked_4
0         0       3      1  ...           0           0           0
1         1       1      0  ...           1           0           0
2         1       3      0  ...           0           0           0

[3 rows x 13 columns]


### Hash Encoding

In [14]:
# Restore the raw Titanic data and remove the columns that are not used
# in the hash-encoding example.
dfTitanic = (
    dfTitanic_back
    .copy()
    .drop(columns=["PassengerId", "Name", "Ticket"])
)
print(dfTitanic.head(3))

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
0         0       3    male  22.0      1      0   7.2500   NaN        S
1         1       1  female  38.0      1      0  71.2833   C85        C
2         1       3  female  26.0      0      0   7.9250   NaN        S


In [15]:
# Hash encoding: "Sex" and "Embarked" are replaced by hashed feature columns
# (named col_0, col_1, ... in the transformed frame).
ce_he = ce.HashingEncoder(
    cols=["Sex", "Embarked"],
)
dfTitanic = ce_he.fit_transform(dfTitanic)
print(dfTitanic.head(3))

   col_0  col_1  col_2  col_3  col_4  ...   Age  SibSp  Parch     Fare  Cabin
0      0      0      0      0      0  ...  22.0      1      0   7.2500    NaN
1      0      0      0      0      0  ...  38.0      1      0  71.2833    C85
2      0      0      0      0      0  ...  26.0      0      0   7.9250    NaN

[3 rows x 15 columns]
