## 練習時間
參考 Day 12 範例程式，離散化你覺得有興趣的欄位，並嘗試找出有趣的訊息

In [27]:
# Import 需要的套件
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

### 之前做過的處理

In [28]:
# Set the data path and load the training / testing application tables
dir_data = '../Part01/'
f_app_train = os.path.join(dir_data, 'application_train.csv')
f_app_test = os.path.join(dir_data, 'application_test.csv')

app_train = pd.read_csv(f_app_train)
app_test = pd.read_csv(f_app_test)

from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Label-encode every object-typed column of the training data that has
# 2 or fewer unique values; the encoder is fitted on the training column
# only and then applied to both tables.
for col in app_train:
    if app_train[col].dtype == 'object':
        if len(app_train[col].unique()) <= 2:
            # Train on the training data
            le.fit(app_train[col])
            # Transform both training and testing data
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])

            # Keep track of how many columns were label encoded
            le_count += 1

# One-hot encode the remaining categorical columns.
# NOTE(review): train and test are encoded independently, so the two frames
# may end up with different dummy columns -- confirm whether they need to be
# aligned before modelling.
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Flag rows whose DAYS_EMPLOYED holds the anomalous value 365243, then blank
# that value out. The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected Series: the chained in-place form
# is not guaranteed to write through to the DataFrame (it does not under
# pandas Copy-on-Write).
app_train['DAYS_EMPLOYED_ANOM'] = app_train['DAYS_EMPLOYED'] == 365243
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})
# Also apply to the testing dataset
app_test['DAYS_EMPLOYED_ANOM'] = app_test['DAYS_EMPLOYED'] == 365243
app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].replace({365243: np.nan})

# Take the absolute value of DAYS_BIRTH
app_train['DAYS_BIRTH'] = app_train['DAYS_BIRTH'].abs()
app_test['DAYS_BIRTH'] = app_test['DAYS_BIRTH'].abs()

## 離散化AMT_INCOME_TOTAL

In [29]:
# Equal-width binning: cut AMT_INCOME_TOTAL into 8 intervals
app_train['equal_width_INCOME'] = pd.cut(
    x=app_train['AMT_INCOME_TOTAL'],
    bins=8,
)

In [30]:
# Show how many rows fall into each equal-width bin
app_train['equal_width_INCOME'].value_counts()

(-91324.35, 14647443.75]       307509
(102378206.25, 117000000.0]         1
(14647443.75, 29269237.5]           1
(87756412.5, 102378206.25]          0
(73134618.75, 87756412.5]           0
(58512825.0, 73134618.75]           0
(43891031.25, 58512825.0]           0
(29269237.5, 43891031.25]           0
Name: equal_width_INCOME, dtype: int64

In [31]:
# Equal-frequency binning: cut AMT_INCOME_TOTAL into 8 quantile-based intervals
app_train['equal_freq_INCOME'] = pd.qcut(
    x=app_train['AMT_INCOME_TOTAL'],
    q=8,
)

In [32]:
# Show how many rows fall into each quantile-based bin
app_train['equal_freq_INCOME'].value_counts()

(147150.0, 180000.0]       61824
(25649.999, 90000.0]       57922
(112500.0, 135000.0]       48849
(90000.0, 112500.0]        42656
(261000.0, 117000000.0]    38184
(202500.0, 261000.0]       33354
(180000.0, 202500.0]       20389
(135000.0, 147150.0]        4333
Name: equal_freq_INCOME, dtype: int64

In [33]:
# Custom binning: cut AMT_INCOME_TOTAL at hand-picked income boundaries
app_train['customized_INCOME'] = pd.cut(
    x=app_train['AMT_INCOME_TOTAL'],
    bins=[
        25000,
        50000,
        100000,
        150000,
        200000,
        250000,
        300000,
        200000000,
    ],
)

In [34]:
# Show how many rows fall into each custom bin
app_train['customized_INCOME'].value_counts()

(100000, 150000]       91591
(150000, 200000]       64307
(50000, 100000]        59181
(200000, 250000]       48137
(300000, 200000000]    22739
(250000, 300000]       17039
(25000, 50000]          4517
Name: customized_INCOME, dtype: int64

In [35]:
# Inspect the result: raw income next to the three binned columns, first 10 rows
diff = [
    'AMT_INCOME_TOTAL',
    'equal_width_INCOME',
    'equal_freq_INCOME',
    'customized_INCOME',
]
app_train.loc[:, diff].head(n=10)

Unnamed: 0,AMT_INCOME_TOTAL,equal_width_INCOME,equal_freq_INCOME,customized_INCOME
0,202500.0,"(-91324.35, 14647443.75]","(180000.0, 202500.0]","(200000, 250000]"
1,270000.0,"(-91324.35, 14647443.75]","(261000.0, 117000000.0]","(250000, 300000]"
2,67500.0,"(-91324.35, 14647443.75]","(25649.999, 90000.0]","(50000, 100000]"
3,135000.0,"(-91324.35, 14647443.75]","(112500.0, 135000.0]","(100000, 150000]"
4,121500.0,"(-91324.35, 14647443.75]","(112500.0, 135000.0]","(100000, 150000]"
5,99000.0,"(-91324.35, 14647443.75]","(90000.0, 112500.0]","(50000, 100000]"
6,171000.0,"(-91324.35, 14647443.75]","(147150.0, 180000.0]","(150000, 200000]"
7,360000.0,"(-91324.35, 14647443.75]","(261000.0, 117000000.0]","(300000, 200000000]"
8,112500.0,"(-91324.35, 14647443.75]","(90000.0, 112500.0]","(100000, 150000]"
9,135000.0,"(-91324.35, 14647443.75]","(112500.0, 135000.0]","(100000, 150000]"
