## 練習時間
參考 Day 12 範例程式，離散化你覺得有興趣的欄位，並嘗試找出有趣的訊息

In [1]:
# Import 需要的套件
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

### 之前做過的處理

In [87]:
# 設定 data_path
dir_data = '../../data/'
f_app_train = os.path.join(dir_data, 'application_train.csv')
f_app_test = os.path.join(dir_data, 'application_test.csv')

app_train = pd.read_csv(f_app_train)
app_test = pd.read_csv(f_app_test)

from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in app_train:
    if app_train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(app_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(app_train[col])
            # Transform both training and testing data
            # le.transform() ==> Transform Categories Into Integers (把分類轉為數字)
            app_train[col] = le.transform(app_train[col]) 
            app_test[col] = le.transform(app_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
# pd.get_dummies() ==> Using the get_dummies will create a new column for every unique string in a certain column      
# 使用 get_dummies 進行 one-hot 编码 (https://blog.csdn.net/lujiandong1/article/details/52836051)
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Create an anomalous(異常的) flag column
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
# also apply to testing dataset
app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
app_test["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace = True)

# absolute the value of DAYS_BIRTH
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_test['DAYS_BIRTH'] = abs(app_test['DAYS_BIRTH'])

In [89]:
print('[YEARS_BIRTH]')
app_train['YEARS_BIRTH'] = app_train.loc[:,'DAYS_BIRTH'] / 365
# print(app_train.groupby('YEARS_BIRTH').size()) # between 20 to 70

print('')
print ('**** By equal')
print(pd.cut(app_train['YEARS_BIRTH'], 20).value_counts().sort_index())  

print('')
print ('**** By freq')
print(pd.qcut(app_train['YEARS_BIRTH'], 20).value_counts().sort_index()) 

print('')
print ('**** By customized')
print(pd.cut(app_train['YEARS_BIRTH'], [20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70]).value_counts().sort_index())  

print('')
print('Large of ages gaither between 25 ~ 65')

[YEARS_BIRTH]

**** By equal
(20.469, 22.948]     4053
(22.948, 25.378]     9626
(25.378, 27.808]    14032
(27.808, 30.238]    19095
(30.238, 32.668]    19730
(32.668, 35.099]    18700
(35.099, 37.529]    20221
(37.529, 39.959]    21537
(39.959, 42.389]    20532
(42.389, 44.819]    19818
(44.819, 47.249]    17660
(47.249, 49.679]    16651
(49.679, 52.11]     16059
(52.11, 54.54]      17485
(54.54, 56.97]      16830
(56.97, 59.4]       15820
(59.4, 61.83]       14856
(61.83, 64.26]      12829
(64.26, 66.69]       8524
(66.69, 69.121]      3453
Name: YEARS_BIRTH, dtype: int64

**** By freq
(20.517, 25.773]    15381
(25.773, 28.175]    15371
(28.175, 30.151]    15391
(30.151, 32.038]    15369
(32.038, 34.008]    15372
(34.008, 36.0]      15373
(36.0, 37.764]      15375
(37.764, 39.496]    15384
(39.496, 41.29]     15381
(41.29, 43.151]     15364
(43.151, 45.101]    15391
(45.101, 47.178]    15373
(47.178, 49.416]    15373
(49.416, 51.74]     15360
(51.74, 53.923]     15390
(53.923, 56.093

In [91]:
print('[OWN_CAR_AGE] ==> Age of client\'s car')
# print(app_train.groupby('OWN_CAR_AGE').size()) 

# 去掉異常值
app_train['OWN_CAR_AGE_ANOM'] = app_train["OWN_CAR_AGE"] == 91
app_train['OWN_CAR_AGE'].replace({91: np.nan}, inplace = True)
# print(app_train.groupby('OWN_CAR_AGE').size()) 

print('')
print ('**** By equal')
print(pd.cut(app_train['OWN_CAR_AGE'], 5).value_counts().sort_index())  

print('')
print ('**** By freq')
print(pd.qcut(app_train['OWN_CAR_AGE'], 5).value_counts().sort_index()) 

print('')
print ('**** By customized')
print(pd.cut(app_train['OWN_CAR_AGE'], [0, 5, 10, 15, 20, 30, 40, 60, 70]).value_counts().sort_index())  

print('')
print('Almost large part of quantities appear before 20')

[OWN_CAR_AGE] ==> Age of client's car

**** By equal
(-0.069, 13.8]    71291
(13.8, 27.6]      27094
(27.6, 41.4]       2729
(41.4, 55.2]        127
(55.2, 69.0]       3339
Name: OWN_CAR_AGE, dtype: int64

**** By freq
(-0.001, 4.0]    25193
(4.0, 7.0]       17401
(7.0, 12.0]      24131
(12.0, 17.0]     18994
(17.0, 69.0]     18861
Name: OWN_CAR_AGE, dtype: int64

**** By customized
(0, 5]      26654
(5, 10]     29519
(10, 15]    21158
(15, 20]    12063
(20, 30]     8122
(30, 40]     1406
(40, 60]      187
(60, 70]     3337
Name: OWN_CAR_AGE, dtype: int64

Almost large part of quantities appear before 20


In [117]:
print('[CNT_FAM_MEMBERS] ==> How many family members does client have')
# print(app_train.groupby('CNT_FAM_MEMBERS').size()) 

# 去掉異常值
app_train['CNT_FAM_MEMBERS_ANOM'] = app_train["CNT_FAM_MEMBERS"] == 20
app_train['CNT_FAM_MEMBERS'].replace({20: np.nan}, inplace = True)
# print(app_train.groupby('CNT_FAM_MEMBERS').size()) 


print('')
print ('**** By equal')
print(pd.cut(app_train['CNT_FAM_MEMBERS'], 7).value_counts().sort_index())  

print('')
print ('**** By freq')
print(pd.qcut(app_train['CNT_FAM_MEMBERS'], 2).value_counts().sort_index()) 

print('')
print ('**** By customized')
print(pd.cut(app_train['CNT_FAM_MEMBERS'], [0, 3, 4, 5, 16]).value_counts().sort_index())  

print('')
print('Almost large part of quantities appear before 3')

[CNT_FAM_MEMBERS] ==> How many family members does client have

**** By equal
(0.985, 3.143]      278805
(3.143, 5.286]       28175
(5.286, 7.429]         489
(7.429, 9.571]          26
(9.571, 11.714]          4
(11.714, 13.857]         3
(13.857, 16.0]           5
Name: CNT_FAM_MEMBERS, dtype: int64

**** By freq
(0.999, 2.0]    226204
(2.0, 16.0]      81303
Name: CNT_FAM_MEMBERS, dtype: int64

**** By customized
(0, 3]     278805
(3, 4]      24697
(4, 5]       3478
(5, 16]       527
Name: CNT_FAM_MEMBERS, dtype: int64

Almost large part of quantities appear before 3
