# Use validation data

In [1]:
import os
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
### Set path
dir_data = '../../Lecture_data/'
f_app_train = os.path.join(dir_data, 'application_train.csv')
f_app_test = os.path.join(dir_data, 'application_test.csv')

### Reading data
app_train = pd.read_csv(f_app_train)
app_test = pd.read_csv(f_app_test)

### validation = 15% train

In [9]:
app_validation = app_train.sample(frac = 0.15)
app_train = app_train.drop(app_validation.index)
print(app_validation.shape)
print(app_train.shape)

ans_validation = app_validation['TARGET']

(46127, 122)
(261384, 122)


In [10]:
# label encode for object(str) <=2 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le_count = 0

# check all columns
for col in app_train:
    if app_train[col].dtype == 'object':
        ### find object <= 2 into 0 and 1
        if len(list(app_train[col].unique())) <= 2:
            # Label Encoder
            le.fit(app_train[col])
            app_train[col] = le.transform(app_train[col])
            app_validation[col] = le.transform(app_validation[col])
            app_test[col] = le.transform(app_test[col])
            
            # recording
            le_count += 1
            
# One Hot Encoding            
app_train = pd.get_dummies(app_train)
app_validation = pd.get_dummies(app_validation)
app_test = pd.get_dummies(app_test)

In [11]:
# 受雇日數為異常值的資料, 另外設一個欄位記錄, 並將異常的日數轉成空值 (np.nan)
#app_train['DAYS_EMPLOYED_ANOM'] = (app_train["DAYS_EMPLOYED"] == 365243)
app_train['DAYS_EMPLOYED'].replace(365243, np.nan, inplace = True)
app_validation['DAYS_EMPLOYED'].replace(365243, np.nan, inplace = True)
#app_test['DAYS_EMPLOYED_ANOM'] = (app_test["DAYS_EMPLOYED"] == 365243)
app_test["DAYS_EMPLOYED"].replace(365243, np.nan, inplace = True)


# 出生日數 (DAYS_BIRTH) 取絕對值 
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_validation['DAYS_BIRTH'] = abs(app_validation['DAYS_BIRTH'])
app_test['DAYS_BIRTH'] = abs(app_test['DAYS_BIRTH'])

### 做好前處理
開始擬合模型之前，我們要確保 training & testing data 的欄位數量一致，原因是因為 One hot encoding 會製造多的欄位，有些類別出現在 training data 而沒有出現 testing data 中，我們就要把這些多餘的欄位去除

In [12]:
train_labels = app_train['TARGET']

# 調整欄位數, 移除出現在 training data 而沒有出現 testing data 中的欄位
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)
app_train, app_validation = app_train.align(app_validation, join = 'inner', axis = 1)

print(app_train.shape)
print(app_validation.shape)
print(app_test.shape)

(261384, 239)
(46127, 239)
(48744, 239)


In [57]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# 特徵欄位清單 and copy
train      = app_train.copy()
validation = app_validation.copy()
test       = app_test.copy()
features   = list(train.columns)

# 填補器 :  default=”mean” , 'median', 'most_frequent', 'constant'
#If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data.
#SimpleImputer(missing_values=nan, strategy=’mean’, fill_value=None, verbose=0, copy=True) 不一定要nan
imputer = SimpleImputer(strategy = 'mean')

# 填補器載入個欄中位數
imputer.fit(train)

# 回填 train, test 資料中的空缺值
train      = imputer.transform(train)
validation = imputer.transform(validation)
test       = imputer.transform(test)

# 縮放器 : 設定特徵縮放到 -1 ~ 1 區間 (比 0 ~ 1得到更多分數)
scaler = MinMaxScaler(feature_range = (-1, 1)) 

# scaler載入
scaler.fit(train)

# 縮放器載入 train 的上下限, 對 train, test 進行縮放轉換
train      = scaler.transform(train)
validation = scaler.transform(validation)
test       = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Vaildation data shape: ', validation.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (261384, 239)
Vaildation data shape:  (46127, 239)
Testing data shape:  (48744, 239)


### Use train data to compute any thing. [follow course](http://cs231n.github.io/neural-networks-2/)

### Fit the model

In [58]:
from sklearn.linear_model import LogisticRegression

# 設定模型與模型參數
log_reg = LogisticRegression(C = 0.0001, solver='lbfgs')

# 使用 Train 資料訓練模型
log_reg.fit(train, train_labels)

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

模型 fit 好以後，就可以用來預測 testing data 中的客戶違約遲繳貸款的機率咯! (記得要用 predict_proba 才會輸出機率)

# Validation

In [59]:
log_reg_pred_vaildation = log_reg.predict_proba(validation)[:, 1]

print(log_reg_pred_vaildation)

[0.09478138 0.04687912 0.13150006 ... 0.06424039 0.04537949 0.07971607]


In [60]:
output = {'SK_ID_CURR': app_validation['SK_ID_CURR'], 'Prediction': log_reg_pred_vaildation, 'TARGET': ans_validation}
df_vaildation = pd.DataFrame(output)
print(df_vaildation.head(n = 10))

        SK_ID_CURR  Prediction  TARGET
296779      443839    0.094781       0
299723      447231    0.046879       0
96258       211753    0.131500       0
68421       179361    0.073573       0
70016       181219    0.151296       0
40908       147374    0.080282       0
2815        103285    0.106525       0
53995       162561    0.098802       0
293514      440044    0.070477       0
196916      328320    0.043697       0


In [61]:
# los function
def loss(t, y):
    return 0.5 * ((t - y)**2).sum()

In [62]:
loss_validation = loss(df_vaildation['Prediction'], df_vaildation['TARGET'])

print(loss_validation)

1662.013583343985


# Test

In [63]:
# 用模型預測結果
# 請注意羅吉斯迴歸是分類預測 (會輸出 0 的機率, 與 1 的機率), 而我們只需要留下 1 的機率這排
log_reg_pred = log_reg.predict_proba(test)[:, 1]

### 儲存預測結果

In [64]:
# 計算提交結果
output = {'SK_ID_CURR': app_test['SK_ID_CURR'], 'TARGET': log_reg_pred}
df = pd.DataFrame(output)
print(df.head())
#print(df.tail())

   SK_ID_CURR    TARGET
0      100001  0.058513
1      100005  0.147313
2      100013  0.052191
3      100028  0.063996
4      100038  0.130890


In [65]:
MyFile = '../data/hw_016_test1.csv'
df.to_csv(MyFile, index=None)

# Result

* strategy = 'median' , C = 0.0001, solver='lbfgs' => Score = 0.70579 (loss = 1662.2524)

* strategy = 'most_frequent' , C = 0.0001, solver='lbfgs' => Score = 0.70233 (loss = 1664.5252)

* strategy = 'mean' , C = 0.0001, solver='lbfgs' => Score = 0.70595 (loss = 1662.0135)