In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### pre. 데이터 추가하기 (ADD DATA -> amex-default-prediction)
* 30G인 원본 데이터는 불러와서 사용할 수가 없다.
* Parquet format을 사용해서 5G인 데이터로 용량을 줄여 고용량 데이터를 다뤄보았다.


#### 빅데이터 파일 형식 : Avro / Parquet / ORC
* 세 가지 형식 모두 압축을 제공하고, 기계가 읽을 수 있는 바이너리 형식이다.
* Parquet와 ORC는 열에 데이터를 저장, 읽기가 많은 분석 워크로드에 최적화 되어 있고,
* Avro는 행 기반 데이터 저장 방식으로 쓰기가 많은 트랜잭션 워크로드에 적합하다.

### 01. 라이브러리 불러오기

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.preprocessing import LabelEncoder,MinMaxScaler     #라벨 인코더, 정규화방법
from sklearn.model_selection import train_test_split            #데이터 나누기
from sklearn.metrics import classification_report,confusion_matrix

In [4]:
import xgboost as xgb                  #그레디언부스팅 발전된 형태
from xgboost import XGBClassifier

import warnings, gc
warnings.filterwarnings("ignore")      #워닝메시지 숨기기

### 02. 데이터 불러오기

In [5]:
%%time
# Read the training statements (parquet) and the per-customer labels (CSV).
train = pd.read_parquet(
    "../input/amex-data-integer-dtypes-parquet-format/train.parquet"
)
label = pd.read_csv(
    "../input/amex-default-prediction/train_labels.csv"
)

customer_ID를 기준으로 train에 label을 병합(inner join)하여, 각 고객의 모든 행에 target 값을 붙인다.

In [6]:
# Attach the label columns to every training row of the same customer.
# An inner join keeps only customers present in both frames.
train = pd.merge(train, label, on="customer_ID", how="inner")

### 03. TRAIN 데이터셋 customer_ID로 라벨 인코딩

In [7]:
# Replace the customer_ID values with integer codes produced by LabelEncoder.
lab = LabelEncoder()
train['customer_ID'] = lab.fit_transform(train['customer_ID'].to_numpy())

* train.shape = 5531451, 191

In [8]:
%%time
# Keep only the last row of each customer (rows stay in their original order)
# and use customer_ID as the index.
# NOTE(review): after this step each customer has a single row, so later
# per-customer aggregations (mean/min/last/...) all see one value.
train = (
    train
    .drop_duplicates(subset='customer_ID', keep='last')
    .set_index('customer_ID')
)

### 04. TEST 데이터 셋 불러오기

In [9]:
%%time
# Read the test statements from the parquet file.
test = pd.read_parquet(
    "../input/amex-data-integer-dtypes-parquet-format/test.parquet"
)

### 05. TEST 데이터 셋도 customer_ID로 라벨 인코딩

In [10]:
# Re-fit the same encoder on the test IDs and replace them with integer codes.
# After this line `lab` maps test customers, no longer the training customers.
test['customer_ID'] = lab.fit_transform(test['customer_ID'].to_numpy())

In [11]:
# Keep only the last row of each test customer (original row order preserved)
# and index the frame by customer_ID.
is_last_row = ~test['customer_ID'].duplicated(keep='last')
test = test[is_last_row].set_index('customer_ID')

### 06. 데이터 나누기 및 결측치 처리

In [12]:
# Target vector: the `target` column of the merged training frame.
y = train["target"]

# Feature frames: drop the target and the S_2 column, then replace every
# missing value with the sentinel -123.
X = train.drop(columns=["target", "S_2"]).fillna(-123)
test = test.drop(columns=["S_2"]).fillna(-123)

In [13]:
# Show how many rows fall into each target class (class balance check).
y.value_counts()

* 약 34만 명은 채무불이행(default)이 아닌 고객이고, 약 11만 명은 채무불이행 고객이다.

In [14]:
# Columns treated as categorical; every other feature column is numeric.
cat_cols = [
    'B_30', 'B_38',
    'D_63', 'D_64', 'D_66', 'D_68',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
]

# Numeric columns, in the same order as they appear in X.
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

# NOTE(review): this is a list of two lists, not one flat list of names.
all_cols = [cat_cols, num_cols]

In [15]:
# Split the numeric columns by the first letter of the column name.
D_n_cols, S_n_cols, P_n_cols, B_n_cols, R_n_cols = (
    [name for name in num_cols if name.startswith(prefix)]
    for prefix in ("D", "S", "P", "B", "R")
)

# Same split for the categorical columns (only D and B prefixes are used).
D_c_cols, B_c_cols = (
    [name for name in cat_cols if name.startswith(prefix)]
    for prefix in ("D", "B")
)

In [16]:
# Number of columns in each group: numeric D, S, P, B, R, then categorical D, B.
tuple(
    len(group)
    for group in (D_n_cols, S_n_cols, P_n_cols, B_n_cols, R_n_cols, D_c_cols, B_c_cols)
)

### 07. 변수별 컬럼명 확인 및 새로운 변수 생성

In [17]:
%%time 
# Per-customer aggregation plan: (columns, statistics) for each column group.
# The resulting columns are named "<column>_<statistic>" and concatenated in
# the order listed here.
# NOTE(review): train was reduced to one row per customer earlier (tail(1)),
# so each group holds a single row and mean/min/max/first/last coincide.
# Aggregate before that reduction if history-based features are intended.
train_agg_plan = [
    (D_n_cols, ['mean', 'min', 'last']),
    (S_n_cols, ['mean', 'min', 'last']),
    (P_n_cols, ['mean', 'min', 'max', 'last']),
    (B_n_cols, ['mean', 'min', 'last']),
    (R_n_cols, ['mean', 'min', 'last']),
    (D_c_cols, ['count', 'last', 'first', 'nunique']),
    (B_c_cols, ['count', 'last', 'nunique']),
]

train_parts = []
for group_cols, stats in train_agg_plan:
    aggregated = X.groupby("customer_ID")[group_cols].agg(stats)
    # Flatten the (column, statistic) MultiIndex into single names.
    aggregated.columns = ['_'.join(pair) for pair in aggregated.columns]
    train_parts.append(aggregated)

X = pd.concat(train_parts, axis=1)

# Release the intermediate frames before moving on.
del train_parts, aggregated
_ = gc.collect()

print('X shape after engineering', X.shape)

* 190개의 새로운 컬럼이 생성되어 576개가 되었다.

In [18]:
%%time 
# Per-customer aggregation plan for the test set: (columns, statistics).
# The statistics and their order must match the training aggregation exactly,
# so that the test feature columns line up one-to-one with the columns the
# model is trained on. The categorical D group previously used
# ['count', 'first', 'last', 'nunique'], which swapped the positions of the
# "first"/"last" columns relative to training.
test_agg_plan = [
    (D_n_cols, ['mean', 'min', 'last']),
    (S_n_cols, ['mean', 'min', 'last']),
    (P_n_cols, ['mean', 'min', 'max', 'last']),
    (B_n_cols, ['mean', 'min', 'last']),
    (R_n_cols, ['mean', 'min', 'last']),
    (D_c_cols, ['count', 'last', 'first', 'nunique']),
    (B_c_cols, ['count', 'last', 'nunique']),
]

test_parts = []
for group_cols, stats in test_agg_plan:
    aggregated = test.groupby("customer_ID")[group_cols].agg(stats)
    # Flatten the (column, statistic) MultiIndex into "<column>_<statistic>".
    aggregated.columns = ['_'.join(pair) for pair in aggregated.columns]
    test_parts.append(aggregated)

test = pd.concat(test_parts, axis=1)

# Release the intermediate frames before moving on.
del test_parts, aggregated
_ = gc.collect()

# Fail loudly if train and test features ever drift apart again.
if list(test.columns) != list(X.columns):
    raise ValueError("test feature columns do not match the training feature columns")

print('Test shape after engineering', test.shape)

In [19]:
# Keyword arguments passed to XGBClassifier(**xgb_parms).
# NOTE(review): the long decimal values look like the output of an automated
# hyper-parameter search — confirm their origin before hand-editing them.
xgb_parms = {
    # booster and run size
    'booster': 'dart',
    'n_estimators': 1000,
    'n_jobs': 4,
    # learning rate and tree shape
    'eta': 0.3749337530972536,
    'max_depth': 9,
    'min_child_weight': 5,
    'gamma': 0.0745370910451703,
    'grow_policy': 'depthwise',
    # row / column sampling
    'subsample': 0.6423675532438815,
    'colsample_bytree': 0.7830450413657872,
    # regularization
    'lambda': 4.091409953463271e-08,
    'alpha': 3.6353429991712695e-08,
    # dropout settings used with the dart booster
    'sample_type': 'uniform',
    'normalize_type': 'tree',
    'rate_drop': 0.0723975209176045,
    'skip_drop': 0.9026367296518939,
}

### 08. 데이터 나누기 및 학습, 평가

In [20]:
# X comes out of a groupby (ordered by customer_ID), while y keeps the row
# order of the training frame. train_test_split pairs rows by position, so
# align the labels to the feature index first; .loc raises if a customer in X
# has no label.
y = y.loc[X.index]

# Stratified 75/25 hold-out split. A fixed random_state makes the split, and
# therefore the validation metrics, reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [21]:
# Build the classifier from xgb_parms and fit it on the training split.
# The validation split is passed as eval_set and early_stopping_rounds=10 is
# requested; verbose=1 prints the evaluation log during training.
# NOTE(review): recent xgboost releases deprecate passing
# early_stopping_rounds to fit() in favour of the constructor — confirm
# against the installed xgboost version before upgrading.
# NOTE(review): the same validation split drives early stopping and is later
# used for evaluation, so validation scores may be optimistic.
my_model = XGBClassifier(**xgb_parms)
my_model.fit(X_train, y_train, 
             early_stopping_rounds=10, 
             eval_set=[(X_valid, y_valid)],
             verbose=1)

In [22]:
# Predict class labels for the validation split and evaluate them.
# (The model is already trained at this point; this cell only scores it.)
pred_val = my_model.predict(X_valid)

# Per-class precision / recall / F1, followed by the confusion matrix.
print(classification_report(y_valid, pred_val))
confusion_matrix(y_valid, pred_val)

In [23]:
# Build the values for the submission: class probabilities for every test
# customer, from which column 1 is kept as the prediction.
pred_test = my_model.predict_proba(test)
preds = pd.DataFrame(pred_test)
pred_final = preds.loc[:, 1].to_numpy(copy=True)
pred_final

In [24]:
# Load the sample submission file to use as the output template.
submission = pd.read_csv(
    "../input/amex-default-prediction/sample_submission.csv"
)

In [25]:
# Match predictions to submission rows by customer ID instead of by position.
# pred_final follows the row order of `test`, whose index holds the encoded
# IDs; `lab` was last fitted on the test IDs, so inverse_transform recovers
# the original customer_ID strings.
# NOTE(review): assumes the sample submission has a `customer_ID` column.
test_customer_ids = lab.inverse_transform(test.index)
prediction_by_customer = pd.Series(pred_final, index=test_customer_ids)
submission['prediction'] = submission['customer_ID'].map(prediction_by_customer)

# Every submission row must have received a prediction.
if submission['prediction'].isna().any():
    raise ValueError("some customers in the submission file have no prediction")

submission

In [26]:
# Write the submission to the working directory without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)

### AMEX 제출 SCORE : 0.764