In [92]:
import pandas as pd
import numpy as np

In [93]:
test = pd.read_csv('test.csv', encoding = 'cp949')
train = pd.read_csv('train.csv', encoding = 'cp949')
sub = pd.read_csv('sample_submission.csv', encoding = 'cp949')


- gender: 성별

- car: 차량 소유 여부

- reality: 부동산 소유 여부

- child_num: 자녀 수

- income_total: 연간 소득

- income_type: 소득 분류
    - ['Commercial associate', 'Working', 'State servant', 'Pensioner', 'Student']

- edu_type: 교육 수준
    - ['Higher education' ,'Secondary / secondary special', 'Incomplete higher', 'Lower secondary', 'Academic degree']

- family_type: 결혼 여부
    - ['Married', 'Civil marriage', 'Separated', 'Single / not married', 'Widow']

- house_type: 생활 방식
    - ['Municipal apartment', 'House / apartment', 'With parents', 'Co-op apartment', 'Rented apartment', 'Office apartment']

- DAYS_BIRTH: 출생일 
    - 데이터 수집 당시 (0)부터 역으로 셈, 즉, -1은 데이터 수집일 하루 전에 태어났음을 의미

- DAYS_EMPLOYED: 업무 시작일
    - 데이터 수집 당시 (0)부터 역으로 셈, 즉, -1은 데이터 수집일 하루 전부터 일을 시작함을 의미 
    - 양수 값은 고용되지 않은 상태를 의미함

- FLAG_MOBIL: 핸드폰 소유 여부

- work_phone: 업무용 전화 소유 여부

- phone: 전화 소유 여부

- email: 이메일 소유 여부

- occyp_type: 직업 유형

- family_size: 가족 규모

- begin_month: 신용카드 발급 월
    - 데이터 수집 당시 (0)부터 역으로 셈, 즉, -1은 데이터 수집일 한 달 전에 신용카드를 발급함을 의미

- feature 
    - category
        - gender
        - car
        - reality
        - income_type
        - edu_type
        - family_type
        - house_type
        - occyp_type
    - numeric
        - child_num
        - DAYS_BIRTH
        - DAYS_EMPLOYED
        - FLAG_MOBIL /
        - work_phone 
        - phone
        - email 
        - family_size /
        - begin_month
- target
    - credit
- drop columns
    - family_size
    - FALG_MOBIL

In [94]:
cat_cols = ['gender','car','reality','income_type','edu_type','family_type','house_type','occyp_type']
num_cols = ['income_total','DAYS_BIRTH','DAYS_EMPLOYED','begin_month']

- fill na

In [95]:
train['occyp_type'] = train['occyp_type'].fillna('no_job')
test['occyp_type'] = test['occyp_type'].fillna('no_job')

In [96]:
train['MONTHS_BIRTH'] = (np.abs(train['DAYS_BIRTH']) / 30 ).astype('int')
train['MONTHS_EMPLOYED'] = (np.abs(train['DAYS_EMPLOYED']) / 30).astype('int')
train['begin_month'] = np.abs(train['begin_month'])

test['MONTHS_BIRTH'] = (np.abs(test['DAYS_BIRTH']) / 30 ).astype('int')
test['MONTHS_EMPLOYED'] = (np.abs(test['DAYS_EMPLOYED']) / 30).astype('int')
test['begin_month'] = np.abs(test['begin_month'])

for col in ['DAYS_BIRTH','DAYS_EMPLOYED']:
    del train[col]
    del test[col]

- drop columns

In [97]:
train = train[train['child_num'] < 8]
train

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit,MONTHS_BIRTH,MONTHS_EMPLOYED
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,1,0,0,0,no_job,2.0,6.0,1.0,463,156
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,1,0,0,1,Laborers,3.0,5.0,1.0,379,51
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,1,0,1,0,Managers,2.0,22.0,2.0,636,147
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,1,0,1,0,Sales staff,2.0,37.0,0.0,502,69
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,1,0,0,0,Managers,2.0,26.0,2.0,501,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,1,0,0,0,Core staff,4.0,2.0,1.0,402,66
26453,26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,1,0,0,0,no_job,2.0,47.0,2.0,509,82
26454,26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,1,0,0,0,Core staff,2.0,25.0,2.0,336,67
26455,26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,1,0,0,0,Laborers,1.0,59.0,2.0,338,3


In [98]:
drop_cols = ['index','FLAG_MOBIL','family_size']
for col in drop_cols:    
    del train[col]
    del test[col]

In [99]:
from sklearn.preprocessing import LabelEncoder

In [100]:
lr_cols = ['gender','car','reality']
dummy_cols = ['income_type','edu_type','family_type','house_type','occyp_type']
lr = LabelEncoder()
for col in lr_cols:
    train[col] = lr.fit_transform(train[col])
    test[col] = lr.fit_transform(test[col])


for col in dummy_cols:
    train = pd.concat([train,pd.get_dummies(train[col])], axis = 1)
    test = pd.concat([test,pd.get_dummies(test[col])], axis = 1)
    del train[col]
    del test[col]

In [101]:
test.head()

Unnamed: 0,gender,car,reality,child_num,income_total,work_phone,phone,email,begin_month,MONTHS_BIRTH,...,Low-skill Laborers,Managers,Medicine staff,Private service staff,Realty agents,Sales staff,Secretaries,Security staff,Waiters/barmen staff,no_job
0,1,1,0,0,112500.0,0,1,0,60.0,733,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,135000.0,0,1,0,36.0,632,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,69372.0,1,1,0,40.0,529,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,112500.0,1,0,0,41.0,642,...,0,0,0,0,0,0,0,0,0,0
4,0,1,1,0,225000.0,1,0,0,8.0,594,...,0,1,0,0,0,0,0,0,0,0


In [102]:
train['income_total'] = np.log1p(train['income_total']).astype('int')

test['income_total'] = np.log1p(test['income_total']).astype('int')

In [103]:
from lightgbm import LGBMClassifier

In [104]:
feature_cols = train.columns.difference(['credit'])
x_train = train[feature_cols]
y_train = train['credit']
x_test = test
x_train.shape, y_train.shape, x_test.shape

((26453, 51), (26453,), (10000, 51))

In [110]:
lgbm = LGBMClassifier()
lgbm.fit(x_train, y_train)
y_pred = pd.Series(lgbm.predict(x_test).astype('int'))
y_pred.value_counts()
# pd.Series(y_pred).value_counts()
# sub = pd.concat([sub['index'],y_pred], axis = 1)
# sub.to_csv('result.csv', index = False)

1    9822
0     178
dtype: int64

In [111]:
from sklearn.ensemble import RandomForestClassifier

In [114]:
rf = RandomForestClassifier().fit(x_train, y_train)
y_pred = rf.predict(x_test).astype('int')
pd.Series(y_pred).value_counts()

1    9995
0       5
dtype: int64

In [138]:
sub = pd.concat([sub['index'],pd.get_dummies(y_pred)] , axis = 1)

In [115]:
y_train.value_counts()

2.0    16964
1.0     6267
0.0     3222
Name: credit, dtype: int64