In [37]:
import os
import warnings
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings('ignore')

In [2]:
# Locations of the competition CSV files, relative to the notebook's working directory.
train_path = './data/train.csv'
test_path = './data/test.csv'
submission_path = './data/sample_submission.csv'

In [3]:
# Load the three CSV files into DataFrames in one pass: train, test, sample submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, submission_path)
)

In [4]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [5]:
# Preview the first rows of the test table.
test.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month
0,26457,M,Y,N,0,112500.0,Pensioner,Secondary / secondary special,Civil marriage,House / apartment,-21990,365243,1,0,1,0,,2.0,-60.0
1,26458,F,N,Y,0,135000.0,State servant,Higher education,Married,House / apartment,-18964,-8671,1,0,1,0,Core staff,2.0,-36.0
2,26459,F,N,Y,0,69372.0,Working,Secondary / secondary special,Married,House / apartment,-15887,-217,1,1,1,0,Laborers,2.0,-40.0
3,26460,M,Y,N,0,112500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-19270,-2531,1,1,0,0,Drivers,2.0,-41.0
4,26461,F,Y,Y,0,225000.0,State servant,Higher education,Married,House / apartment,-17822,-9385,1,1,0,0,Managers,2.0,-8.0


In [6]:
# Preview the sample submission layout.
submission.head()

Unnamed: 0,index,0,1,2
0,26457,0,0,0
1,26458,0,0,0
2,26459,0,0,0
3,26460,0,0,0
4,26461,0,0,0


In [7]:
# Compare column sets: which columns exist in train but not in test?
set(train.columns).difference(test.columns)

{'credit'}

In [8]:
# Pull out the 'credit' column of train as the prediction target.
y_train = train['credit']

In [9]:
# Merge train and test so both go through the same preprocessing.
# The target column is removed from train first; rows get a fresh 0..n-1 index.
train_features = train.drop(columns='credit')
data = pd.concat([train_features, test], ignore_index=True)

In [10]:
# Show the shapes of train, test and the merged table, in that order.
for frame in (train, test, data):
    print(frame.shape)

(26457, 20)
(10000, 19)
(36457, 19)


In [11]:
# Check the last rows of the merged table.
data.tail()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month
36452,36452,F,Y,Y,0,202500.0,Working,Incomplete higher,Married,House / apartment,-18593,-5434,1,1,1,0,Accountants,2.0,-19.0
36453,36453,M,Y,Y,0,202500.0,Working,Secondary / secondary special,Civil marriage,House / apartment,-10886,-1315,1,1,0,0,Laborers,2.0,-34.0
36454,36454,F,N,Y,0,292500.0,Working,Secondary / secondary special,Married,House / apartment,-21016,-14018,1,0,0,0,Medicine staff,2.0,-55.0
36455,36455,F,Y,N,0,180000.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-16541,-1085,1,0,1,0,,2.0,-33.0
36456,36456,F,N,Y,0,270000.0,Working,Higher education,Married,House / apartment,-9154,-187,1,0,0,1,Laborers,2.0,-11.0


In [12]:
# The "index" column is not needed as a feature, so remove it.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
data = data.drop(columns=["index"], errors="ignore")
data.head()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0


In [13]:
# Count missing values per column.
data.isna().sum()

gender               0
car                  0
reality              0
child_num            0
income_total         0
income_type          0
edu_type             0
family_type          0
house_type           0
DAYS_BIRTH           0
DAYS_EMPLOYED        0
FLAG_MOBIL           0
work_phone           0
phone                0
email                0
occyp_type       11323
family_size          0
begin_month          0
dtype: int64

In [14]:
# Treat a missing occupation as its own category, labelled 'None'.
occupation_filled = data['occyp_type'].fillna('None')
data = data.assign(occyp_type=occupation_filled)

In [15]:
# Re-count missing values per column after filling occyp_type.
data.isnull().sum()

gender           0
car              0
reality          0
child_num        0
income_total     0
income_type      0
edu_type         0
family_type      0
house_type       0
DAYS_BIRTH       0
DAYS_EMPLOYED    0
FLAG_MOBIL       0
work_phone       0
phone            0
email            0
occyp_type       0
family_size      0
begin_month      0
dtype: int64

In [16]:
# Report the number of distinct values in every column.
for column_name in data.columns:
    distinct_count = data[column_name].nunique()
    print(f"column : {column_name}\nThe number of unique : {distinct_count}\n")

column : gender
The number of unique : 2

column : car
The number of unique : 2

column : reality
The number of unique : 2

column : child_num
The number of unique : 9

column : income_total
The number of unique : 265

column : income_type
The number of unique : 5

column : edu_type
The number of unique : 5

column : family_type
The number of unique : 5

column : house_type
The number of unique : 6

column : DAYS_BIRTH
The number of unique : 7183

column : DAYS_EMPLOYED
The number of unique : 3640

column : FLAG_MOBIL
The number of unique : 1

column : work_phone
The number of unique : 2

column : phone
The number of unique : 2

column : email
The number of unique : 2

column : occyp_type
The number of unique : 19

column : family_size
The number of unique : 10

column : begin_month
The number of unique : 61



In [17]:
# Report the distinct values themselves for every column.
for column_name in data.columns:
    distinct_values = data[column_name].unique()
    print(f"column : {column_name}\nunique : {distinct_values}\n")

column : gender
unique : ['F' 'M']

column : car
unique : ['N' 'Y']

column : reality
unique : ['N' 'Y']

column : child_num
unique : [ 0  1  2  3  4  5 14 19  7]

column : income_total
unique : [ 202500.   247500.   450000.   157500.   270000.   315000.   180000.
  103500.   193500.   135000.   216000.   225000.   405000.   112500.
  121500.   585000.    90900.    99000.    67500.   292500.    90000.
  495000.    54000.   306000.   130500.   144000.   360000.    40500.
   81000.   351000.   184500.    45000.    79650.   153000.   126000.
  382500.   283500.   175500.   297000.   387000.   337500.   256500.
   72000.   540000.   155250.   189000.   139500.   459000.   720000.
  252000.   171000.   427500.    78750.    94500.   562500.   238500.
  630000.    76500.   301500.   229500.   148500.   166500.   463500.
  432000.   117000.   787500.   234000.   108000.   279000.   900000.
  378000.    31500.    29250.   211500.   207000.    49500.    65250.
  265500.   261000.   162000.   310

In [18]:
# Inspect the DAYS_BIRTH column on its own.
data['DAYS_BIRTH']

0       -13899
1       -11380
2       -19087
3       -15088
4       -15037
         ...  
36452   -18593
36453   -10886
36454   -21016
36455   -16541
36456    -9154
Name: DAYS_BIRTH, Length: 36457, dtype: int64

In [19]:
# One-hot encode the categorical columns; the encoded columns are expected to hold strings.
categorical_cols = [
    'gender',
    'car',
    'reality',
    'income_type',
    'edu_type',
    'family_type',
    'house_type',
    'occyp_type',
]
data = pd.get_dummies(data, columns=categorical_cols)

In [20]:
# Shape of the table after one-hot encoding.
data.shape

(36457, 56)

In [21]:
# Preview the encoded table.
data.head()

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_None,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff
0,0,202500.0,-13899,-4709,1,0,0,0,2.0,-6.0,...,0,0,0,1,0,0,0,0,0,0
1,1,247500.0,-11380,-1540,1,0,0,1,3.0,-5.0,...,0,0,0,0,0,0,0,0,0,0
2,0,450000.0,-19087,-4434,1,0,1,0,2.0,-22.0,...,0,1,0,0,0,0,0,0,0,0
3,0,202500.0,-15088,-2092,1,0,1,0,2.0,-37.0,...,0,0,0,0,0,0,1,0,0,0
4,0,157500.0,-15037,-2105,1,0,0,0,2.0,-26.0,...,0,1,0,0,0,0,0,0,0,0


In [22]:
# Undo the merge: the first len(train) rows are training rows, the rest are test rows.
n_train = len(train)
X_train = data.iloc[:n_train]
X_test = data.iloc[n_train:].reset_index(drop=True)

In [23]:
# Show the shapes of the two feature matrices, train first.
for features in (X_train, X_test):
    print(features.shape)

(26457, 56)
(10000, 56)


In [24]:
# Check the last rows of the training features.
X_train.tail()

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_None,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff
26452,2,225000.0,-12079,-1984,1,0,0,0,4.0,-2.0,...,0,0,0,0,0,0,0,0,0,0
26453,1,180000.0,-15291,-2475,1,0,0,0,2.0,-47.0,...,0,0,0,1,0,0,0,0,0,0
26454,0,292500.0,-10082,-2015,1,0,0,0,2.0,-25.0,...,0,0,0,0,0,0,0,0,0,0
26455,0,171000.0,-10145,-107,1,0,0,0,1.0,-59.0,...,0,0,0,0,0,0,0,0,0,0
26456,0,81000.0,-19569,-1013,1,0,0,0,2.0,-9.0,...,0,0,0,0,0,0,0,0,1,0


In [25]:
# Check the last rows of the test features (index restarted at 0).
X_test.tail()

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,occyp_type_Low-skill Laborers,occyp_type_Managers,occyp_type_Medicine staff,occyp_type_None,occyp_type_Private service staff,occyp_type_Realty agents,occyp_type_Sales staff,occyp_type_Secretaries,occyp_type_Security staff,occyp_type_Waiters/barmen staff
9995,0,202500.0,-18593,-5434,1,1,1,0,2.0,-19.0,...,0,0,0,0,0,0,0,0,0,0
9996,0,202500.0,-10886,-1315,1,1,0,0,2.0,-34.0,...,0,0,0,0,0,0,0,0,0,0
9997,0,292500.0,-21016,-14018,1,0,0,0,2.0,-55.0,...,0,0,1,0,0,0,0,0,0,0
9998,0,180000.0,-16541,-1085,1,0,1,0,2.0,-33.0,...,0,0,0,1,0,0,0,0,0,0
9999,0,270000.0,-9154,-187,1,0,0,1,2.0,-11.0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Keyword arguments for the random forest: number of trees and a fixed seed.
params = dict(
    n_estimators=500,
    random_state=42,
)

In [39]:
# Build the classifier from the keyword arguments in `params`.
rf = RandomForestClassifier(**params)

In [46]:
# Keep the target as discrete class labels. The previous np.log1p(y_train)
# turned the labels into non-integer floats, which a classifier's fit rejects
# as a continuous target on a top-to-bottom run, and every re-run of the cell
# transformed y_train again. Casting to int is idempotent.
y_train = y_train.astype(int)

In [40]:
# Train the forest on the training features and their credit labels.
rf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500, random_state=42)

In [41]:
# Per-class probabilities for every test row (one column per class).
pred = rf.predict_proba(X_test)

In [42]:
# Inspect the predicted probability array.
pred

array([[0.028     , 0.194     , 0.778     ],
       [0.564     , 0.18      , 0.256     ],
       [0.026     , 0.0525    , 0.9215    ],
       ...,
       [0.004     , 0.036     , 0.96      ],
       [0.596     , 0.264     , 0.14      ],
       [0.068     , 0.33910476, 0.59289524]])

In [43]:
# Replace the probability columns (everything after the first column) as whole
# columns. The sample submission holds integer zeros there; writing floats into
# them in place through .iloc relies on silent upcasting that pandas has
# deprecated, while column replacement simply adopts the float dtype.
proba_cols = list(submission.columns[1:])
assert pred.shape == (len(submission), len(proba_cols)), "pred does not match the submission layout"
submission[proba_cols] = pred

In [44]:
# Inspect the filled-in submission table.
submission

Unnamed: 0,index,0,1,2
0,26457,0.028000,0.194000,0.778000
1,26458,0.564000,0.180000,0.256000
2,26459,0.026000,0.052500,0.921500
3,26460,0.026600,0.028000,0.945400
4,26461,0.076500,0.208750,0.714750
...,...,...,...,...
9995,36452,0.063833,0.359100,0.577067
9996,36453,0.218000,0.624200,0.157800
9997,36454,0.004000,0.036000,0.960000
9998,36455,0.596000,0.264000,0.140000


In [45]:
# Save the submission as a CSV file, without writing the DataFrame's row index.
submission.to_csv("./data/submission_baseline_rf.csv", index=False)

In [46]:
# Read the saved file back to confirm what was written.
pd.read_csv('./data/submission_baseline_rf.csv')

Unnamed: 0,index,0,1,2
0,26457,0.028000,0.194000,0.778000
1,26458,0.564000,0.180000,0.256000
2,26459,0.026000,0.052500,0.921500
3,26460,0.026600,0.028000,0.945400
4,26461,0.076500,0.208750,0.714750
...,...,...,...,...
9995,36452,0.063833,0.359100,0.577067
9996,36453,0.218000,0.624200,0.157800
9997,36454,0.004000,0.036000,0.960000
9998,36455,0.596000,0.264000,0.140000


In [61]:
# Persist the `data` table in two formats (Feather and pickle),
# written to the current working directory.
data.to_feather("data.ftr")
data.to_pickle("data.pkl")

In [62]:
# Read the Feather file back to check the round trip.
# NOTE(review): the output saved with this cell shows columns from a different
# table (age, reputation, position_*, ...) than the features built in this
# notebook, so it looks stale — re-run from a fresh kernel to confirm.
pd.read_feather("data.ftr")

Unnamed: 0,age,reputation,stat_overall,stat_potential,stat_skill_moves,contract_until_2018,contract_until_2019,contract_until_2020,contract_until_2021,contract_until_2022,...,continent_asia,continent_europe,continent_oceania,continent_south america,position_DF,position_GK,position_MF,position_ST,prefer_foot_left,prefer_foot_right
0,31,5.0,94,94,4.0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,1,0
1,27,4.0,91,93,1.0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1
2,31,5.0,91,91,3.0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,1
3,32,4.0,91,91,3.0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,1
4,25,3.0,90,93,1.0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12755,20,1.0,48,63,2.0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,1
12756,18,1.0,48,65,1.0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
12757,18,1.0,48,65,2.0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,1
12758,18,1.0,47,61,2.0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,1
