In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the Otto training CSV into a DataFrame and preview its first rows.
train_csv_path = './data/otto_train.csv'
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


```
id: 고유 아이디
feat_1 ~ feat_93: 설명변수
target: 타겟변수 (Class_1 ~ Class_9)
```

In [3]:
# data.shape is (row count, column count); unpack both and print one per line.
nCar, nVar = data.shape
print(nCar, nVar, sep='\n')

61878
95


In [5]:
# Remove the 'id' column so it is not picked up as a feature later on.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError on the already-removed column.
data = data.drop(columns=['id'], errors='ignore')

In [7]:
# Lookup table from the class labels 'Class_1' ... 'Class_9' to the integer
# that appears in each label's name.
mapping_dict = {f'Class_{code}': code for code in range(1, 10)}

In [11]:
def _class_label_to_int(label):
    """Return the integer that mapping_dict assigns to `label`.

    Raises KeyError when `label` is not a key of mapping_dict.
    """
    return mapping_dict[label]


# Encode every entry of the 'target' column as its integer class code.
after_mapping_target = data['target'].apply(_class_label_to_int)

In [12]:
# Display the integer-encoded target Series to check the label mapping.
after_mapping_target

0        1
1        1
2        1
3        1
4        1
        ..
61873    9
61874    9
61875    9
61876    9
61877    9
Name: target, Length: 61878, dtype: int64

In [13]:
# Every column except 'target' is used as a feature. Index.difference returns
# the names in sorted order, so X's columns follow that order rather than the
# column order of `data`.
feature_columns = data.columns.difference(['target']).tolist()
X = data.loc[:, feature_columns]
y = after_mapping_target

# Hold out 20% of the rows for evaluation; random_state=42 fixes the shuffle.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (train_x, test_x, train_y, test_y)))

(49502, 93) (12376, 93) (49502,) (12376,)


### 학습 데이터를 랜덤포레스트 모형에 적용

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [16]:
# Baseline forest: 20 trees, each capped at depth 5; random_state=0 keeps the
# fit repeatable. The fit call stays last so the estimator repr is displayed.
baseline_params = {'n_estimators': 20, 'max_depth': 5, 'random_state': 0}
clf = RandomForestClassifier(**baseline_params)
clf.fit(train_x, train_y)

RandomForestClassifier(max_depth=5, n_estimators=20, random_state=0)

In [18]:
# Predict the held-out rows with the currently fitted `clf` and print the
# fraction of labels it gets right.
predict1 = clf.predict(test_x)
accuracy1 = accuracy_score(y_true=test_y, y_pred=predict1)
print(accuracy1)

0.5929217840982547


In [19]:
# Increase the number of trees: 300 estimators, depth cap still 5.
# The fit call stays last so the estimator repr is displayed.
more_trees_params = {'n_estimators': 300, 'max_depth': 5, 'random_state': 0}
clf = RandomForestClassifier(**more_trees_params)
clf.fit(train_x, train_y)

RandomForestClassifier(max_depth=5, n_estimators=300, random_state=0)

In [20]:
# Predict the held-out rows with the currently fitted `clf` and print the
# fraction of labels it gets right.
predict2 = clf.predict(test_x)
accuracy2 = accuracy_score(y_true=test_y, y_pred=predict2)
print(accuracy2)

0.608678086619263


In [21]:
# Increase the tree depth: depth cap raised to 20. Note that n_estimators is
# 100 in this run, so the tree count differs from the 300-tree run as well.
clf = RandomForestClassifier(
    n_estimators=100, max_depth=20, random_state=0
).fit(train_x, train_y)  # fit() returns the fitted estimator itself
predict3 = clf.predict(test_x)
accuracy3 = accuracy_score(y_true=test_y, y_pred=predict3)
print(accuracy3)

0.7798157724628313


In [22]:
# Largest depth setting tried: depth cap of 100 with 100 trees.
clf = RandomForestClassifier(
    n_estimators=100, max_depth=100, random_state=0
).fit(train_x, train_y)  # fit() returns the fitted estimator itself
predict4 = clf.predict(test_x)
accuracy4 = accuracy_score(y_true=test_y, y_pred=predict4)
print(accuracy4)

0.8119747899159664
