# 랜덤 포레스트 기법

~~~
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=40, max_features=8, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_current_pred = rnd_clf.predict(X_new)
~~~

1. `sklearn.ensemble` 모듈에서 `RandomForestClassifier` 클래스를 import
2. `RandomForestClassifier` 클래스로부터 `rnd_clf`라는 인스턴스(객체)를 생성
3. `rnd_clf`의 `fit()` 메서드로 학습 데이터(`X_train`, `y_train`)를 학습
4. `rnd_clf`의 `predict()` 메서드로 새로운 입력(`X_new`)에 대한 y를 예측

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as mt
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import joblib
from sklearn.metrics import confusion_matrix

In [2]:
# Load the modelling table from the CSV file in the working directory.
model_data = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
# Target: the "label_ko" column. Features: the first four columns of the table.
y = model_data["label_ko"]
X = model_data.iloc[:, :4]
X_names = X.columns

# Keep only rows whose label is present; rows with a missing label are
# excluded from training/evaluation but remain in X / y.
has_label = y.notna()
X_past = X[has_label]
y_past = y[has_label]

In [4]:
# Single stratified 80/20 split of the labelled rows (fixed seed for reproducibility).
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so split() yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(sss.split(X_past, y_past))

# split() returns *positional* indices. X_past / y_past are filtered views of
# X / y and keep the original row labels, so both must be indexed with .iloc.
# Plain y_past[...] would do a label lookup and could pick the wrong rows (or
# raise KeyError) whenever an unlabelled row precedes a labelled one.
X_train, X_test = X_past.iloc[train_index], X_past.iloc[test_index]
y_train, y_test = y_past.iloc[train_index], y_past.iloc[test_index]

In [5]:
# Candidate values for each hyperparameter explored by the randomized search.
param_dist_rf = {
    "n_estimators": [50, 100, 500],
    "max_leaf_nodes": [20, 30, 40, 50],
    "max_features": [2, 3, 4],
}

# Base estimator; the searched parameters above override its settings per candidate.
rnd_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Randomized hyperparameter search on the training split, scored with cv=10.
rnd_search = RandomizedSearchCV(
    estimator=rnd_clf,
    param_distributions=param_dist_rf,
    cv=10,
    random_state=42,
)
rnd_search.fit(X_train, y_train)
print(rnd_search.best_params_)

{'n_estimators': 50, 'max_leaf_nodes': 50, 'max_features': 3}


In [6]:
# Hyperparameters of the model that is evaluated, fitted and saved below.
# NOTE(review): these are hard-coded and differ from rnd_search.best_params_
# reported by the randomized search in the previous cell -- confirm this is
# intentional, otherwise build the model from rnd_search.best_params_.
forest_params = {
    "n_estimators": 100,
    "max_leaf_nodes": 40,
    "max_features": 4,
    "n_jobs": -1,
    "random_state": 42,
}
rnd_clf = RandomForestClassifier(**forest_params)

# Estimate generalisation accuracy on the training split with cv=5.
rnd_scores = cross_val_score(rnd_clf, X_train, y_train, cv=5)
print("\n<5-fold cross-validation>")
print("accuracy score mean: ", rnd_scores.mean())


<5-fold cross-validation>
accuracy score mean:  0.7529007756948933


In [7]:
# Fit the forest on the training split.
rnd_clf.fit(X_train, y_train)
print("\n<AI model: machine learning done >")

# Accuracy on the same rows the model was fitted on (an optimistic estimate).
train_accuracy = rnd_clf.score(X_train, y_train)
print("accuracy_score of train data(0.8 of sample): ", train_accuracy)


<AI model: machine learning done >
accuracy_score of train data(0.8 of sample):  0.8651026392961877


In [8]:
# Accuracy on the held-out test split, computed two ways: via the estimator's
# own score() and via sklearn.metrics.accuracy_score on explicit predictions.
test_accuracy = rnd_clf.score(X_test, y_test)
print("accuracy_score of test data(0.2 of sample): ", test_accuracy)

y_test_pred = rnd_clf.predict(X_test)
test_accuracy_from_pred = mt.accuracy_score(y_test, y_test_pred)
print("accuracy_score of test data: ", test_accuracy_from_pred)

accuracy_score of test data(0.2 of sample):  0.7624633431085044
accuracy_score of test data:  0.7624633431085044


In [9]:
# Confusion matrix on the held-out test split.
# The `labels` argument fixes the class order of the matrix to up, neutral, down.
y_test_pred = rnd_clf.predict(X_test)
cm1 = confusion_matrix(
    y_test,
    y_test_pred,
    labels=["up", "neutral", "down"],
)

print("\n<Confusion matrix>")
print("(of test)")
print("up", "neutral", "down")
print(cm1)


<Confusion matrix>
(of test)
up neutral down
[[ 76  15  11]
 [ 10  60  22]
 [  7  16 124]]


In [10]:
# Confusion matrix over every labelled row. X_past / y_past contain the rows
# the model was trained on as well as the test rows, so this is not an
# out-of-sample estimate.
y_past_pred = rnd_clf.predict(X_past)
cm2 = confusion_matrix(
    y_past,
    y_past_pred,
    labels=["up", "neutral", "down"],
)

print("(of all)")
print("up", "neutral", "down")
print(cm2)

(of all)
up neutral down
[[433  35  40]
 [ 48 341  73]
 [ 22  47 666]]


In [11]:
# Print each feature column name next to the fitted forest's importance for it.
print("\n<Feature importance>")
importance_by_column = zip(X.columns, rnd_clf.feature_importances_)
for column_name, importance in importance_by_column:
    print(column_name, ": ", importance)


<Feature importance>
krw :  0.3937936026579611
vko :  0.20050456499441915
wti :  0.29812683537099693
cboe :  0.10757499697662289


In [12]:
# Predict a label for every row of X (labelled and unlabelled alike) and wrap
# the result in a Series that shares y's index so the two line up row by row.
y_prediction = rnd_clf.predict(X)
y_pred = pd.Series(data=y_prediction, index=y.index)

In [13]:
# Persist the fitted classifier to disk in the working directory.
joblib.dump(value=rnd_clf, filename="forecast_model.pkl")
print("\n< AI model: save >")


< AI model: save >
