TSLA 주가 데이터로 다음 거래일의 등락(상승/하락)을 예측하는 머신러닝 분류 모델 실험 내용을 담습니다.

상세 내용: https://songseungwon.tistory.com/130

# 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
plt.style.use('default')

In [2]:
# yfinance handle for the TSLA ticker; price data is fetched from it below.
stocks = yf.Ticker('TSLA')

In [3]:
# Request the longest history available for the ticker (period='max').
df = stocks.history(period='max')
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29 00:00:00-04:00,1.266667,1.666667,1.169333,1.592667,281494500,0.0,0.0
2010-06-30 00:00:00-04:00,1.719333,2.028000,1.553333,1.588667,257806500,0.0,0.0
2010-07-01 00:00:00-04:00,1.666667,1.728000,1.351333,1.464000,123282000,0.0,0.0
2010-07-02 00:00:00-04:00,1.533333,1.540000,1.247333,1.280000,77097000,0.0,0.0
2010-07-06 00:00:00-04:00,1.333333,1.333333,1.055333,1.074000,103003500,0.0,0.0
...,...,...,...,...,...,...,...
2024-06-20 00:00:00-04:00,184.679993,185.210007,179.660004,181.570007,55893100,0.0,0.0
2024-06-21 00:00:00-04:00,182.300003,183.949997,180.690002,183.009995,63029500,0.0,0.0
2024-06-24 00:00:00-04:00,184.970001,188.800003,182.550003,182.580002,61992100,0.0,0.0
2024-06-25 00:00:00-04:00,184.399994,187.970001,182.009995,187.350006,63678300,0.0,0.0


# 머신러닝

## 지도학습

In [4]:
# Label each row 1 when its close is at or above the previous row's close
# (an unchanged close counts as "up"), otherwise 0.
# The first row has no previous close: its diff is NaN, the comparison is
# False, and so its label is 0.
df['up'] = (df['Close'].diff() >= 0).astype('int64')
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,up
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-06-29 00:00:00-04:00,1.266667,1.666667,1.169333,1.592667,281494500,0.0,0.0,0
2010-06-30 00:00:00-04:00,1.719333,2.028000,1.553333,1.588667,257806500,0.0,0.0,0
2010-07-01 00:00:00-04:00,1.666667,1.728000,1.351333,1.464000,123282000,0.0,0.0,0
2010-07-02 00:00:00-04:00,1.533333,1.540000,1.247333,1.280000,77097000,0.0,0.0,0
2010-07-06 00:00:00-04:00,1.333333,1.333333,1.055333,1.074000,103003500,0.0,0.0,0
...,...,...,...,...,...,...,...,...
2024-06-20 00:00:00-04:00,184.679993,185.210007,179.660004,181.570007,55893100,0.0,0.0,0
2024-06-21 00:00:00-04:00,182.300003,183.949997,180.690002,183.009995,63029500,0.0,0.0,1
2024-06-24 00:00:00-04:00,184.970001,188.800003,182.550003,182.580002,61992100,0.0,0.0,0
2024-06-25 00:00:00-04:00,184.399994,187.970001,182.009995,187.350006,63678300,0.0,0.0,1


In [5]:
# Row-over-row change in the closing price; the first row is NaN because it
# has no predecessor.
df['Close'].diff()

Date
2010-06-29 00:00:00-04:00         NaN
2010-06-30 00:00:00-04:00   -0.004000
2010-07-01 00:00:00-04:00   -0.124667
2010-07-02 00:00:00-04:00   -0.184000
2010-07-06 00:00:00-04:00   -0.206000
                               ...   
2024-06-20 00:00:00-04:00   -3.289993
2024-06-21 00:00:00-04:00    1.439987
2024-06-24 00:00:00-04:00   -0.429993
2024-06-25 00:00:00-04:00    4.770004
2024-06-26 00:00:00-04:00    9.019989
Name: Close, Length: 3522, dtype: float64

In [6]:
# Keep only the OHLCV columns plus the label, then lower-case the column
# names ('up' is already lower-case and is left as is).
df = df[['Open', 'High', 'Low', 'Close', 'Volume', 'up']]
df.columns = [name.lower() for name in df.columns]
df

Unnamed: 0_level_0,open,high,low,close,volume,up
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-06-29 00:00:00-04:00,1.266667,1.666667,1.169333,1.592667,281494500,0
2010-06-30 00:00:00-04:00,1.719333,2.028000,1.553333,1.588667,257806500,0
2010-07-01 00:00:00-04:00,1.666667,1.728000,1.351333,1.464000,123282000,0
2010-07-02 00:00:00-04:00,1.533333,1.540000,1.247333,1.280000,77097000,0
2010-07-06 00:00:00-04:00,1.333333,1.333333,1.055333,1.074000,103003500,0
...,...,...,...,...,...,...
2024-06-20 00:00:00-04:00,184.679993,185.210007,179.660004,181.570007,55893100,0
2024-06-21 00:00:00-04:00,182.300003,183.949997,180.690002,183.009995,63029500,1
2024-06-24 00:00:00-04:00,184.970001,188.800003,182.550003,182.580002,61992100,0
2024-06-25 00:00:00-04:00,184.399994,187.970001,182.009995,187.350006,63678300,1


In [7]:
# Feature table: the raw open/high/low/close/volume columns.
x = df[['open', 'high','low', 'close', 'volume']]
# Target: the 0/1 `up` label column.
y = df['up']

In [8]:
# 메소드
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def show_ml_acc(X, y, model, diff=False, *, test_size=0.2, shuffle=True,
                random_state=42):
    """Fit ``model`` to predict the next row's label and report test accuracy.

    Row ``t`` of ``X`` is paired with row ``t + 1`` of ``y`` (``X`` loses its
    last row, ``y`` its first), so the model learns to predict the label of
    the following row from the current row's features. The pairs are split
    with ``train_test_split``, the model is fitted on the training part, and
    the accuracy and a ``classification_report`` for the held-out part are
    printed.

    Args:
        X: Feature table supporting ``.iloc`` slicing (e.g. a DataFrame).
        y: Label series supporting ``.iloc`` slicing, with as many rows
            as ``X``.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        diff: Pass True when ``X`` was produced by differencing; its first
            row is then skipped together with the matching label.
        test_size: Forwarded to ``train_test_split``.
        shuffle: Forwarded to ``train_test_split``. With the default
            ``True`` the held-out rows are drawn at random from the whole
            period, so on time-ordered data the model is trained on rows
            that come after some test rows. Pass ``False`` to hold out the
            most recent rows instead.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        The fraction of held-out rows whose label was predicted correctly.
    """
    # A differenced table has no value in its first row, so start one row
    # later; the label slice always stays one row ahead of the features.
    start = 1 if diff else 0
    X_train, X_test, y_train, y_test = train_test_split(
        X.iloc[start:-1],
        y.iloc[start + 1:],
        test_size=test_size,
        shuffle=shuffle,
        random_state=random_state,
    )

    print(f'Model : {model.__class__.__name__}')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = (y_pred == y_test).sum() / len(y_pred)
    print(f'Accuracy : {acc:.2f}')
    # When a model never predicts one of the classes, that class's precision
    # is undefined. zero_division=0 reports it as 0 (the same number the
    # default prints) without emitting a warning for every averaged metric.
    print(classification_report(y_test, y_pred, zero_division=0))
    return acc

## 평가

In [9]:
# 모델
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Candidate classifiers, all otherwise at library defaults.
# The tree-based models get a fixed seed so that repeated runs give the same
# accuracies, in line with the fixed random_state used for the train/test
# split in show_ml_acc.
# NOTE(review): the features are passed to LogisticRegression unscaled
# (volume is orders of magnitude larger than the prices); this may hurt the
# solver's convergence — consider scaling, to be confirmed.
lr = LogisticRegression()
rfc = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
lgb = LGBMClassifier(random_state=42)

In [10]:
# Score every model on the raw (undifferenced) features and keep each accuracy.
acc_li = [
    show_ml_acc(x, y, model, diff=False)
    for model in [lr, rfc, xgb, lgb]
]
print('=' * 50)
print(f'Mean of Acc : {np.mean(acc_li)}')

Model : LogisticRegression
Accuracy : 0.50
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       351
           1       0.50      1.00      0.67       354

    accuracy                           0.50       705
   macro avg       0.25      0.50      0.33       705
weighted avg       0.25      0.50      0.34       705

Model : RandomForestClassifier


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy : 0.52
              precision    recall  f1-score   support

           0       0.53      0.46      0.49       351
           1       0.52      0.58      0.55       354

    accuracy                           0.52       705
   macro avg       0.52      0.52      0.52       705
weighted avg       0.52      0.52      0.52       705

Model : XGBClassifier
Accuracy : 0.51
              precision    recall  f1-score   support

           0       0.51      0.48      0.50       351
           1       0.51      0.55      0.53       354

    accuracy                           0.51       705
   macro avg       0.51      0.51      0.51       705
weighted avg       0.51      0.51      0.51       705

Model : LGBMClassifier
[LightGBM] [Info] Number of positive: 1472, number of negative: 1344
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[Lig

In [11]:
# Build a split by hand with the same arguments as the diff=False branch of
# show_ml_acc: features of row t are paired with the `up` label of row t + 1
# (x drops its last row, y drops its first), and 20% is held out with a
# fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
        x.iloc[:-1],
        y.iloc[1:],
        test_size=0.2,
        shuffle=True,
        random_state=42
    )

# Refit logistic regression on this split so its predictions can be inspected.
lr.fit(X_train, y_train)

In [12]:
# Predicted class probabilities for each held-out row (one column per class).
lr.predict_proba(X_test)

array([[0.43203847, 0.56796153],
       [0.49957264, 0.50042736],
       [0.49793532, 0.50206468],
       ...,
       [0.49738249, 0.50261751],
       [0.4955043 , 0.5044957 ],
       [0.48383608, 0.51616392]])

In [13]:
# Hard class labels predicted for the same held-out rows.
lr.predict(X_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [14]:
## 차분 데이터 평가

In [15]:
# Row-over-row difference of every feature column; the first row becomes NaN
# because it has no predecessor.
x_diff = x.diff()
x_diff

Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-06-29 00:00:00-04:00,,,,,
2010-06-30 00:00:00-04:00,0.452666,0.361333,0.384000,-0.004000,-23688000.0
2010-07-01 00:00:00-04:00,-0.052666,-0.300000,-0.202000,-0.124667,-134524500.0
2010-07-02 00:00:00-04:00,-0.133334,-0.188000,-0.104000,-0.184000,-46185000.0
2010-07-06 00:00:00-04:00,-0.200000,-0.206667,-0.192000,-0.206000,25906500.0
...,...,...,...,...,...
2024-06-20 00:00:00-04:00,-1.880005,-1.989990,-2.709991,-3.289993,-13089200.0
2024-06-21 00:00:00-04:00,-2.379990,-1.260010,1.029999,1.439987,7136400.0
2024-06-24 00:00:00-04:00,2.669998,4.850006,1.860001,-0.429993,-1037400.0
2024-06-25 00:00:00-04:00,-0.570007,-0.830002,-0.540009,4.770004,1686200.0


In [16]:
# Score every model on the differenced features and keep each accuracy.
acc_li = [
    show_ml_acc(x_diff, y, model, diff=True)
    for model in [lr, rfc, xgb, lgb]
]

print('=' * 50)
print(f'Mean of Acc : {np.mean(acc_li)}')

Model : LogisticRegression


Accuracy : 0.48
              precision    recall  f1-score   support

           0       0.47      0.54      0.51       349
           1       0.48      0.41      0.44       355

    accuracy                           0.48       704
   macro avg       0.48      0.48      0.47       704
weighted avg       0.48      0.48      0.47       704

Model : RandomForestClassifier
Accuracy : 0.52
              precision    recall  f1-score   support

           0       0.52      0.45      0.48       349
           1       0.52      0.59      0.55       355

    accuracy                           0.52       704
   macro avg       0.52      0.52      0.52       704
weighted avg       0.52      0.52      0.52       704

Model : XGBClassifier
Accuracy : 0.49
              precision    recall  f1-score   support

           0       0.49      0.44      0.46       349
           1       0.50      0.55      0.52       355

    accuracy                           0.49       704
   macro avg       0.49    

## 로그 차분 평가

In [17]:
# Difference of the natural log of every feature column, i.e.
# log(value_t) - log(value_{t-1}); the first row becomes NaN.
# NOTE(review): np.log of a zero entry (e.g. a zero-volume row) would give
# -inf and propagate into the features — confirm no column contains zeros.
x_log_diff = np.log(x).diff()
x_log_diff

Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-06-29 00:00:00-04:00,,,,,
2010-06-30 00:00:00-04:00,0.305547,0.196224,0.283969,-0.002515,-0.087904
2010-07-01 00:00:00-04:00,-0.031111,-0.160085,-0.139311,-0.081723,-0.737735
2010-07-02 00:00:00-04:00,-0.083382,-0.115182,-0.080084,-0.134312,-0.469410
2010-07-06 00:00:00-04:00,-0.139762,-0.144101,-0.167151,-0.175470,0.289699
...,...,...,...,...,...
2024-06-20 00:00:00-04:00,-0.010128,-0.010687,-0.014971,-0.017957,-0.210409
2024-06-21 00:00:00-04:00,-0.012971,-0.006826,0.005717,0.007899,0.120162
2024-06-24 00:00:00-04:00,0.014540,0.026024,0.010241,-0.002352,-0.016596
2024-06-25 00:00:00-04:00,-0.003086,-0.004406,-0.002963,0.025790,0.026837


In [18]:
# Score every model on the log-differenced features and keep each accuracy.
acc_li = [
    show_ml_acc(x_log_diff, y, model, diff=True)
    for model in [lr, rfc, xgb, lgb]
]

print('=' * 50)
print(f'Mean of Acc : {np.mean(acc_li)}')

Model : LogisticRegression
Accuracy : 0.50
              precision    recall  f1-score   support

           0       0.50      0.04      0.07       349
           1       0.50      0.96      0.66       355

    accuracy                           0.50       704
   macro avg       0.50      0.50      0.37       704
weighted avg       0.50      0.50      0.37       704

Model : RandomForestClassifier
Accuracy : 0.51
              precision    recall  f1-score   support

           0       0.50      0.46      0.48       349
           1       0.51      0.55      0.53       355

    accuracy                           0.51       704
   macro avg       0.51      0.51      0.50       704
weighted avg       0.51      0.51      0.50       704

Model : XGBClassifier
Accuracy : 0.52
              precision    recall  f1-score   support

           0       0.51      0.48      0.50       349
           1       0.52      0.55      0.53       355

    accuracy                           0.52       704
