### ibovespa-stocks 브라질 주식 데이터 분석
* 브라질 증권 거래소의 주식 - Stocks from the Brazilian stock exchange
* 데이터 출처 : https://www.kaggle.com/datasets/felsal/ibovespa-stocks

##### 대회 개요
* 1994년 2020년까지의 브라질 증권 B3증권 거래소에서 거래된  주식 정보
* 데이터 파일
       b3_stocks_1994_2020.csv
       selic.csv
       usd2brl.csv

##### 데이터 설명
* Input variables

      datetime : 날짜, 시간
      ticker   : 
      open     : 시작가
      close    : 종가
      high     : 최고가
      low      : 최저가
      volume   : 거래량
* Output variable
      close_open : 종가 - 시작가

In [1]:
# 라이브러리 불러오기

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
# 데이터 불러오기
b3_stock = pd.read_csv("b3_stocks_1994_2020.csv")
selic = pd.read_csv("selic.csv")
usb2brl = pd.read_csv("usd2brl.csv")

b3_stock.shape, selic.shape, usb2brl.shape

((1883203, 7), (6651, 2), (6651, 2))

In [3]:
print( b3_stock.columns )
print( selic.columns)
print( usb2brl.columns)

Index(['datetime', 'ticker', 'open', 'close', 'high', 'low', 'volume'], dtype='object')
Index(['datetime', 'selic'], dtype='object')
Index(['datetime', 'usd_brl'], dtype='object')


In [4]:
b3_stock.head()

Unnamed: 0,datetime,ticker,open,close,high,low,volume
0,1994-07-04,ACE 3,48.0,48.0,48.0,47.0,46550.0
1,1994-07-04,ALP 3,155.27,156.0,156.0,155.27,163405.8
2,1994-07-04,ALP 4,131.0,131.0,131.0,131.0,6550.0
3,1994-07-04,IBP 6,600.0,600.0,600.0,600.0,7800.0
4,1994-07-04,AQT 4,0.89,0.99,0.99,0.85,13137.0


In [5]:
selic.head()

Unnamed: 0,datetime,selic
0,1994-07-04,0.003963
1,1994-07-05,0.003997
2,1994-07-06,0.003983
3,1994-07-07,0.003997
4,1994-07-08,0.003937


In [6]:
usb2brl.head()

Unnamed: 0,datetime,usd_brl
0,1994-07-04,0.94
1,1994-07-05,0.932
2,1994-07-06,0.915
3,1994-07-07,0.91
4,1994-07-08,0.92


In [7]:
b3_stock.info()  # 180만개 데이터

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1883203 entries, 0 to 1883202
Data columns (total 7 columns):
 #   Column    Dtype  
---  ------    -----  
 0   datetime  object 
 1   ticker    object 
 2   open      float64
 3   close     float64
 4   high      float64
 5   low       float64
 6   volume    float64
dtypes: float64(5), object(2)
memory usage: 100.6+ MB


In [8]:
b3_stock.describe()

Unnamed: 0,open,close,high,low,volume
count,1883203.0,1883203.0,1883203.0,1883203.0,1883203.0
mean,68.14208,68.27674,68.82414,67.54027,16682860.0
std,1689.781,1695.07,1699.966,1683.604,102634400.0
min,0.01,0.0,0.01,0.01,0.0
25%,3.96,3.96,4.0,3.89,18005.0
50%,13.69,13.7,13.92,13.45,252656.0
75%,36.7,36.73,37.14,36.08,4794014.0
max,1297776.0,1297776.0,1297776.0,1297776.0,42983800000.0


In [9]:
selic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6651 entries, 0 to 6650
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   datetime  6651 non-null   object 
 1   selic     6651 non-null   float64
dtypes: float64(1), object(1)
memory usage: 104.0+ KB


In [10]:
usb2brl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6651 entries, 0 to 6650
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   datetime  6651 non-null   object 
 1   usd_brl   6651 non-null   float64
dtypes: float64(1), object(1)
memory usage: 104.0+ KB


* 과제 - 시작가에 비해 종가가 얼마가 달라지는가에 대한 예측 모델 만들기

In [11]:
b3_stock.head()

Unnamed: 0,datetime,ticker,open,close,high,low,volume
0,1994-07-04,ACE 3,48.0,48.0,48.0,47.0,46550.0
1,1994-07-04,ALP 3,155.27,156.0,156.0,155.27,163405.8
2,1994-07-04,ALP 4,131.0,131.0,131.0,131.0,6550.0
3,1994-07-04,IBP 6,600.0,600.0,600.0,600.0,7800.0
4,1994-07-04,AQT 4,0.89,0.99,0.99,0.85,13137.0


In [12]:
# 새로운 컬럼 추가 : 종가 - 시작가
b3_stock['close_open'] = b3_stock['close'] - b3_stock['open']
b3_stock.head() 

Unnamed: 0,datetime,ticker,open,close,high,low,volume,close_open
0,1994-07-04,ACE 3,48.0,48.0,48.0,47.0,46550.0,0.0
1,1994-07-04,ALP 3,155.27,156.0,156.0,155.27,163405.8,0.73
2,1994-07-04,ALP 4,131.0,131.0,131.0,131.0,6550.0,0.0
3,1994-07-04,IBP 6,600.0,600.0,600.0,600.0,7800.0,0.0
4,1994-07-04,AQT 4,0.89,0.99,0.99,0.85,13137.0,0.1


In [13]:
### ticker은 몇개의 값이 존재할까?
len(b3_stock['ticker'].unique())

3397

In [14]:
b3_stock.columns

Index(['datetime', 'ticker', 'open', 'close', 'high', 'low', 'volume',
       'close_open'],
      dtype='object')

In [15]:
### target : close_open
### input : open, close, high, low, volume
sel = ['open', 'close', 'high', 'low', 'volume']

X_tr = b3_stock[sel]
y_tr = b3_stock['close_open']

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(X_tr, y_tr, test_size=0.3, random_state=0)

model1 = DecisionTreeRegressor()
model1.fit(X_train, y_train)

model2 = RandomForestRegressor(max_depth=3, n_estimators=5, random_state=0, n_jobs=-1)
model2.fit(X_train, y_train)

RandomForestRegressor(max_depth=3, n_estimators=5, n_jobs=-1, random_state=0)

In [17]:
print("의사결정트리 score :", model1.score(X_train, y_train), model1.score(X_test, y_test) )
print("랜덤포레스트 score :", model2.score(X_train, y_train), model2.score(X_test, y_test) )

의사결정트리 score : 1.0 -4.233173620569298
랜덤포레스트 score : 0.6806127222430476 -1.6176576427329867


* 성능이 잘 나오지 않음. 그러면 우선적으로 가격이 오르는지, 내리는지 예측 모델을 만들어볼까?

In [18]:
b3_stock.loc[  b3_stock['close_open'] > 0, "stock_up" ] = 1
b3_stock.loc[  b3_stock['close_open'] <= 0, "stock_up" ] = 0
b3_stock['stock_up'] = b3_stock['stock_up'].astype("int16")
b3_stock.head()

Unnamed: 0,datetime,ticker,open,close,high,low,volume,close_open,stock_up
0,1994-07-04,ACE 3,48.0,48.0,48.0,47.0,46550.0,0.0,0
1,1994-07-04,ALP 3,155.27,156.0,156.0,155.27,163405.8,0.73,1
2,1994-07-04,ALP 4,131.0,131.0,131.0,131.0,6550.0,0.0,0
3,1994-07-04,IBP 6,600.0,600.0,600.0,600.0,7800.0,0.0,0
4,1994-07-04,AQT 4,0.89,0.99,0.99,0.85,13137.0,0.1,1


In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [20]:
### target : close_open
### input : open, close, high, low, volume
sel = ['open', 'close', 'high', 'low', 'volume']

X_tr = b3_stock[sel]
y_tr = b3_stock['stock_up']

X_train, X_test, y_train, y_test = train_test_split(X_tr, y_tr, test_size=0.3, random_state=0)

In [21]:
model1 = DecisionTreeClassifier()
model1.fit(X_train, y_train)
print("의사결정트리 score :", model1.score(X_train, y_train), model1.score(X_test, y_test) )

의사결정트리 score : 1.0 0.9719290358095515


In [22]:
%%time

model2 = RandomForestClassifier(max_depth=3, n_estimators=50, random_state=0, n_jobs=-1)
model2.fit(X_train, y_train)
print("랜덤포레스트 score :", model2.score(X_train, y_train), model2.score(X_test, y_test) )

랜덤포레스트 score : 0.6374595863278518 0.6379183695865732
CPU times: user 2min 3s, sys: 310 ms, total: 2min 3s
Wall time: 1min 4s


* 최종 모델 - 주가가 올랐는지 내렸는지 확인해 보기

In [23]:
model1 = DecisionTreeClassifier()
model1.fit(X_train, y_train)
print("의사결정트리 score :", model1.score(X_train, y_train), model1.score(X_test, y_test) )

의사결정트리 score : 1.0 0.971822833788527
