In [257]:
import pandas as pd
import numpy as np
import talib
import datetime as dt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

In [258]:
minDf = pd.read_csv('./Stock_Trading_System/32bit/hapminstockdata/루닛 분 단위 주식 데이터.csv', encoding='euc-kr')
dayDf = pd.read_csv('./Stock_Trading_System/32bit/hapdaystockdata/루닛 일 단위 주식 데이터.csv', encoding='euc-kr')
newsDf = pd.read_csv('./Stock_Trading_System/32bit/hapnewsdata/루닛 뉴스 데이터.csv', encoding='utf-8')

In [259]:
upper, middle, lower = talib.BBANDS(dayDf['종가'], timeperiod=20)
macd, macdsignal, macdhist = talib.MACD(dayDf['종가'], fastperiod=12, slowperiod=26, signalperiod=9)
ar_up, ar_dn = talib.AROON(dayDf['고가'], dayDf['저가'], timeperiod=14)
resultDf = pd.DataFrame({
    '종목코드':dayDf['종목코드'],
    '종목명':dayDf['종목명'],
    '날짜':dayDf['날짜'],
    'SMA5':talib.SMA(dayDf['종가'], timeperiod=5),
    'SMA20':talib.SMA(dayDf['종가'], timeperiod=10),
    'UPPER':upper,
    'MAVG':middle,
    'LOWER':lower,
    'RSI':talib.RSI(dayDf['종가'], timeperiod=14),
    'MACD':macd,
    'AROONUP':ar_up,
    'AROONDN':ar_dn
    })
resultDf = resultDf[33:].reset_index()
resultDf.drop('index', axis=1, inplace=True)

In [260]:
minDf[['SMA5','SMA20','UPPER','MAVG','LOWER','RSI','MACD','AROONUP','AROONDN','NEWS','LABEL']] = np.nan

In [261]:
tokenizer = AutoTokenizer.from_pretrained('snunlp/KR-FinBert-SC')
model = AutoModelForSequenceClassification.from_pretrained('snunlp/KR-FinBert-SC')
senti_classifier = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

In [262]:
newsTime = []
newsLabel = []
for k in tqdm(range(newsDf.shape[0])):
    score = 0
    list_text = newsDf['내용'][k].split('. ')
    for i in range(len(list_text)):
        if len(list_text[i]) <= 550:
            if senti_classifier(list_text[i])[0]['label'] == 'positive':
                score += 1
            elif senti_classifier(list_text[i])[0]['label'] == 'negative':
                score -= 1
            else:
                pass
    changeTime = dt.datetime(int(str(newsDf['날짜'][i])[:4]), int(str(newsDf['날짜'][i])[5:7]), int(str(newsDf['날짜'][i])[8:10]), int(str(newsDf['날짜'][i])[11:13]), int(str(newsDf['날짜'][i])[14:16]))
    newsTime.append(changeTime.strftime('%Y%m%d %H%M'))
    newsLabel.append(score)
newsLabelDf = pd.DataFrame({
    'date':newsTime,
    'label':newsLabel
})

100%|██████████| 406/406 [18:32<00:00,  2.74s/it]


In [264]:
for i in tqdm(range(resultDf.shape[0])):
    for j in range(minDf.shape[0]):
        if resultDf['날짜'][i] == minDf['날짜'][j]:
            minDf['SMA5'][j] = resultDf['SMA5'][i]
            minDf['SMA20'][j] = resultDf['SMA20'][i]
            minDf['UPPER'][j] = resultDf['UPPER'][i]
            minDf['MAVG'][j] = resultDf['MAVG'][i]
            minDf['LOWER'][j] = resultDf['LOWER'][i]
            minDf['RSI'][j] = resultDf['RSI'][i]
            minDf['MACD'][j] = resultDf['MACD'][i]
            minDf['AROONUP'][j] = resultDf['AROONUP'][i]
            minDf['AROONDN'][j] = resultDf['AROONDN'][i]

  0%|          | 0/214 [00:00<?, ?it/s]

100%|██████████| 214/214 [08:38<00:00,  2.42s/it]
100%|██████████| 406/406 [09:18<00:00,  1.38s/it]


In [None]:
for i in tqdm(range(newsLabelDf.shape[0])):
    for j in range(minDf.shape[0]):
        if int(newsLabelDf['date'][i][:8]) == minDf['날짜'][j]:
            if int(newsLabelDf['date'][i][9:]) == minDf['시간'][j]:
                minDf['NEWS'][j] = newsLabelDf['label'][i]
minDf['NEWS'].fillna(0, inplace=True)
resultHapDf = minDf[minDf['SMA5'].notnull()]
resultHapDf.reset_index(inplace=True)
resultHapDf.drop('index', axis=1, inplace=True)   

In [313]:
for i in tqdm(range(resultHapDf.shape[0])):
    if i <= int(resultHapDf.shape[0]) - 2:
        if resultHapDf['종가'].iloc[i] > resultHapDf['종가'].iloc[i+1]:
            resultHapDf['LABEL'].iloc[i+1] = 1
        elif resultHapDf['종가'].iloc[i] < resultHapDf['종가'].iloc[i+1]:
            resultHapDf['LABEL'].iloc[i+1] = 2
        elif resultHapDf['종가'].iloc[i] == resultHapDf['종가'].iloc[i+1]:
            resultHapDf['LABEL'].iloc[i+1] = 0
resultHapDf['LABEL'].fillna(0, inplace=True)

100%|██████████| 77756/77756 [01:23<00:00, 931.76it/s] 


In [317]:
resultHapDf

Unnamed: 0,종목코드,종목명,날짜,시간,시가,고가,저가,종가,거래량,SMA5,SMA20,UPPER,MAVG,LOWER,RSI,MACD,AROONUP,AROONDN,NEWS,LABEL
0,A328130,루닛,20220907,901,33500.0,33500.0,33450.0,0.0,190.0,34300.0,36065.0,43211.232571,37787.5,32363.767429,32.432582,-1861.666357,100.000000,7.142857,0.0,0.0
1,A328130,루닛,20220907,902,33650.0,33650.0,33550.0,33650.0,210.0,34300.0,36065.0,43211.232571,37787.5,32363.767429,32.432582,-1861.666357,100.000000,7.142857,0.0,2.0
2,A328130,루닛,20220907,903,33650.0,33750.0,33650.0,33650.0,319.0,34300.0,36065.0,43211.232571,37787.5,32363.767429,32.432582,-1861.666357,100.000000,7.142857,0.0,0.0
3,A328130,루닛,20220907,904,33700.0,33700.0,33650.0,33650.0,462.0,34300.0,36065.0,43211.232571,37787.5,32363.767429,32.432582,-1861.666357,100.000000,7.142857,0.0,0.0
4,A328130,루닛,20220907,905,33700.0,33700.0,33700.0,33700.0,589.0,34300.0,36065.0,43211.232571,37787.5,32363.767429,32.432582,-1861.666357,100.000000,7.142857,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77751,A328130,루닛,20230718,1517,176400.0,176600.0,176400.0,176500.0,237135.0,182540.0,181290.0,202514.664345,169110.0,135705.335655,60.367850,18578.260887,14.285714,78.571429,0.0,2.0
77752,A328130,루닛,20230718,1518,176500.0,176600.0,176400.0,176500.0,237532.0,182540.0,181290.0,202514.664345,169110.0,135705.335655,60.367850,18578.260887,14.285714,78.571429,0.0,0.0
77753,A328130,루닛,20230718,1519,176600.0,176600.0,176500.0,176600.0,237947.0,182540.0,181290.0,202514.664345,169110.0,135705.335655,60.367850,18578.260887,14.285714,78.571429,0.0,2.0
77754,A328130,루닛,20230718,1520,176600.0,176800.0,176500.0,176800.0,238596.0,182540.0,181290.0,202514.664345,169110.0,135705.335655,60.367850,18578.260887,14.285714,78.571429,0.0,2.0


In [319]:
resultHapDf.to_csv('./lstm 학습 데이터.csv', index=False, encoding='euc-kr')