In [4]:
!pip install tensorflow-addons
!pip install transformers

import os
from google.colab import drive
drive.mount('/content/drive/')

import sys
import os
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import urllib.request
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, \
                            roc_auc_score, confusion_matrix, classification_report, \
                            matthews_corrcoef, cohen_kappa_score, log_loss

# Report whether TensorFlow sees the first GPU. A MirroredStrategy is only
# created when it does, so `mirrored_strategy` is undefined on CPU runtimes.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print("GPU 미작동 중")
else:
  print("GPU 작동 중")
  mirrored_strategy = tf.distribute.MirroredStrategy()

# Load the news table from Drive and parse its 'date' column strictly:
# errors='raise' aborts on any value that does not match the given format.
news = pd.read_csv('./drive/MyDrive/Colab Notebooks/data/News_With_StockCode.csv') 
news['date'] = pd.to_datetime(news['date'], format='%Y-%m-%d %H:%M:%S', errors='raise')

# Pretrained checkpoint used for both the classifier and its tokenizer.
# from_pt=True builds the TensorFlow model from the PyTorch weights; the
# 3-way classification head is newly initialised (see the loader's warning
# in the cell output), i.e. it is untrained at this point.
MODEL_NAME = "klue/bert-base"
model = TFBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3, from_pt=True)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Fixed length (in token ids) every text is truncated/padded to.
MAX_SEQ_LEN = 64

def convert_data(X_data):
    """Encode texts into fixed-length BERT model inputs.

    Every text is tokenized with the module-level ``tokenizer`` and
    truncated or padded to exactly ``MAX_SEQ_LEN`` token ids.

    Args:
        X_data: Iterable of texts to encode (called below with
            ``news['title']``).

    Returns:
        list: ``[tokens, masks, segments]`` as numpy arrays with one row per
        input text. ``tokens`` holds the token ids, ``masks`` is 1 for real
        tokens and 0 for padding, and ``segments`` is all zeros
        (single-sentence input).
    """
    # Ask the tokenizer for its padding id rather than assuming it is 0, and
    # mark padding per position instead of assuming that every id equal to 0
    # is trailing padding.
    pad_id = tokenizer.pad_token_id

    tokens, masks, segments = [], [], []

    for X in tqdm(X_data):
        token = tokenizer.encode(X, truncation = True, padding = 'max_length', max_length = MAX_SEQ_LEN)

        tokens.append(token)
        masks.append([int(token_id != pad_id) for token_id in token])
        segments.append([0] * MAX_SEQ_LEN)

    return [np.array(tokens), np.array(masks), np.array(segments)]

# Encode every news title into [token ids, attention masks, segment ids].
x_data = news['title']
x_data_converted = convert_data(x_data)

# Three int32 inputs of length MAX_SEQ_LEN, passed to the BERT model in the
# same order convert_data returns them.
token_inputs = tf.keras.layers.Input((MAX_SEQ_LEN,), dtype = tf.int32, name = 'input_word_ids')
mask_inputs = tf.keras.layers.Input((MAX_SEQ_LEN,), dtype = tf.int32, name = 'input_masks')
segment_inputs = tf.keras.layers.Input((MAX_SEQ_LEN,), dtype = tf.int32, name = 'input_segment')
bert_outputs = model([token_inputs, mask_inputs, segment_inputs])
# First element of the model output; presumably the classification logits
# of TFBertForSequenceClassification -- TODO confirm.
bert_output = bert_outputs[0]

# Classification head: dropout followed by a softmax Dense layer over
# NUM_CLASS classes.
DROPOUT_RATE = 0.5
NUM_CLASS = 3
dropout = tf.keras.layers.Dropout(DROPOUT_RATE)(bert_output)

sentiment_layer = tf.keras.layers.Dense(NUM_CLASS, activation='softmax', kernel_initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02))(dropout)
sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_layer)

# Optimizer hyper-parameters for Rectified Adam with warm-up.
# NOTE(review): OPTIMIZER_NAME is not referenced anywhere else in the
# visible code.
OPTIMIZER_NAME = 'RAdam'
LEARNING_RATE = 5e-5
TOTAL_STEPS = 10000
MIN_LR = 1e-5
WARMUP_PROPORTION = 0.1
EPSILON = 1e-8
CLIPNORM = 1.0
optimizer = tfa.optimizers.RectifiedAdam(learning_rate = LEARNING_RATE,
                                          total_steps = TOTAL_STEPS, 
                                          warmup_proportion = WARMUP_PROPORTION, 
                                          min_lr = MIN_LR, 
                                          epsilon = EPSILON,
                                          clipnorm = CLIPNORM)

# The head outputs probabilities (softmax), which matches the loss default
# of from_logits=False; the loss expects integer class labels.
# NOTE(review): in the visible code this compiled model is never trained or
# used for prediction; predictions come from the model loaded from disk
# further down.
sentiment_model.compile(optimizer = optimizer, 
                        loss = tf.keras.losses.SparseCategoricalCrossentropy(), 
                        metrics = ['accuracy'])

# Load the previously saved best model from Drive. custom_objects lets Keras
# resolve the Hugging Face layer class while deserialising the .h5 file.
BEST_MODEL_NAME = './drive/MyDrive/Colab Notebooks/data/model/wine/best_model.h5' 
sentiment_model_best = tf.keras.models.load_model(BEST_MODEL_NAME,
                        custom_objects={'TFBertForSequenceClassification': TFBertForSequenceClassification})

# Label encoding: neutral = 0, positive = 1, negative = 2.
# Predict class scores for every encoded title, then take the highest-scoring
# class per row as its label.
predicted_value = sentiment_model_best.predict(x_data_converted)
predicted_label = np.argmax(predicted_value, axis = 1)

# Load the Samsung price table, parse its dates strictly, and add an empty
# 'label' column that will receive the predicted news sentiment.
samsung = pd.read_csv('./drive/MyDrive/Colab Notebooks/data/samsung.csv')
samsung['date'] = pd.to_datetime(samsung['date'], format='%Y-%m-%d %H:%M:%S', errors='raise')
samsung['label'] = np.nan

# Positions of the news rows whose stock code is 'a005930'. They index into
# predicted_label, which has one entry per news row in the same order.
r_num = np.flatnonzero((news['code'] == 'a005930').to_numpy()).tolist()
count = len(r_num)

# Manual labeling: s_num[i] is the hand-picked row of `samsung` that receives
# the sentiment predicted for the i-th matching news row.
# The option below is kept only so later cells behave as before; the
# assignment here no longer depends on it.
pd.set_option('mode.chained_assignment',  None)
s_num = [198, 240, 247, 268, 268, 299, 310, 548, 562, 607, 772, 772, 772, 795, 855,
         874, 1052, 1150, 1249, 1265, 1276, 1276, 1458, 1500, 1514, 1523, 1556, 1598,
         1610, 1626, 1701, 1709, 1814, 1822, 1878, 2195, 2200, 2200, 2202, 2214, 2344,
         2354, 2466, 2540, 2564, 2640, 2676, 2689, 2704, 2707, 2711, 2788, 2788, 2816,
         2928, 2970, 3085, 3096, 3250, 3250, 3292, 3432, 3446, 3558, 3572, 3600, 3635,
         3642, 3684, 3698, 3768, 3782, 3850, 3853, 3866, 3866, 3866, 3894, 3922, 3922,
         3922, 3950, 4272, 4272, 4356, 4538, 4599, 4645, 4668, 4776, 4790, 4790, 4809,
         4860, 5084, 5280, 5308, 5336, 5378, 5427, 5578, 5588, 5608, 5616, 5686, 5686,
         5686, 5693, 5700, 5714, 5719, 5763, 5881, 5882, 5885, 5902, 5924, 5952, 5980,
         5980, 5987, 6008, 6012, 6050, 6064, 6064, 6078, 6084, 6106, 6232, 6277, 6285,
         6288, 6302, 6330, 6393, 6438, 6442, 6448, 6526]
# NOTE(review): only the first 16 pairs are applied although s_num holds far
# more entries -- confirm this limit is intended. Repeated row numbers in
# s_num mean a later prediction overwrites an earlier one for that row.
for i in range(16):
  # Single .loc write instead of chained `samsung['label'][...] = ...`, which
  # may assign into a temporary copy and never reach the DataFrame.
  samsung.loc[s_num[i], 'label'] = predicted_label[r_num[i]]

# Seed for the train/test split so repeated runs give the same partition.
SPLIT_SEED = 42

# One-hot encode any categorical columns, then fill missing values with the
# column mean. numeric_only=True keeps the datetime 'date' column out of the
# mean, which pins the behaviour across pandas versions and avoids the
# warning the bare .mean() call emitted.
# NOTE(review): this also fills the unlabeled rows of the class-valued
# 'label' column with the mean of the few assigned labels -- confirm that is
# the intended imputation.
samsung = pd.get_dummies(samsung)
samsung = samsung.fillna(samsung.mean(numeric_only=True))

# Correlation of every numeric/boolean column with the others, ranked by
# correlation with 'close'. Non-numeric columns (e.g. 'date') are excluded
# explicitly because newer pandas no longer drops them silently.
df1_corr = samsung.select_dtypes(include=['number', 'bool']).corr()
df1_corr_sort = df1_corr.sort_values('close', ascending = False)
df1_corr_sort['close'].head(8)

# Features: the indicators most correlated with 'close', plus the sentiment
# label. Target: the closing price.
cols_train = ['ma14', 'bol_upper', 'bol_down', 'ma70', 'ma140', 'label']
X_train_pre = samsung[cols_train]
y = samsung['close'].values
# NOTE(review): train_test_split shuffles rows by default; for time-ordered
# price data this mixes future rows into the training set -- confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X_train_pre, y, test_size = 0.1,
                                                    random_state = SPLIT_SEED)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 9.0 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.19.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 19.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 57.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 11.3 MB/s

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

100%|██████████| 1725/1725 [00:00<00:00, 4443.73it/s]




  samsung = samsung.fillna(samsung.mean())


close        1.000000
ma14         0.998866
bol_upper    0.997706
bol_down     0.996209
ma70         0.993647
ma140        0.986819
nasdaq100    0.797879
rsi196       0.142332
Name: close, dtype: float64