In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
data = pd.read_csv('finance_data.csv')

# Encode the string sentiment labels as integer class ids
label_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
data['labels'] = data['labels'].map(label_mapping)

# Series.map yields NaN for every value that is not a key of label_mapping.
# Fail here with a clear message instead of letting NaN targets reach
# training and evaluation.
if data['labels'].isna().any():
    raise ValueError(
        "Column 'labels' contains missing values or values other than "
        f"{sorted(label_mapping)}"
    )

# Prepare the model inputs (raw sentences) and targets (integer class ids)
sentences = data['kor_sentence'].tolist()
labels = data['labels'].tolist()

# Train/test split: hold out 20% of the samples, with a fixed seed so the
# split is repeatable
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Tokenization and sequence padding
# Single source of truth for the vocabulary cap: the tokenizer only emits
# word indices below this value, and the Embedding layer is sized to match.
vocab_size = 10000

# Fit the vocabulary on the training split only, so the held-out test
# sentences do not influence preprocessing (no train/test leakage).
# Words not kept in the vocabulary are mapped to the OOV token instead of
# being silently dropped from the sequence.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad every sequence to the length of the longest training sequence;
# the same max_len is applied to the test set.
max_len = max(len(seq) for seq in X_train_sequences)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len)

# Model definition
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    # return_sequences=True so the second GRU receives one vector per time step
    GRU(64, return_sequences=True),
    GRU(32),
    Dropout(0.5),
    Dense(3, activation='softmax')  # output layer for the 3 sentiment classes
])

# Targets are integer class ids, hence the sparse categorical loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model training (20% of the training data is used for validation)
model.fit(X_train_padded, np.array(y_train), epochs=5, batch_size=32, validation_split=0.2)

# Model evaluation
y_pred = model.predict(X_test_padded)
# Pick the class with the highest predicted probability for each sample
y_pred_classes = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
# Derive class names (ordered by integer id) from label_mapping so they cannot
# drift from the encoding, and pass `labels` explicitly so the report does not
# raise when a class is absent from both y_test and the predictions.
class_names = sorted(label_mapping, key=label_mapping.get)
print(classification_report(
    y_test,
    y_pred_classes,
    labels=[label_mapping[name] for name in class_names],
    target_names=class_names,
))

# Prediction helper
def predict_review(sentence, model, tokenizer, max_len):
    """Return the predicted class index for a single sentence.

    Args:
        sentence: Text to classify.
        model: Object whose ``predict`` method scores a padded batch.
        tokenizer: Object whose ``texts_to_sequences`` method encodes text.
        max_len: Length to which the encoded sentence is padded.

    Returns:
        The index of the highest-scoring class for the sentence, taken from
        ``np.argmax`` over the model output.
    """
    # Wrap the sentence in a list: the tokenizer and model work on batches.
    encoded = pad_sequences(
        tokenizer.texts_to_sequences([sentence]),
        maxlen=max_len,
    )
    class_scores = model.predict(encoded)
    # The batch holds one sentence, so take the first (only) argmax entry.
    return np.argmax(class_scores, axis=1)[0]

# Classify a sample sentence and print the name of the predicted sentiment
example_sentence = "오늘 주식시장은 긍정적인 반응을 보였습니다."
predicted_label = predict_review(
    sentence=example_sentence,
    model=model,
    tokenizer=tokenizer,
    max_len=max_len,
)
print(f"Predicted label: {['neutral', 'positive', 'negative'][predicted_label]}")
