<a href="https://colab.research.google.com/github/banshee0716/Financial-Big-Data-Analysis/blob/master/%E9%87%91%E8%9E%8D%E6%95%B8%E6%93%9A%E5%88%86%E6%9E%90W10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [3]:
# 引入必要的套件
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
import torch
import numpy as np
from typing import Dict, List

class FinancialSentimentAnalyzer:
    def __init__(self, model_name: str = "bert-base-uncased", num_labels: int = 3):
        """
        初始化金融情緒分析器

        Args:
            model_name: 使用的預訓練模型名稱
            num_labels: 分類標籤數量
        """
        # 設置設備 (GPU/CPU)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # 初始化分詞器和模型
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        ).to(self.device)

        # 標籤映射
        self.label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

    def load_data(self):
        """載入金融情緒數據集"""
        # 載入 Financial PhraseBank 數據集
        dataset = load_dataset("takala/financial_phrasebank", 'sentences_allagree')

        # 資料預處理和分詞
        def tokenize_function(examples):
            return self.tokenizer(
                examples["sentence"],
                padding="max_length",
                truncation=True,
                max_length=128
            )

        # 對數據集進行分詞處理
        encoded_dataset = dataset.map(tokenize_function, batched=True)

        # 分割訓練集和測試集
        full_dataset = encoded_dataset['train']
        train_indices, test_indices = train_test_split(
            list(range(len(full_dataset))),
            test_size=0.2,
            random_state=42
        )

        self.train_dataset = full_dataset.select(train_indices)
        self.test_dataset = full_dataset.select(test_indices)

    def train(self, num_epochs: int = 3, batch_size: int = 16):
        """
        訓練模型

        Args:
            num_epochs: 訓練輪數
            batch_size: 批次大小
        """
        # 定義訓練參數
        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir="./logs",
            logging_steps=10,
            evaluation_strategy="epoch",
            report_to="none"
        )

        # 初始化數據整理器
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        # 初始化訓練器
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.test_dataset,
            data_collator=data_collator
        )

        # 開始訓練
        self.trainer.train()

    def evaluate(self) -> Dict:
        """評估模型性能"""
        return self.trainer.evaluate()

    def predict(self, texts: List[str]) -> List[str]:
        """
        對輸入文本進行預測

        Args:
            texts: 要預測的文本列表

        Returns:
            預測的情緒標籤列表
        """
        # 對輸入文本進行編碼
        encodings = self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            return_tensors="pt"
        ).to(self.device)

        # 進行預測
        outputs = self.model(**encodings)

        # 獲取預測結果
        predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()

        # 將數字標籤轉換為文字標籤
        return [self.label_map[pred] for pred in predictions]

def main():
    # 創建分析器實例
    analyzer = FinancialSentimentAnalyzer()

    # 載入數據
    print("Loading data...")
    analyzer.load_data()

    # 訓練模型
    print("Training model...")
    analyzer.train()

    # 評估模型
    print("Evaluating model...")
    results = analyzer.evaluate()
    print("Evaluation results:", results)

    # 測試預測
    test_texts = [
        "The company's profit has increased significantly this quarter.",
        "The increase in costs negatively affected the revenue.",
        "The company's performance remained stable."
    ]

    predictions = analyzer.predict(test_texts)
    print("\nPredictions for test texts:")
    for text, prediction in zip(test_texts, predictions):
        print(f"Text: {text}")
        print(f"Sentiment: {prediction}\n")

if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading data...


README.md:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

financial_phrasebank.py:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

FinancialPhraseBank-v1.0.zip:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

Map:   0%|          | 0/2264 [00:00<?, ? examples/s]

Training model...




Epoch,Training Loss,Validation Loss
1,0.5868,0.441727


Epoch,Training Loss,Validation Loss
1,0.5868,0.441727
2,0.269,0.115365
3,0.1075,0.117738


Evaluating model...


Evaluation results: {'eval_loss': 0.11773820221424103, 'eval_runtime': 188.3787, 'eval_samples_per_second': 2.405, 'eval_steps_per_second': 0.154, 'epoch': 3.0}

Predictions for test texts:
Text: The company's profit has increased significantly this quarter.
Sentiment: positive

Text: The increase in costs negatively affected the revenue.
Sentiment: negative

Text: The company's performance remained stable.
Sentiment: positive

