使用 Hugging Face 框架建立 BERT 模型，針對金融領域文本進行情緒分析訓練。
可使用以下提供的資料集，或選擇其他適合的金融文本資料集。

In [15]:
import pandas as pd

def load_data(file_path):
    with open(file_path, 'r', encoding='latin1') as file:
        lines = file.readlines()
        data = [line.strip().split('@') for line in lines]
        df = pd.DataFrame(data, columns=['text', 'label'])
    return df

file_path = './data/FinancialPhraseBank/Sentences_AllAgree_short.txt'
df =load_data(file_path)

In [16]:
label_map = {'negative': 0, 'positive': 1}
df['label'] = df['label'].map(label_map)
# label = NaN drop
df = df.dropna(subset=['label'])
print(df['label'].value_counts())


label
1.0    31
0.0    18
Name: count, dtype: int64


In [17]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.head()


Unnamed: 0,text,label
13,Nordea Group 's operating profit increased in ...,1.0
5,Consolidated net sales increased 16 % to reach...,1.0
44,UPM-Kymmene has generated seventeen consecutiv...,1.0
9,MegaFon 's subscriber base increased 16.1 % in...,1.0
4,Clothing retail chain Sepp+ï¿½l+ï¿½ 's sales i...,1.0


In [18]:
# 加載 BERT 模型與分詞器
# !pip install tf-keras

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
# 初始化分詞器和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))

class FinancialDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx): 
        text = self.df.iloc[idx]['text']
        label = self.df.iloc[idx]['label']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        ) 

        return {
            'text': text,
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'label': torch.tensor(label, dtype=torch.long)
        }
    

train_dataset = FinancialDataset(train_df, tokenizer, max_len=512)
test_dataset = FinancialDataset(test_df, tokenizer, max_len=512)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# 訓練模型
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results-short',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    eval_strategy='steps',  
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


100%|██████████| 10/10 [01:07<00:00,  6.71s/it]

{'train_runtime': 67.0816, 'train_samples_per_second': 1.163, 'train_steps_per_second': 0.149, 'train_loss': 0.8630114555358886, 'epoch': 2.0}





TrainOutput(global_step=10, training_loss=0.8630114555358886, metrics={'train_runtime': 67.0816, 'train_samples_per_second': 1.163, 'train_steps_per_second': 0.149, 'total_flos': 20522662318080.0, 'train_loss': 0.8630114555358886, 'epoch': 2.0})

In [20]:
trainer.evaluate()

# 保存模型
model.save_pretrained('./my_finance_model-short')
tokenizer.save_pretrained('./my_finance_model-short')


100%|██████████| 1/1 [00:00<00:00,  4.99it/s]


('./my_finance_model-short/tokenizer_config.json',
 './my_finance_model-short/special_tokens_map.json',
 './my_finance_model-short/vocab.txt',
 './my_finance_model-short/added_tokens.json')

In [23]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer,device=0)

predictions = classifier(
  [
    "Investor confidence has been shattered by the scandal involving top executives, leading to a sharp decline in share value.",
    # 「高層主管卷入醜聞，打碎了投資者的信心，導致股價急劇下跌。」
    "The corporation faced a severe liquidity crisis, failing to meet its short-term financial obligations.",
    # 「該公司面臨嚴重的流動性危機，未能滿足其短期財務義務。」
    "Due to unexpected regulatory changes, the company's projected profits for the year have been drastically reduced.",
    # 「由於意外的監管變更，公司當年的預期利潤大幅下調。」
    "The firm has declared bankruptcy following a catastrophic loss in market share and revenue.",
    # 「在市場份額和收入慘重損失後，該公司已宣布破產。」
    "Net sales surged by 18.5 % to EUR167 .8 m. Teleste said that EUR20 .4 m , or 12.2 % , of the sales came from the acquisitions made in 2009 .",
    # 「銷售額激增18.5%至EUR167.8m。Teleste表示，銷售額的12.2%來自2009年收購的銷售額。」
    "Nordea Group 's operating profit increased in 2010 by 18 percent year-on-year to 3.64 billion euros and total revenue by 3 percent to 9.33 billion euros ."
    # 「諾德銀行集團2010年的營業利潤同比增長18%至36.4億歐元，總收入同比增長3%至93.3億歐元。」
    ]
  )

print(predictions)


[{'label': 'LABEL_0', 'score': 0.6306755542755127}, {'label': 'LABEL_0', 'score': 0.634833037853241}, {'label': 'LABEL_0', 'score': 0.5938164591789246}, {'label': 'LABEL_0', 'score': 0.6194185018539429}, {'label': 'LABEL_0', 'score': 0.6260707378387451}, {'label': 'LABEL_0', 'score': 0.6286526918411255}]
