使用 Hugging Face 框架建立 BERT 模型，針對金融領域文本進行情緒分析訓練。
可使用以下提供的資料集，或選擇其他適合的金融文本資料集。

In [82]:
import pandas as pd

def load_data(file_path):
    with open(file_path, 'r', encoding='latin1') as file:
        lines = file.readlines()
        data = [line.strip().split('@') for line in lines]
        df = pd.DataFrame(data, columns=['text', 'label'])
    return df

file_path = './data/FinancialPhraseBank/Sentences_50Agree.txt'
df =load_data(file_path)

In [83]:
label_map = {'negative': 0, 'positive': 1}
df['label'] = df['label'].map(label_map)
# label = NaN drop
df = df.dropna(subset=['label'])
#確認 label 佔比
print(df['label'].value_counts())

# label 分佈調整, label 1 X 0.1，label 0 X 0.2
df = df.groupby('label').apply(lambda x: x.sample(frac=0.1 if x.name == 1 else 0.2)).reset_index(drop=True)
print(df['label'].value_counts())


label
1.0    1363
0.0     604
Name: count, dtype: int64
label
1.0    136
0.0    121
Name: count, dtype: int64


  df = df.groupby('label').apply(lambda x: x.sample(frac=0.1 if x.name == 1 else 0.2)).reset_index(drop=True)


In [84]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.head()


Unnamed: 0,text,label
75,"Operating profit was EUR -0.1 mn , down from E...",0.0
178,"Finnish Cargotec 's Kalmar , the business area...",1.0
38,Coca-Cola was the market leader of manufacture...,0.0
136,Finnish food company Raisio Oyj HEL : RAIVV sa...,1.0
60,55 workers in +àm+Ñl will be affected by the c...,0.0


In [85]:
# 加載 BERT 模型與分詞器
# !pip install tf-keras

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
# 初始化分詞器和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))

class FinancialDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx): 
        text = self.df.iloc[idx]['text']
        label = self.df.iloc[idx]['label']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        ) 

        return {
            'text': text,
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'label': torch.tensor(label, dtype=torch.long)
        }
    

train_dataset = FinancialDataset(train_df, tokenizer, max_len=512)
test_dataset = FinancialDataset(test_df, tokenizer, max_len=512)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
# 訓練模型
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    eval_strategy='steps',  
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


  1%|          | 3/387 [01:28<3:08:36, 29.47s/it]
100%|██████████| 78/78 [28:34<00:00, 21.98s/it] 

{'train_runtime': 1714.7882, 'train_samples_per_second': 0.359, 'train_steps_per_second': 0.045, 'train_loss': 0.693968259371244, 'epoch': 3.0}





TrainOutput(global_step=78, training_loss=0.693968259371244, metrics={'train_runtime': 1714.7882, 'train_samples_per_second': 0.359, 'train_steps_per_second': 0.045, 'total_flos': 161813299046400.0, 'train_loss': 0.693968259371244, 'epoch': 3.0})

In [87]:
trainer.evaluate()

# 保存模型
model.save_pretrained('./my_finance_model')
tokenizer.save_pretrained('./my_finance_model')

100%|██████████| 4/4 [00:48<00:00, 12.11s/it]


('./my_finance_model/tokenizer_config.json',
 './my_finance_model/special_tokens_map.json',
 './my_finance_model/vocab.txt',
 './my_finance_model/added_tokens.json')

In [90]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, device=0)

input_texts =   [
    "The international electronic industry company Elcoteq has laid off tens of employees.",
    # 「國際電子行業公司Elcoteq裁掉了數十名員工。」
      "The company's revenue has seen a significant increase of 20% this quarter, reflecting strong market demand and effective management strategies .",
    #「該公司本季度收入顯著增長了20%，反映了強勁的市場需求和有效的管理策略。
    "The firm has declared bankruptcy following a catastrophic loss in market share and revenue.",
    # 「在市場份額和收入慘重損失後，該公司已宣布破產。」
  ]

predictions = classifier(input_texts)

print(predictions)


[{'label': 'LABEL_0', 'score': 0.5869709849357605}, {'label': 'LABEL_1', 'score': 0.5321142077445984}, {'label': 'LABEL_0', 'score': 0.5428314805030823}]


In [92]:
result_df = pd.DataFrame(predictions)

result_df['text'] = input_texts
result_df['label'] = result_df['label'].map({'LABEL_0': 'Negative', 'LABEL_1': 'Positive'})

result_df = result_df[['text', 'label', 'score']]

result_df


Unnamed: 0,text,label,score
0,The international electronic industry company ...,Negative,0.586971
1,The company's revenue has seen a significant i...,Positive,0.532114
2,The firm has declared bankruptcy following a c...,Negative,0.542831
