## Load the model

In [18]:
from transformers import BertModel, BertTokenizer
import os
import torch
import torch.nn as nn
from datasets import *
from torch.utils.data import DataLoader
from transformers import BertTokenizer, AdamW

In [2]:
# Define the model name and cache directory
model_name = "bert-base-uncased"
cache_dir = r"D:\VS Code Projects\bert training\hugging face\cache"
# Load the model and tokenizer
model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

## Load data

In [3]:
save_path = r"D:\VS Code Projects\bert training\datasets"
raw_dataset = load_from_disk(save_path)

Loading dataset from disk:   0%|          | 0/43 [00:00<?, ?it/s]

In [4]:
train_dataset = raw_dataset['train']
test_dataset = raw_dataset['test']

In [5]:
print(f"size of train_dataset: {len(train_dataset)}")
print(f"size of test_dataset: {len(test_dataset)}")

size of train_dataset: 67316227
size of test_dataset: 2965174


we can see that the datasets above are too large, so we need to sample a small part of them

In [6]:
from torch.utils.data import random_split

# define the sample ratio
sample_ratio = 0.00001

# calculate the sample size
train_size = int(len(train_dataset) * sample_ratio)
test_size = int(len(test_dataset) * sample_ratio)

# randomly sample the dataset
train_dataset, _ = random_split(train_dataset, [train_size, len(train_dataset) - train_size])
test_dataset, _ = random_split(test_dataset, [test_size, len(test_dataset) - test_size])

print(f"size of sampled train_dataset: {len(train_dataset)}")
print(f"size of sampled test_dataset: {len(test_dataset)}")

size of sampled train_dataset: 673
size of sampled test_dataset: 29


In [7]:
for i in range(10):
    print(train_dataset[i]['labels']['5d'])
    # test
for i in range(10):
    print(train_dataset[i]['sentence'])
    # test

0
1
1
0
1
0
0
1
1
1
The Company and the Bank must comply with regulatory capital requirements established by the Federal Reserve Board and FDIC.
In 1989, Congress enacted the Natural Gas Wellhead Decontrol Act, as amended (the “Decontrol Act”).
Deferred tax assets and liabilities are recognized for the estimated future tax consequences attributable to differences between the financial statement carrying amounts of existing assets and liabilities and their respective tax bases.
Prior to his appointment as Group President, Innerwear Americas, Mr. Upchurch served as our Executive Vice President and General Manager, Domestic Innerwear from January 2008 until December 2010 and as our Senior Vice President and General Manager, Intimate Apparel from July 2006 until December 2007.
Remediation Plan for Material Weakness in Internal Control Over Financial Reporting Management is taking steps to remediate this material weakness, including revamping our risk assessment process to better respond to

In [8]:
# test cuda
if torch.cuda.is_available():
    print("CUDA is available. Device name:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")

CUDA is available. Device name: NVIDIA GeForce RTX 4060 Laptop GPU


In [10]:
# 定义训练设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
# 加载预训练模型
pretrained = BertModel.from_pretrained(r"D:\VS Code Projects\bert training\hugging face\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594", cache_dir=cache_dir).to(device)
print(pretrained)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

## Construct the network

In [12]:
# 定义下游model:进行二分类任务
# 冻结所有参数
for param in pretrained.parameters():
    param.requires_grad = False

# 只让最后两层可训练
for param in pretrained.encoder.layer[-2:].parameters():
    param.requires_grad = True

class Model(nn.Module):
    def __init__(self, pretrained):
        super().__init__()
        self.bert = pretrained  # 已经做了部分冻结处理
        self.fc1 = nn.Linear(768, 256)
        self.fc2 = nn.Linear(256, 2)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        # 这里不需要额外的 `with torch.no_grad()`，因为前几层已经被冻结
        out = self.bert(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids
        )
        # 取最后一层 [CLS] 向量
        out = out.last_hidden_state[:, 0]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

## Training stage

In [13]:
epochs = 1
# 定义训练轮数
lr = 1e-4
# 定义学习率

In [14]:
# 对数据进行编码处理 coding the datasets
def collate_fn(data):
    sentences = [i['sentence'] for i in data]
    labels = [i['labels']['5d'] for i in data]

    # 对文本进行编码
    data = tokenizer.batch_encode_plus(
        sentences, 
        padding='max_length', 
        truncation=True, 
        max_length=512, 
        return_tensors="pt",
        return_length=True
    )

    input_ids = data["input_ids"].to(device)
    attention_mask = data["attention_mask"].to(device)
    token_type_ids = data["token_type_ids"].to(device)
    labels =torch.LongTensor(labels).to(device)
    #将原始数字label转换为torch的类型

    return input_ids, attention_mask, token_type_ids, labels

In [15]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last= True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [20]:
# 指定参数结果保存路径
save_dir = r"D:\VS Code Projects\bert training\params"


Now, we can start training the model. We will use the AdamW optimizer and the CrossEntropyLoss loss function. The training process is as follows:

In [21]:
if __name__ == "__main__":
    model = Model(pretrained).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader):
            # grad to zero
            optimizer.zero_grad()

            output = model(input_ids, attention_mask, token_type_ids)

            loss = torch.nn.functional.cross_entropy(output, labels)

            loss.backward()
            # renew the parameters
            optimizer.step()

            if i % 10 == 0:
                out = output.argmax(dim=1)
                acc = (out == labels).sum().item() / len(labels)
                print(f"epoch: {epoch}, step: {i}, loss: {loss.item()}, accuracy: {acc}")

        # 在每个 epoch 结束时评估测试集精度
        model.eval()
        total_correct = 0
        total_samples = 0
        with torch.no_grad():
            for input_ids, attention_mask, token_type_ids, labels in test_loader:
                output = model(input_ids, attention_mask, token_type_ids)
                out = output.argmax(dim=1)
                total_correct += (out == labels).sum().item()
                total_samples += len(labels)

        test_accuracy = total_correct / total_samples
        print(f"epoch: {epoch}, test accuracy: {test_accuracy}")

    # Save the model state dictionary to a file
    torch.save(model.state_dict(), os.path.join(save_dir, f"{epoch}bert.pt"))
    print(epoch, "model has been saved successfully")



epoch: 0, step: 0, loss: 0.693869411945343, accuracy: 0.59375
epoch: 0, step: 10, loss: 0.6794849038124084, accuracy: 0.65625
epoch: 0, step: 20, loss: 0.6878824830055237, accuracy: 0.5625
epoch: 0, test accuracy: 0.6206896551724138
0 model has been saved successfully


In [23]:
model.load_state_dict(torch.load(r"D:\VS Code Projects\bert training\params\0bert.pt", map_location=device))
model.eval()

Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [24]:
def preprocess_text(text):
    encoded_input = tokenizer(text, 
                              padding="max_length", 
                              truncation=True, 
                              max_length=128,  # 依据你的训练设置调整
                              return_tensors="pt")  # 以 PyTorch 格式返回
    return encoded_input["input_ids"].to(device), encoded_input["attention_mask"].to(device)

def predict_sentiment(text):
    input_ids, attention_mask = preprocess_text(text)

    with torch.no_grad():
        output = model(input_ids, attention_mask)

        if output.shape[-1] == 1:
            # **如果使用的是 BCEWithLogitsLoss()，需要 sigmoid 处理**
            prob = torch.sigmoid(output).item()
            predicted_class = 1 if prob > 0.5 else 0  # 二分类阈值设为 0.5
        else:
            # **如果使用的是 CrossEntropyLoss()，直接取 argmax**
            predicted_class = output.argmax(dim=1).item()

    return predicted_class

In [25]:

# **6. 测试几个句子**
examples = [
    "I love this movie! It was amazing.",  # 积极
    "This product is terrible. I hate it.",  # 消极
    "Not bad, but could be better.",  # 可能是积极或消极，取决于训练数据
]

label_map = {0: "Negative", 1: "Positive"}


In [50]:
positive_count = 0

for text in examples:
    result = predict_sentiment(text)
    sentiment = label_map[result]
    print(f"Text: {text}\nSentiment: {sentiment}\n")
    
    # 统计正向句子的频次
    if sentiment == "Positive":
        positive_count += 1

# 输出正向句子的频次
print(f"Number of positive sentences: {positive_count}")

Text: I love this movie! It was amazing.
Sentiment: Positive

Text: This product is terrible. I hate it.
Sentiment: Positive

Text: Not bad, but could be better.
Sentiment: Positive

Number of positive sentences: 3


In [33]:
import json
report_company = 'Fluence Energy'
report_ticker = 'FLNC'
report_years = ['2022','2023','2024']
report = {}
DATA_FOLDER = 'C:/Users/Kevin/OneDrive/Desktop/MQF学习资料/files of lessons/winter quarter/MGTF 423/final projects/data/'
for year in report_years:
    with open(DATA_FOLDER + report_company + year + '.json') as inputfile:
        report[year] = json.load(inputfile)
        print("report has been loaded from the json file")


report has been loaded from the json file
report has been loaded from the json file
report has been loaded from the json file


In [34]:
report['2022']

{'company': 'Fluence Energy',
 'year': '2022',
 'ticker': 'FLNC',

In [39]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm", disable=['ner'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [44]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

# PDF text extraction
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Others
import requests
import io
import string
import re
from pprint import pprint
from tqdm.notebook import tqdm
import io
import json
import numpy as np

# Import td-idf and count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [48]:
def remove_non_ascii(text):
    printable = set(string.printable)
    return ''.join(filter(lambda x: x in printable, text))
# 保留可打印字符

def not_header(line):
    # as we're consolidating broken lines into paragraphs, we want to make sure not to include headers
    return not line.isupper()
# 不包括大写字母

def extract_sentences(nlp, text):
    """
    Extracting ESG statements from raw text by removing junk, URLs, etc.
    We group consecutive lines into paragraphs and use spacy to parse sentences.
    """
    MIN_WORDS_PER_PAGE = 500
 # 每页最少500个单词（将封面或者大面积图像的页面剔除）   
    pages = text.split('##PAGE_BREAK##')
#     print('Number of Pages: {}'.format(len(pages)))

    lines = []
    for page in pages:
        
        # remove non ASCII characters
        text = remove_non_ascii(page)
        
        if len(text.split(' ')) < MIN_WORDS_PER_PAGE:
#             print('Skipped Page: {}'.format(len(text.split(' '))))
            continue
        
        prev = ""
        # aggregate consecutive lines where text may be broken down
        for line in text.split('\n\n'):
            # aggregate consecutive lines where text may be broken down
            # only if next line starts with a space or previous does not end with dot.
            if(line.startswith(' ') or not prev.endswith('.')):
                prev = prev + ' ' + line
            else:
                # new paragraph
                lines.append(prev)
                prev = line

        # don't forget left-over paragraph
        lines.append(prev)
        lines.append('##SAME_PAGE##')
        
    lines = '  '.join(lines).split('##SAME_PAGE##')
    
    # clean paragraphs from extra space, unwanted characters, urls, etc.
    # best effort clean up, consider a more versatile cleaner
    
    sentences = []
    pages_content = []

    for line in lines[:-1]:
        # removing header number
        line = re.sub(r'^\s?\d+(.*)$', r'\1', line)
        # removing trailing spaces
        line = line.strip()
        # words may be split between lines, ensure we link them back together
        line = re.sub(r'\s?-\s?', '-', line)
        # remove space prior to punctuation
        line = re.sub(r'\s?([,:;\.])', r'\1', line)
        # ESG contains a lot of figures that are not relevant to grammatical structure
        line = re.sub(r'\d{5,}', r' ', line)
        # remove emails
        line = re.sub(r'\S*@\S*\s?', '', line)
        # remove mentions of URLs
        line = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', r' ', line)
        # remove multiple spaces
        line = re.sub(r'\s+', ' ', line)
        # join next line with space
        line = re.sub(r' \n', ' ', line)
        line = re.sub(r'.\n', '. ', line)
        line = re.sub(r'\x0c', ' ', line)
        
        pages_content.append(str(line).strip())

        # split paragraphs into well defined sentences using spacy
        for part in list(nlp(line).sents):
            sentences.append(str(part).strip())

#           sentences += nltk.sent_tokenize(line)
            
    # Only interested in full sentences and sentences with 10 to 100 words.
    sentences = [s for s in sentences if re.match('^[A-Z][^?!.]*[?.!]$', s) is not None]
    sentences = [s.replace('\n', ' ') for s in sentences]
    sentences = [s for s in sentences if (len(s.split(' ')) > 10) & (len(s.split(' ')) < 100)]

    return pages_content, sentences

In [46]:
for year in report_years:
    pages, sentences = extract_sentences(nlp, report[year]['content'])
    globals()[f'report_{year}_pages'] = pages
    globals()[f'report_{year}_sentences'] = sentences

In [47]:
report_2022_sentences[:100]

['No x Indicate by check mark if the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act.',
 'No x Indicate by check mark whether the registrant: (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such shorter period that the registrant was required to file such reports); and (2) has been subject to such filing requirements for the past 90 days.',
 'No o Indicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company, or an emerging growth company.',
 'See the definitions of large accelerated filer, accelerated filer, smaller reporting company, and emerging growth company in Rule 12b-2 of the Exchange Act.',
 'If an emerging growth company, indicate by check mark if the registrant has elected not to use the extended transition period for complying with any new or 

In [53]:
for year in report_years:
    positive_count = 0
    for text in globals()[f'report_{year}_sentences']:
        result = predict_sentiment(text)
        sentiment = label_map[result]
        
        # 统计正向句子的频次
        if sentiment == "Positive":
            positive_count += 1

    # 输出正向句子的频次
    print(f"Year: {year}, Number of positive sentences: {positive_count}")

Year: 2022, Number of positive sentences: 1452
Year: 2023, Number of positive sentences: 1782
Year: 2024, Number of positive sentences: 1748


In [54]:
for year in report_years:
    negative_count = 0
    for text in globals()[f'report_{year}_sentences']:
        result = predict_sentiment(text)
        sentiment = label_map[result]
        
        # 统计正向句子的频次
        if sentiment == "Negative":
            negative_count += 1

    # 输出正向句子的频次
    print(f"Year: {year}, Number of negative sentences: {negative_count}")

Year: 2022, Number of negative sentences: 251
Year: 2023, Number of negative sentences: 280
Year: 2024, Number of negative sentences: 289


In [55]:
for year in report_years:
    rate_positive = positive_count / len(globals()[f'report_{year}_sentences'])
    rate_negative = negative_count / len(globals()[f'report_{year}_sentences'])
    print(f"Year: {year}, Positive Rate: {rate_positive}, Negative Rate: {rate_negative}")

Year: 2022, Positive Rate: 1.0264239577216676, Negative Rate: 0.16970052847915443
Year: 2023, Positive Rate: 0.8477206595538312, Negative Rate: 0.14015518913676042
Year: 2024, Positive Rate: 0.8581246931762395, Negative Rate: 0.14187530682376043
