In [1]:
import re
import pandas as pd
import fitz 
from transformers import BertTokenizer, BertForTokenClassification


In [2]:
# Input locations, relative to the notebook's working directory.
pdf_path = 'data/GSK-esg-performance-report-2023.pdf'
dictionary_path = 'dictionary.xlsx'

# Load the ESG metric dictionary. Read it through `dictionary_path` so the
# configured location is the single source of truth (the path literal used to
# be repeated here, leaving the variable above unused).
dictionary_df = pd.read_excel(dictionary_path)

In [5]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, extracted with pdfplumber.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        str: The page texts in page order, joined with newlines. A page whose
        ``extract_text()`` yields nothing contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None/"" for a page without a text layer;
        # `or ""` keeps such a page from raising TypeError on concatenation.
        # Joining with "\n" stops the last word of one page from fusing with
        # the first word of the next, which would hide metric names from the
        # downstream pattern matching.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)

# Extract the report text once; the metric-extraction cell further down reads
# this module-level `text`.
text = extract_text_from_pdf(pdf_path)


In [3]:
# def extract_text_from_pdf(pdf_path):
#     doc = fitz.open(pdf_path)  # 打开PDF文件
#     text = ""
#     for page_num in range(doc.page_count):
#         page = doc.load_page(page_num)  # 加载每一页
#         text += page.get_text()  # 提取每一页的文本
#     doc.close()  # 关闭PDF文件
#     return text

In [6]:
# Load the BERT tokenizer and the token-classification model from the same
# checkpoint. NOTE: the load warning recorded in this notebook reports that the
# classifier head weights are newly initialised, i.e. the head is untrained.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertForTokenClassification.from_pretrained(_BERT_CHECKPOINT)

def extract_metric_value_with_bert(text, metric):
    """Look up `metric` in `text` and return its value, unit and source snippet.

    The module-level BERT tokenizer is used to normalise both strings
    (``tokenize`` followed by ``convert_tokens_to_string``) before a
    containment check; the value/unit extraction itself is delegated to
    ``extract_value_from_context``.

    Differences from the previous implementation:
      * the whole text is inspected, not only the tokens that survived
        ``truncation=True``;
      * multi-word metrics can match (the old check compared the full metric
        name against one word-piece token at a time);
      * the model forward pass is gone, because its output was never read.

    Args:
        text: Text to search.
        metric: Metric name to look for.

    Returns:
        The dict produced by ``extract_value_from_context``, or None when
        `metric` is not a non-blank string, `text` is empty, the metric does
        not occur in the text, or no value could be extracted.
    """
    # Dictionary columns can carry NaN/blank entries; an empty needle would
    # otherwise "match" every text.
    if not isinstance(metric, str) or not metric.strip() or not text:
        return None

    def _normalise(raw):
        # Round-trip through the tokenizer so both sides share its
        # normalisation of case, whitespace and punctuation.
        return tokenizer.convert_tokens_to_string(tokenizer.tokenize(raw))

    needle = _normalise(metric)
    if not needle or needle not in _normalise(text):
        return None
    return extract_value_from_context(metric, text)

def extract_value_from_context(metric, text):
    """Find the first number that follows `metric` in `text`.

    The metric name is matched literally and case-insensitively; any run of
    whitespace in the text (including line breaks) may separate its words.

    Args:
        metric: Metric name to search for.
        text: Text to search in.

    Returns:
        dict with keys ``'value'`` (the number as written in the text),
        ``'unit'`` (the word right after the number, or None) and ``'source'``
        (the full matched snippet); None when `metric` is not a non-blank
        string, `text` is empty, or no number follows the metric.
    """
    # A NaN metric would otherwise be formatted into the pattern as "nan".
    if not isinstance(metric, str) or not metric.strip() or not text:
        return None

    # Escape every word so characters such as "(", "+" or "." in a metric name
    # are matched literally instead of being interpreted as regex syntax.
    metric_pattern = r'\s+'.join(re.escape(word) for word in metric.split())
    # The value must start and end with a digit; the previous `[\d,.]+` could
    # backtrack onto a lone "." or "," and report punctuation as the value.
    pattern = rf'{metric_pattern}[^0-9]*(\d(?:[\d,.]*\d)?)\s*(\w+)?'
    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return None
    return {'value': match.group(1), 'unit': match.group(2), 'source': match.group(0)}

def assess_confidence_with_bert(result):
    """Score an extraction result by how complete it is.

    Despite the name, no model is consulted: each of the ``'value'`` and
    ``'unit'`` entries of `result` that is truthy adds 0.5.

    Args:
        result: Mapping with ``'value'`` and ``'unit'`` keys.

    Returns:
        0 when neither entry is truthy, 0.5 for one, 1.0 for both.
    """
    return sum(0.5 for field in ('value', 'unit') if result[field])


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import tabula

# Extract every table from every page of the PDF.
tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)

# Preview the first rows of each extracted table.
for table in tables:
    print(table.head())

# Blank cells in the 'metric' column are read as NaN; drop them, otherwise the
# metric lookup ends up calling a string method on a float. Built once here
# instead of once per table.
metric_names = dictionary_df['metric'].dropna().unique()

# Search the plain-text rendering of each table for every dictionary metric.
for table in tables:
    text_table = table.to_string()
    for metric in metric_names:
        result = extract_metric_value_with_bert(text_table, metric)
        if result:
            confidence = assess_confidence_with_bert(result)
            print(f"表格中提取: 指标: {metric}, 数据: {result['value']}, 单位: {result['unit']}, 可信度: {confidence}, 来源: {result['source']}")


  Our ESG focus areas    Our six commitments  \
0              Access      Make our products   
1                 NaN    available at value-   
2                 NaN  based prices that are   
3                 NaN    sustainable for our   
4                 NaN           business and   

                           Our metrics for 2023  \
0  – Progress towards our 2030 goal of reaching   
1  1.3 billion people in lower income countries   
2                             with our products   
3                                           NaN   
4                                           NaN   

                        Our progress in 2023  
0      In 2023, we reached 89 million people  
1  with our vaccines and antiretrovirals and  
2     made 989 million doses of our products  
3        available in lower income countries  
4                                        NaN  
                  Unnamed: 0 2020 2021 2022 2023 Unnamed: 1 Unnamed: 2
0       Community investment  NaN  NaN  NaN  NaN   

In [11]:
# Walk through every metric of the dictionary and extract its data from the
# report text. NaN entries (blank dictionary cells) are dropped first because
# the lookup expects metric names to be strings.
output_data = []
for metric in dictionary_df['metric'].dropna().unique():
    result = extract_metric_value_with_bert(text, metric)
    if result:
        confidence = assess_confidence_with_bert(result)
        output_data.append({
            'Metric': metric,
            'Value': result['value'],
            'Unit': result['unit'],
            'Confidence': confidence,
            'Source': result['source']
        })

# Collect the results in a DataFrame and save them as CSV. The columns are
# listed explicitly so the header row is still written when nothing was
# extracted.
output_df = pd.DataFrame(
    output_data,
    columns=['Metric', 'Value', 'Unit', 'Confidence', 'Source'],
)
output_df.to_csv('extracted_esg_data.csv', index=False)


In [None]:
import fitz  # PyMuPDF 用于处理 PDF
import re
import pandas as pd
from transformers import BertTokenizer, BertForTokenClassification

# Load the BERT tokenizer and the token-classification model from the same
# checkpoint. NOTE: the load warning recorded in this notebook reports that the
# classifier head weights are newly initialised, i.e. the head is untrained.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertForTokenClassification.from_pretrained(_BERT_CHECKPOINT)

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF, read with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        str: The text of all pages, in page order, concatenated without any
        added separator.
    """
    # The context manager closes the document even when a page fails to
    # extract; the explicit doc.close() at the end was skipped in that case.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

def extract_metric_value_with_bert(text, metric):
    """Look up `metric` in `text` and return its value, unit and source snippet.

    The module-level BERT tokenizer is used to normalise both strings
    (``tokenize`` followed by ``convert_tokens_to_string``) before a
    containment check; the value/unit extraction itself is delegated to
    ``extract_value_from_context``.

    Differences from the previous implementation:
      * the whole text is inspected, not only the tokens that survived
        ``truncation=True``;
      * multi-word metrics can match (the old check compared the full metric
        name against one word-piece token at a time);
      * the model forward pass is gone, because its output was never read.

    Args:
        text: Text to search.
        metric: Metric name to look for.

    Returns:
        The dict produced by ``extract_value_from_context``, or None when
        `metric` is not a non-blank string, `text` is empty, the metric does
        not occur in the text, or no value could be extracted.
    """
    # Dictionary columns can carry NaN/blank entries; an empty needle would
    # otherwise "match" every text.
    if not isinstance(metric, str) or not metric.strip() or not text:
        return None

    def _normalise(raw):
        # Round-trip through the tokenizer so both sides share its
        # normalisation of case, whitespace and punctuation.
        return tokenizer.convert_tokens_to_string(tokenizer.tokenize(raw))

    needle = _normalise(metric)
    if not needle or needle not in _normalise(text):
        return None
    return extract_value_from_context(metric, text)

def extract_value_from_context(metric, text):
    """Find the first number that follows `metric` in `text`.

    The metric name is matched literally and case-insensitively; any run of
    whitespace in the text (including line breaks) may separate its words.

    Args:
        metric: Metric name to search for.
        text: Text to search in.

    Returns:
        dict with keys ``'value'`` (the number as written in the text),
        ``'unit'`` (the word right after the number, or None) and ``'source'``
        (the full matched snippet); None when `metric` is not a non-blank
        string, `text` is empty, or no number follows the metric.
    """
    # A NaN metric would otherwise be formatted into the pattern as "nan".
    if not isinstance(metric, str) or not metric.strip() or not text:
        return None

    # Escape every word so characters such as "(", "+" or "." in a metric name
    # are matched literally instead of being interpreted as regex syntax.
    metric_pattern = r'\s+'.join(re.escape(word) for word in metric.split())
    # The value must start and end with a digit; the previous `[\d,.]+` could
    # backtrack onto a lone "." or "," and report punctuation as the value.
    pattern = rf'{metric_pattern}[^0-9]*(\d(?:[\d,.]*\d)?)\s*(\w+)?'
    match = re.search(pattern, text, re.IGNORECASE)
    if match is None:
        return None
    return {'value': match.group(1), 'unit': match.group(2), 'source': match.group(0)}

def assess_confidence_with_bert(result):
    """Score an extraction result by how complete it is.

    Despite the name, no model is consulted: each of the ``'value'`` and
    ``'unit'`` entries of `result` that is truthy adds 0.5.

    Args:
        result: Mapping with ``'value'`` and ``'unit'`` keys.

    Returns:
        0 when neither entry is truthy, 0.5 for one, 1.0 for both.
    """
    return sum(0.5 for field in ('value', 'unit') if result[field])

# Extract the text of the PDF.
pdf_path = 'path_to_your_pdf_file.pdf'  # placeholder: replace with the path of your PDF file
text = extract_text_from_pdf(pdf_path)

# Load the ESG metric dictionary.
dictionary_df = pd.read_excel('dictionary.xlsx')

# Walk through every metric of the dictionary and extract its data from the
# text. NaN entries (blank dictionary cells) are dropped first because the
# lookup expects metric names to be strings.
output_data = []
for metric in dictionary_df['metric'].dropna().unique():
    result = extract_metric_value_with_bert(text, metric)
    if result:
        confidence = assess_confidence_with_bert(result)
        output_data.append({
            'Metric': metric,
            'Value': result['value'],
            'Unit': result['unit'],
            'Confidence': confidence,
            'Source': result['source']
        })

# Collect the results in a DataFrame and save them as an Excel workbook. The
# columns are listed explicitly so the header row is still written when
# nothing was extracted.
output_df = pd.DataFrame(
    output_data,
    columns=['Metric', 'Value', 'Unit', 'Confidence', 'Source'],
)
output_df.to_excel('extracted_esg_data.xlsx', index=False)
