In [8]:
import fitz  # PyMuPDF 用于提取PDF文本
import pandas as pd
import camelot  # 用于表格提取
import re
from transformers import pipeline

def extract_tables_from_pdf(pdf_path):
    """Run Camelot over every page of ``pdf_path`` and return its result.

    The ``stream`` flavor is requested. In Camelot, ``stream`` targets tables
    whose cells are separated by whitespace; ``lattice`` is the flavor meant
    for tables drawn with ruling lines.

    NOTE: a function with this same name is defined again further down in
    this cell, and that later definition is the one in effect after the cell
    runs.
    """
    return camelot.read_pdf(pdf_path, flavor='stream', pages='all')

# Load the BERT NER pipeline used for entity extraction
def load_bert_model():
    """Build the Hugging Face ``"ner"`` (token-classification) pipeline.

    ``aggregation_strategy="simple"`` asks the pipeline to merge word-piece
    predictions into whole entities and to report each one under the
    ``entity_group`` key. Without an aggregation strategy the pipeline emits
    one dict per token, keyed by ``entity`` with ``B-``/``I-`` prefixed labels.

    Returns:
        The callable pipeline object; call it with a string to get a list of
        entity dicts.
    """
    return pipeline(
        "ner",
        model="dbmdz/bert-large-cased-finetuned-conll03-english",
        aggregation_strategy="simple",
    )

# Collect the entities that the NER pipeline labels as MISC
def extract_with_bert(text, nlp):
    """Return ``(word, score)`` pairs for every entity labelled ``MISC``.

    Args:
        text: Text to run through the NER pipeline.
        nlp: Callable mapping a string to a list of entity dicts. Each dict
            must carry ``word`` and ``score``; the label is read from
            ``entity_group`` (aggregated output) or, failing that, from
            ``entity`` (per-token output such as ``B-MISC``).

    Returns:
        List of ``(word, score)`` tuples in the order the pipeline produced
        them. Empty when ``text`` is blank or nothing is labelled ``MISC``.
    """
    # Nothing to tag; avoid invoking the model on empty input.
    if not text or not text.strip():
        return []

    entities = []
    for entity in nlp(text):
        # Accept both output shapes so a missing 'entity_group' key does not
        # raise KeyError when the pipeline was built without aggregation.
        label = entity.get('entity_group') or entity.get('entity') or ''
        # Drop the IOB prefix ("B-MISC" -> "MISC") before comparing.
        if label.split('-', 1)[-1] == 'MISC':
            entities.append((entity['word'], entity['score']))
    return entities

# Extract the text of every page of a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated ``get_text()`` output of all pages.

    Args:
        pdf_path: Path handed to ``fitz.open``.

    Returns:
        One string holding each page's text, in page order.
    """
    # The context manager closes the document even if a page fails to parse;
    # previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-copying the growing
        # string on every page.
        return "".join(page.get_text() for page in doc)

# Extract tables from a PDF with Camelot
def extract_tables_from_pdf(pdf_path, pages='all', flavor='stream'):
    """Return the tables Camelot finds in ``pdf_path``.

    Args:
        pdf_path: Path handed to ``camelot.read_pdf``.
        pages: Page selection string forwarded to Camelot. Defaults to
            ``'all'``, the value that used to be hard-coded.
        flavor: Camelot parsing method. Defaults to ``'stream'`` (the value
            that used to be hard-coded), which targets tables whose cells are
            separated by whitespace. Pass ``'lattice'`` for tables drawn with
            ruling lines.

    Returns:
        Whatever ``camelot.read_pdf`` returns; callers iterate it and read
        ``.df`` on each item.
    """
    return camelot.read_pdf(pdf_path, pages=pages, flavor=flavor)

# Pull metric records out of extracted tables by keyword matching
def extract_data_from_tables(tables, dictionary):
    """Scan every table row for dictionary keywords and build result records.

    Args:
        tables: Iterable of objects exposing a ``.df`` DataFrame.
        dictionary: DataFrame with at least ``key`` and ``metric`` columns.

    Returns:
        List of dicts with ``Metric``, ``Data``, ``Unit`` and ``Confidence``
        keys, one per (row, keyword occurrence) whose keyword appears in the
        row's string form. A keyword listed several times in the dictionary
        yields several records, each carrying the metric of its first listing.
    """
    # Resolve the keyword -> metric lookup once instead of filtering the
    # dictionary for every match. Keys that are missing (NaN) or empty are
    # skipped: `nan in "text"` raises TypeError and `"" in "text"` is always
    # true.
    keywords = []
    metric_by_keyword = {}
    for keyword, metric in zip(dictionary['key'], dictionary['metric']):
        if not isinstance(keyword, str) or not keyword:
            continue
        keywords.append(keyword)
        metric_by_keyword.setdefault(keyword, metric)

    extracted_data = []
    for table in tables:
        for _, row in table.df.iterrows():
            # Render the row once; it does not change between keywords.
            row_text = row.to_string()
            for keyword in keywords:
                if keyword in row_text:  # plain, case-sensitive substring match
                    extracted_data.append({
                        'Metric': metric_by_keyword[keyword],
                        'Data': extract_number_from_row(row),
                        'Unit': extract_unit_from_row(row),
                        'Confidence': calculate_confidence(row, keyword)
                    })
    return extracted_data

# Extract the first numeric value found in a row's cells
def extract_number_from_row(row):
    """Return the first number in the row's cell values, as a string.

    Only the cell values are searched. ``row.to_string()`` also renders the
    index labels, so with integer labels the first "number" found was the
    label ``0`` rather than any data.

    Args:
        row: A pandas Series (one table row).

    Returns:
        The matched text (e.g. ``'1,234.5'``), or ``None`` if no cell
        contains a digit.
    """
    cell_text = ' '.join(str(cell) for cell in row.tolist())
    # Start and end on a digit; commas and dots are allowed in between so
    # thousands separators and decimals stay attached, while trailing
    # punctuation is left out.
    match = re.search(r'\d(?:[\d,.]*\d)?', cell_text)
    return match.group(0) if match else None

# Extract the first recognised unit found in a row's cells
def extract_unit_from_row(row):
    """Return the first of ``tons``, ``kg`` or ``%`` found in the row.

    ``tons`` and ``kg`` must not be surrounded by letters, so words that
    merely contain them ("cartons", "background") are not reported as units.
    A unit glued to a number ("12kg") still matches.

    Args:
        row: A pandas Series (one table row).

    Returns:
        The matched unit string, or ``None`` if none is present.
    """
    cell_text = ' '.join(str(cell) for cell in row.tolist())
    match = re.search(r'(?<![A-Za-z])(?:tons|kg)(?![A-Za-z])|%', cell_text)
    return match.group(0) if match else None

# Score how trustworthy an extracted row is
def calculate_confidence(row, keyword):
    """Return a heuristic confidence score for a matched table row.

    The score starts at 0.5, gains 0.3 when ``keyword`` occurs in the row's
    string form, and gains 0.2 when ``extract_number_from_row`` returns a
    truthy value for the row.
    """
    score = 0.5  # baseline
    keyword_present = keyword in row.to_string()
    score += 0.3 if keyword_present else 0.0
    number_present = bool(extract_number_from_row(row))
    score += 0.2 if number_present else 0.0
    return score




In [None]:
# Driver cell: wire the helpers defined above into one end-to-end run.
pdf_path = 'data/Abbott-2022.pdf'
dictionary_path = 'dictionary.xlsx'

# Load the keyword dictionary workbook.
dictionary = pd.read_excel(dictionary_path)

# Load the BERT NER pipeline.
nlp = load_bert_model()

# Extract the text from the PDF (table extraction is currently disabled).
text = extract_text_from_pdf(pdf_path)
# tables = extract_tables_from_pdf(pdf_path)

# Run the NER pipeline over the extracted text.
# NOTE(review): the whole document is passed in a single call; confirm the
# pipeline copes with input longer than the model's maximum sequence length.
bert_entities = extract_with_bert(text, nlp)
print("BERT 提取的实体：", bert_entities)

# Table-based extraction (disabled together with the table loading above).
# extracted_table_data = extract_data_from_tables(tables, dictionary)
# print("表格提取结果：", pd.DataFrame(extracted_table_data))

-------

In [4]:
import fitz  # PyMuPDF
import re
import pandas as pd

In [1]:
# Inputs for the cells below: the PDF to analyse and the keyword dictionary workbook.
# NOTE(review): the text preview saved under the extraction cell shows Abbott
# content while this path names a GSK report — the saved outputs look stale;
# confirm which file they came from.
pdf_path = 'data/GSK-esg-performance-report-2023.pdf'
dictionary_path = 'dictionary.xlsx'

In [5]:
# Read the text of every page into one string. The context manager closes the
# PDF even if a page fails to parse, and join() avoids re-copying the growing
# string on each page.
with fitz.open(pdf_path) as doc:
    text = "".join(page.get_text() for page in doc)

# Preview the first 500 characters of the extracted text.
print(text[:500])


GLOBAL 
SUSTAINABILITY 
REPORT 2022
Sustainability is about operating responsibly 
to deliver long-term impact. At Abbott, we’re 
strengthening the resilience of our company so we 
can continue to shape the future of healthcare — 
to help more people live better, healthier lives.
In this report, we detail our progress against the 
goals of our 2030 Sustainability Plan. The data 
presented reflects 2022 performance unless 
otherwise stated.
We have aligned our reporting with the 
requirements of 


In [11]:
# Load the keyword dictionary. keyword_match in this cell reads its
# 'key', 'categories', 'topic', 'metric' and 'GRI' columns.
dictionary_df = pd.read_excel(dictionary_path)

# Find which dictionary keywords occur in the text
def keyword_match(text, dictionary_df):
    """Return the dictionary rows whose ``key`` occurs in ``text``.

    Matching is case-insensitive and whole-term: the key may not be directly
    preceded or followed by a word character. Runs of whitespace inside a key
    match any whitespace in the text, so a multi-word key is still found when
    the text breaks the line between its words.

    Args:
        text: Text to search.
        dictionary_df: DataFrame with ``key``, ``categories``, ``topic``,
            ``metric`` and ``GRI`` columns.

    Returns:
        List of dicts with ``category``, ``topic``, ``metric``, ``key`` and
        ``GRI`` entries, one per matching dictionary row, in dictionary order.
        Rows whose key is missing, non-string or blank are skipped.
    """
    matched_metrics = []
    for _, row in dictionary_df.iterrows():
        key = row['key']
        # Empty cells come back as NaN (a float); re.escape would raise on them.
        if not isinstance(key, str):
            continue
        words = key.split()
        if not words:
            continue
        # Lookarounds replace \b, which cannot match next to a key that starts
        # or ends with punctuation such as "(%)".
        pattern = (
            r'(?<!\w)'
            + r'\s+'.join(re.escape(word) for word in words)
            + r'(?!\w)'
        )
        if re.search(pattern, text, re.IGNORECASE):
            matched_metrics.append({
                'category': row['categories'],
                'topic': row['topic'],
                'metric': row['metric'],
                'key': key,
                'GRI': row['GRI']
            })
    return matched_metrics

# Match the dictionary against the extracted text and preview the first hits.
matched_metrics = keyword_match(text, dictionary_df)
pd.DataFrame(matched_metrics).head()


Unnamed: 0,category,topic,metric,key,GRI
0,Environment,Climate Change,Carbon Emissions,Energy and Emissions,GRI 302-1:Energy consumption within the organi...
1,Environment,Climate Change,Carbon Emissions,Coal,GRI 302-1:Energy consumption within the organi...
2,Environment,Climate Change,Carbon Emissions,Energy intensity,GRI 302-3:Energy intensity
3,Environment,Climate Change,Carbon Emissions,Energy and Emissions,GRI 302-4:Reduction of energy consumption
4,Environment,Climate Change,Product Carbon Footprint,Waste Management,GRI 306-1:Waste generation and significant was...


In [None]:
# Regex-based extraction of the value that follows a keyword
def extract_data(text, metric_keyword):
    """Find ``metric_keyword`` in ``text`` and capture the next number and unit.

    The keyword is matched literally (case-insensitive). The value is the
    first run after the keyword that starts and ends with a digit and may
    contain ``,`` or ``.`` in between. The unit is an optional ``%`` or word
    directly after the value, possibly separated by whitespace.

    Args:
        text: Text to search.
        metric_keyword: Literal keyword to look for.

    Returns:
        ``{'metric', 'value', 'unit'}`` for the first match (``unit`` may be
        ``None``), or ``None`` when there is no match or the keyword is not a
        non-empty string.
    """
    if not isinstance(metric_keyword, str) or not metric_keyword:
        return None
    pattern = (
        # re.escape keeps characters such as "(", "+" or "%" in the keyword
        # from being read as regex syntax, which could raise re.error or add
        # capture groups and shift the group numbers used below.
        rf'({re.escape(metric_keyword)})'
        r'[^0-9]*'
        # Ending on a digit keeps sentence punctuation ("2022.") out of the value.
        r'(\d(?:[\d,.]*\d)?)'
        # "%" is not a word character, so it needs its own alternative.
        r'\s*(%|\w+)?'
    )
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return {'metric': match.group(1), 'value': match.group(2), 'unit': match.group(3)}
    return None

# Example run: for each matched dictionary entry, search the text for its
# keyword and print the captured metric text, value and unit.
for metric in matched_metrics:
    result = extract_data(text, metric['key'])
    if result:
        print(f"metric: {result['metric']}, value: {result['value']}, unit: {result['unit']}")


In [None]:
def assess_confidence(data):
    """Score an extraction result: 0.5 for a truthy value plus 0.5 for a truthy unit.

    Args:
        data: Mapping with ``'value'`` and ``'unit'`` entries.

    Returns:
        ``0`` when neither field is truthy, otherwise ``0.5`` or ``1.0``.
    """
    return sum((0.5 for field in ('value', 'unit') if data[field]), 0)

# Repeat the extraction for each matched entry and print it with its confidence score.
for metric in matched_metrics:
    result = extract_data(text, metric['key'])
    if result:
        confidence = assess_confidence(result)
        print(f"指标: {result['metric']}, 数据: {result['value']}, 单位: {result['unit']}, 可信度: {confidence}")


In [7]:
# Collect the extraction results as records, then build the DataFrame once.
output_data = []
for metric in matched_metrics:
    result = extract_data(text, metric['key'])
    if result:
        confidence = assess_confidence(result)
        output_data.append({
            'Metric': result['metric'],
            'Value': result['value'],
            'Unit': result['unit'],
            'Confidence': confidence
        })

# Write the results to an Excel workbook in the working directory.
output_df = pd.DataFrame(output_data)
output_df.to_excel('output_data.xlsx', index=False)


---

In [6]:
from transformers import BertTokenizer, BertForTokenClassification
import torch

# Load the pretrained BERT tokenizer and a token-classification model.
# NOTE(review): the saved output of this cell warns that the classifier
# weights are newly initialized, so the model's predictions are not
# meaningful until it is trained on a downstream task.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased')

# Tokenize the extracted text and run it through the model.
# truncation=True clips the input to the tokenizer's maximum length —
# TODO confirm how much of the document that covers.
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
outputs = model(**inputs)

# Map the input ids back to token strings and print them for inspection.
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print(tokens)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['[CLS]', 'global', 'sustainability', 'report', '202', '##2', 'sustainability', 'is', 'about', 'operating', 'res', '##pon', '##si', '##bly', 'to', 'deliver', 'long', '-', 'term', 'impact', '.', 'at', 'abbott', ',', 'we', '’', 're', 'strengthening', 'the', 'res', '##ili', '##ence', 'of', 'our', 'company', 'so', 'we', 'can', 'continue', 'to', 'shape', 'the', 'future', 'of', 'healthcare', '—', 'to', 'help', 'more', 'people', 'live', 'better', ',', 'health', '##ier', 'lives', '.', 'in', 'this', 'report', ',', 'we', 'detail', 'our', 'progress', 'against', 'the', 'goals', 'of', 'our', '203', '##0', 'sustainability', 'plan', '.', 'the', 'data', 'presented', 'reflects', '202', '##2', 'performance', 'unless', 'otherwise', 'stated', '.', 'we', 'have', 'aligned', 'our', 'reporting', 'with', 'the', 'requirements', 'of', 'leading', 'environmental', ',', 'social', ',', 'and', 'governance', '(', 'es', '##g', ')', 'ratings', 'and', 'sustainability', 'indices', '(', 'available', 'in', 'the', 'appendix'

In [9]:
def extract_metric_value_with_bert(text, metric):
    """Look for ``metric`` among the BERT tokens of ``text`` and extract its value.

    ``text`` is tokenized with the notebook-level ``tokenizer``, with
    truncation enabled. If any single token equals ``metric``
    (case-insensitive), the value and unit are extracted from the full
    ``text`` via ``extract_value_from_context``.

    The tokens are computed from the ``text`` argument on every call, not read
    from the notebook-level ``tokens`` variable, which describes whatever text
    was tokenized last in another cell. No model forward pass is run: the
    match is decided from the tokens alone.

    Limitation: the comparison is per token, so a metric that the tokenizer
    splits into several tokens never matches.

    Args:
        text: Text to search.
        metric: Metric name to look for.

    Returns:
        The dict returned by ``extract_value_from_context``, or ``None`` when
        the metric is not a non-empty string, is not among the tokens, or no
        value follows it.
    """
    # dictionary columns can contain NaN; .lower() would raise on a float.
    if not isinstance(metric, str) or not metric:
        return None

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    text_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    target = metric.lower()
    if any(token.lower() == target for token in text_tokens):
        return extract_value_from_context(metric, text)
    return None

def extract_value_from_context(metric, text):
    """Capture the first number (and optional unit) that follows ``metric``.

    The metric is matched literally (case-insensitive). The value is the
    first run after it that starts and ends with a digit and may contain
    ``,`` or ``.`` in between. The unit is an optional ``%`` or word directly
    after the value, possibly separated by whitespace.

    Args:
        metric: Literal term to look for.
        text: Text to search.

    Returns:
        ``{'value', 'unit', 'source'}`` for the first match, where ``source``
        is the full matched span and ``unit`` may be ``None``; ``None`` when
        nothing matches or ``metric`` is not a non-empty string.
    """
    if not isinstance(metric, str) or not metric:
        return None
    pattern = (
        # Escape the term so characters like "(", "+" or "%" are not read as
        # regex syntax, which could raise re.error or shift group numbers.
        rf'{re.escape(metric)}'
        r'[^0-9]*'
        # Ending on a digit keeps trailing punctuation out of the value.
        r'(\d(?:[\d,.]*\d)?)'
        # "%" is not a word character, so it needs its own alternative.
        r'\s*(%|\w+)?'
    )
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return {'value': match.group(1), 'unit': match.group(2), 'source': match.group(0)}
    return None


In [12]:
def assess_confidence_with_bert(result):
    """Score an extraction result by which fields are filled in.

    A truthy ``result['value']`` contributes 0.5 and a truthy
    ``result['unit']`` contributes another 0.5; the return value is ``0``
    when both are falsy.
    """
    value_points = 0.5 if result['value'] else 0
    unit_points = 0.5 if result['unit'] else 0
    return value_points + unit_points

# For every distinct metric name in the dictionary, try to extract a value and
# unit from the text and print the result with its confidence and source span.
for metric in dictionary_df['metric'].unique():
    result = extract_metric_value_with_bert(text, metric)
    if result:
        confidence = assess_confidence_with_bert(result)
        print(f"指标: {metric}, 数据: {result['value']}, 单位: {result['unit']}, 可信度: {confidence}, 来源: {result['source']}")


指标: Board, 数据: 80, 单位: Ethics, 可信度: 1.0, 来源: Board Oversight	
80
Ethics


In [14]:
import tabula

# Extract the tables from every page of the PDF with tabula.
tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)

# Show the first rows of each extracted table.
for table in tables:
    print(table.head())

# Render each table as text and run the same per-metric extraction on it.
for table in tables:
    text_table = table.to_string()
    for metric in dictionary_df['metric'].unique():
        result = extract_metric_value_with_bert(text_table, metric)
        if result:
            confidence = assess_confidence_with_bert(result)
            print(f"表格中提取: 指标: {metric}, 数据: {result['value']}, 单位: {result['unit']}, 可信度: {confidence}, 来源: {result['source']}")


  of healthcare. Our 2030 Sustainability Plan  Global Sustainability  \
0                                         NaN                   Team   
1        formalizes our commitment to improve         Leads strategy   
2    lives; embedding sustainability into our  implementation across   
3                                         NaN      global operations   
4    governance structure sets this Plan as a       and oversees ESG   

    Sustainability Goal         ESG Disclosures      Global Operations  \
0  Leads and Operations               Committee                Council   
1             Functions             Responsible      Oversees strategy   
2       Responsible for          for monitoring      execution for all   
3    managing execution  regulatory, legal, and      operations, using   
4   and enterprise-wide     financial reporting  internal assessments,   

         Global Citizenship  
0          Advisory Council  
1      External experts who  
2       provide guidance on  
3 

In [None]:
# Save the final results to Excel: one record per metric for which a value was
# extracted from the document text.
output_data = []
for metric in dictionary_df['metric'].unique():
    result = extract_metric_value_with_bert(text, metric)
    if result:
        confidence = assess_confidence_with_bert(result)
        output_data.append({
            'Metric': metric,
            'Value': result['value'],
            'Unit': result['unit'],
            'Confidence': confidence,
            'Source': result['source']
        })

# Build the DataFrame once from the records and write it to the working directory.
output_df = pd.DataFrame(output_data)
output_df.to_excel('extracted_esg_data.xlsx', index=False)
