In [1]:
#GPT revised

import pandas as pd

# 定义一个函数来安全地解析每一行
def safe_parse_line(line):
    parts = line.split('|')
    if len(parts) == 6:  # 检查行是否正确地分成了6部分
        return parts
    else:
        # 如果行的格式不正确，可以选择跳过或处理它
        return None

# 逐行读取文件并手动处理
rows = []
with open('tat_qa.csv', 'r', encoding='utf-8') as file:
    for line in file:
        parsed_line = safe_parse_line(line.strip())
        if parsed_line:
            rows.append(parsed_line)

# 将处理后的行转换为 DataFrame
df_split = pd.DataFrame(rows, columns=['doc_name', 'q_uid', 'question', 'answer', 'answer_scale', 'answer_type'])

# 保存为新的 CSV 文件
df_split.to_csv('tat_qa_split.csv', index=False)

# 打印结果以验证
print(df_split.head())

# 打印总行数以验证
print(f"Total rows: {df_split.shape[0]}")


                              doc_name                             q_uid  \
0                             doc_name                             q_uid   
1  overseas-shipholding-group-inc_2019  bbdcf6da614f34fdb63995661c81613f   
2  overseas-shipholding-group-inc_2019  0bf2a781ac6044d4d9dd94bd6cc1f790   
3  overseas-shipholding-group-inc_2019  0da3eed3b582066de46f7d62c8383aca   
4  overseas-shipholding-group-inc_2019  097768406a84ad50931b8170bfb69b2b   

                                            question  \
0                                           question   
1  What benefits are provided by the company to q...   
2  What is the change in Interest cost on benefit...   
3  What is the average Interest cost on benefit o...   
4  In which year was Benefit obligation at beginn...   

                                              answer  answer_scale  \
0                                             answer  answer_scale   
1  ['certain postretirement health care and life ...              

In [2]:
grouped_df = df_split.groupby('doc_name').size().reset_index(name='count')
grouped_df = grouped_df.sort_values(by='count', ascending=False)
import os
folder_path = './tat_docs'  # 請替換成你實際的資料夾路徑

def get_file_size(doc_name):
    file_path = os.path.join(folder_path, doc_name+".pdf")
    if os.path.exists(file_path):
        return os.path.getsize(file_path)
    return None


grouped_df

Unnamed: 0,doc_name,count
74,international-business-machines-corp_2019,264
32,centurylink-inc_2019,252
142,stmicroelectronics_2019,252
25,bce-inc_2019,234
165,vodafone-group-plc_2019,222
...,...,...
52,facebook_2019,6
69,ichor-holdings-ltd_2019,6
119,parkervision_2019,6
60,george-weston-limited_2019,6


In [3]:
# 新增檔案大小欄位
grouped_df['file_size'] = grouped_df['doc_name'].apply(get_file_size)

# 計算檔案大小除以 count 的比值，一個count要花多少大小才能夠查詢到，越小越好
grouped_df['size_per_count'] = grouped_df['file_size'] / grouped_df['count'] 

# # 刪除包含 None 的行 (也就是含有 'pdf' 的檔案名)
grouped_df = grouped_df.dropna(subset=['file_size'])

# 按 count 列進行降序排序
grouped_df = grouped_df.sort_values(by='size_per_count', ascending=True)


In [4]:
# 選取前10大的檔案
top_10 = grouped_df.head(10)

# 計算 count 總數加總
total_count_sum = top_10['count'].sum()

# 找出包含 'pdf' 的文件名列表
pdf_list = top_10['doc_name'].tolist()

# 輸出結果
print("前10大的檔案 count 總數加總:", total_count_sum)
print("PDF 文件列表:", pdf_list)


前10大的檔案 count 總數加總: 1416
PDF 文件列表: ['gaslog-ltd_2019', 'stmicroelectronics_2019', 'teradyne-inc_2019', 'vmware-inc_2019', 'leidos-holdings_2019', 'finjan-holding-inc_2019', 'cornerstone-ondemand-inc_2019', 'mitek-systems_2019', 'oracle-corporation_2019', 'plexus-corp_2019']


In [5]:
# !pip install -r requirements.txt
!export PYTHONPATH=./

In [6]:
import fitz  # PyMuPDF

# 打開PDF文件
pdf_document = fitz.open("/Users/miachen/Desktop/PANTHEON_Millions/tat_docs/vmware-inc_2019.pdf")

number_of_pages = pdf_document.page_count

print(f"PDF頁數: {number_of_pages}")

# 關閉PDF文件
pdf_document.close()


PDF頁數: 161


In [6]:
#修改paddle的原始碼，把tools前面增加paddle.
from run_parse import run_parse
import time
import json
results = {}
#開始時間:11:30 ，開始價格：未知 (first doc)
#start:1.22 已經花了0.35元 暫停，環境檔案位置有問題，已修正
#start:1.35

api_key = "sk-None-38wdILNqVMe1RLcurKHfT3BlbkFJALAog4oQSCSmWL3fe8ie"
# gaslog-ltd_2019有一些表格沒有取代掉（把原本的md轉成html），之後再重新處裡．
# 以處理成功'stmicroelectronics_2019', 'teradyne-inc_2019', 'vmware-inc_2019', 'leidos-holdings_2019', 'finjan-holding-inc_2019',
pdfs = [ 'cornerstone-ondemand-inc_2019', 'mitek-systems_2019', 'oracle-corporation_2019', 'plexus-corp_2019']
for i in pdfs:
    start_time = time.time()
    name = "tat_docs/" + i + ".pdf"
    print(name)
    path, image_dict, GPT_CALL_COUNT, NOTSAMELENGTH = run_parse(name, api_key, finance=True, page=False)
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    results[i] = {
        "path": path,
        "image_dict": image_dict,
        "start_time": start_time,
        "end_time": end_time,
        "time_elapsed": elapsed_time,
        "GPT_CALL_COUNT": GPT_CALL_COUNT,
        "NOTSAMELENGTH" : NOTSAMELENGTH,
    }
    
    print(results[i])

    # 生成文件名，包含当前时间戳
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f"{i}_{timestamp}.json"
    
    # 将当前的结果保存为JSON文件
    json_data = json.dumps(results[i], indent=4)
    with open(filename, "w") as json_file:
        json_file.write(json_data)


tat_docs/stmicroelectronics_2019.pdf
start processing:  tat_docs/stmicroelectronics_2019.pdf
stmicroelectronics_2019
parse page: 0
parse page: 1
parse page: 2
parse page: 3
parse page: 4
parse page: 5
parse page: 6
parse page: 7
parse page: 8
parse page: 9
parse page: 10
parse page: 11
parse page: 12
parse page: 13
parse page: 14
parse page: 15
parse page: 16
parse page: 17
parse page: 18
parse page: 19
parse page: 20
parse page: 21
parse page: 22
parse page: 23
parse page: 24
parse page: 25
parse page: 26
parse page: 27
parse page: 28
parse page: 29
parse page: 30
parse page: 31
parse page: 32
parse page: 33
parse page: 34
parse page: 35
parse page: 36
parse page: 37
parse page: 38
parse page: 39
parse page: 40
parse page: 41
parse page: 42
parse page: 43
parse page: 44
parse page: 45
parse page: 46
parse page: 47
parse page: 48
parse page: 49
parse page: 50
parse page: 51
parse page: 52
parse page: 53
parse page: 54
parse page: 55
parse page: 56
parse page: 57
parse page: 58
parse pa

KeyboardInterrupt: 

In [7]:
#修改paddle的原始碼，把tools前面增加paddle.
from run_parse import run_parse
import time
import json
results = {}
#開始時間:11:30 ，開始價格：未知 (first doc)
#start:1.22 已經花了0.35元 暫停，環境檔案位置有問題，已修正
#start:1.35

api_key = "sk-None-38wdILNqVMe1RLcurKHfT3BlbkFJALAog4oQSCSmWL3fe8ie"
# gaslog-ltd_2019有一些表格沒有取代掉（把原本的md轉成html），之後再重新處裡．
# 以處理成功'stmicroelectronics_2019', 'teradyne-inc_2019', 'vmware-inc_2019', 'leidos-holdings_2019', 'finjan-holding-inc_2019',
pdfs = [ 'cornerstone-ondemand-inc_2019', 'mitek-systems_2019', 'oracle-corporation_2019', 'plexus-corp_2019', "gaslog-ltd_2019"]
for i in pdfs:
    start_time = time.time()
    name = "tat_docs/" + i + ".pdf"
    print(name)
    path, image_dict, GPT_CALL_COUNT, NOTSAMELENGTH = run_parse(name, api_key, finance=True, page=False)
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    results[i] = {
        "path": path,
        "image_dict": image_dict,
        "start_time": start_time,
        "end_time": end_time,
        "time_elapsed": elapsed_time,
        "GPT_CALL_COUNT": GPT_CALL_COUNT,
        "NOTSAMELENGTH" : NOTSAMELENGTH,
    }
    
    print(results[i])

    # 生成文件名，包含当前时间戳
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f"{i}_{timestamp}.json"
    
    # 将当前的结果保存为JSON文件
    json_data = json.dumps(results[i], indent=4)
    with open(filename, "w") as json_file:
        json_file.write(json_data)


tat_docs/cornerstone-ondemand-inc_2019.pdf
start processing:  tat_docs/cornerstone-ondemand-inc_2019.pdf
cornerstone-ondemand-inc_2019
PDF頁數: 100
parse page: 0
parse page: 1
parse page: 2
parse page: 3
parse page: 4
parse page: 5
parse page: 6
parse page: 7
parse page: 8
parse page: 9
parse page: 10
parse page: 11
parse page: 12
parse page: 13
parse page: 14
parse page: 15
parse page: 16
parse page: 17
parse page: 18
parse page: 19
parse page: 20
parse page: 21
parse page: 22
parse page: 23
parse page: 24
parse page: 25
parse page: 26
parse page: 27
parse page: 28
parse page: 29
parse page: 30
parse page: 31
parse page: 32
parse page: 33
parse page: 34
parse page: 35
parse page: 36
parse page: 37
parse page: 38
parse page: 39
parse page: 40
parse page: 41
parse page: 42
parse page: 43
parse page: 44
parse page: 45
parse page: 46
parse page: 47
parse page: 48
parse page: 49
parse page: 50
parse page: 51
parse page: 52
parse page: 53
parse page: 54
parse page: 55
parse page: 56
parse pag

In [10]:
!pip install pymupdf 

Collecting pymupdf
  Using cached PyMuPDF-1.24.9-cp310-none-macosx_11_0_arm64.whl.metadata (3.4 kB)
Using cached PyMuPDF-1.24.9-cp310-none-macosx_11_0_arm64.whl (3.0 MB)
Installing collected packages: pymupdf
Successfully installed pymupdf-1.24.9
