In [1]:
import PyPDF2
import os
import re
import pandas as pd

In [8]:
def extract_tables_and_pages(pdf_path):
    """Collect table captions and their page numbers from one PDF file.

    The text extracted from every page is scanned for captions shaped like
    ``Table 1 Title ...`` or ``TABLE A2. Title ...``: the word "table" in any
    letter case, an optional period, an optional single capital-letter
    prefix, a number, an optional period, whitespace, and then text that
    starts with a capital letter. A match runs to the end of that line.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to scan.

    Returns
    -------
    pandas.DataFrame
        One row per caption found, with the columns ``pdf_path``,
        ``Table_title`` and ``Page`` (1-based page number). The frame has
        no rows when no caption is found.
    """
    # The period after "table" is matched with a plain optional quantifier.
    # The possessive form (`?+`) is only understood by `re` from Python 3.11
    # onwards and raises "multiple repeat" on older interpreters; it matches
    # exactly the same strings here because nothing that follows can consume
    # a period.
    table_title_regex = re.compile(
        r'[Tt][Aa][Bb][Ll][Ee]+\.?\s*[A-Z]?\d+\.?\s+[A-Z]+.*'
    )

    data = []  # tuples of (pdf_path, title, page)
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page (e.g. image-only page).
                continue
            for title in table_title_regex.findall(text):
                data.append((pdf_path, title, page_number))

    df = pd.DataFrame(data, columns=['pdf_path', 'Table_title', 'Page'])
    # `\s` in the pattern may span a line break between the table number and
    # the caption text, so flatten any newline captured inside a title.
    df['Table_title'] = df['Table_title'].str.replace(r'\n', ' ', regex=True)
    return df

In [9]:
def extract_title_from_path(file_path):
    """Return the file name of ``file_path`` without directory or PDF suffix.

    Both forward slashes and backslashes are treated as directory
    separators, so Windows-style paths work regardless of the platform the
    notebook runs on. Only a trailing ``.pdf`` extension (in any letter
    case) is removed; a ``.pdf`` occurring elsewhere in the name is kept.

    Parameters
    ----------
    file_path : str
        Path to a file, e.g. ``'C:/docs/12.pdf'``.

    Returns
    -------
    str
        The bare file name, e.g. ``'12'``.
    """
    # Normalise separators first so the last path component is found for
    # either style of path.
    file_name = file_path.replace('\\', '/').rsplit('/', 1)[-1]
    # Strip the extension only when it really is the suffix.
    if file_name.lower().endswith('.pdf'):
        file_name = file_name[:-len('.pdf')]
    return file_name

In [10]:
def df_to_csv(df, file_path):
    """Save ``df`` as a CSV file at ``file_path``.

    The index is not written, and a backslash is used as the escape
    character for the CSV writer.
    """
    csv_options = {'index': False, 'escapechar': '\\'}
    df.to_csv(file_path, **csv_options)

In [11]:
# Collect the path of every PDF file in the input directory.
directory = 'C:/Users/89751/OneDrive/desktop/Document/'
files = os.listdir(directory)
# Keep only *.pdf entries so that other files or sub-folders in the directory
# are never handed to the PDF reader, and sort the result because the order
# returned by os.listdir is arbitrary.
pdf_files = sorted(
    os.path.join(directory, file)
    for file in files
    if file.lower().endswith('.pdf')
)
pdf_files

['C:/Users/89751/OneDrive/desktop/Document/1.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/10.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/11.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/12.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/13.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/14.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/15.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/16.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/17.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/18.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/19.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/2.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/20.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/21.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/22.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/23.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/24.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/25.pdf',
 'C:/Users/89751/OneDrive/desktop/Document/26.pdf',
 'C:/Users/897

In [12]:
# Extract the table captions of every PDF and stack the per-file results into
# a single frame with a fresh 0..n-1 index.
df = pd.concat(
    list(map(extract_tables_and_pages, pdf_files)),
    ignore_index=True,
)
df

Unnamed: 0,pdf_path,Table_title,Page
0,C:/Users/89751/OneDrive/desktop/Document/1.pdf,Table 1 The amount of CBM drainage and utiliza...,5
1,C:/Users/89751/OneDrive/desktop/Document/1.pdf,Table 3 Transport mode and fuel mix [29].,6
2,C:/Users/89751/OneDrive/desktop/Document/1.pdf,Table 2 Loss rate in coal washing and selectio...,6
3,C:/Users/89751/OneDrive/desktop/Document/1.pdf,Table 7 Emission and mitigation by building ma...,7
4,C:/Users/89751/OneDrive/desktop/Document/1.pdf,"Table 8 Results of CO 2,eof each process in th...",7
...,...,...,...
169,C:/Users/89751/OneDrive/desktop/Document/8.pdf,Table 6 LCA endpoint results of energy source ...,10
170,C:/Users/89751/OneDrive/desktop/Document/9.pdf,table 1. CO 2from biogas and ﬂue gases (cement...,3
171,C:/Users/89751/OneDrive/desktop/Document/9.pdf,Table 1 Potential CO 2sources in Germany,4
172,C:/Users/89751/OneDrive/desktop/Document/9.pdf,table 2. Biogas and cement production itself a...,4


In [13]:
# Replace each PDF path with its bare file name (the file name stands in for
# the article title) and rename the column accordingly.
# The guard keeps the cell safe to re-run: once the column has been renamed
# to 'title' there is no 'pdf_path' left, and an unguarded second run would
# raise KeyError.
if 'pdf_path' in df.columns:
    df = (
        df.assign(pdf_path=df['pdf_path'].apply(extract_title_from_path))
          .rename(columns={'pdf_path': 'title'})
    )
df

Unnamed: 0,title,Table_title,Page
0,1,Table 1 The amount of CBM drainage and utiliza...,5
1,1,Table 3 Transport mode and fuel mix [29].,6
2,1,Table 2 Loss rate in coal washing and selectio...,6
3,1,Table 7 Emission and mitigation by building ma...,7
4,1,"Table 8 Results of CO 2,eof each process in th...",7
...,...,...,...
169,8,Table 6 LCA endpoint results of energy source ...,10
170,9,table 1. CO 2from biogas and ﬂue gases (cement...,3
171,9,Table 1 Potential CO 2sources in Germany,4
172,9,table 2. Biogas and cement production itself a...,4


In [15]:
# Destination of the exported caption table.
file_path = "C:/Users/89751/OneDrive/Desktop/PDF_title.csv"
# Write the collected captions to disk.
df_to_csv(df=df, file_path=file_path)