In [11]:
import PyPDF2
import os
import re
import pandas as pd

In [12]:
def extract_tables_and_pages(pdf_path):
    """Scan a PDF for table captions and record the page each one appears on.

    Text is extracted page by page with ``PyPDF2.PdfReader`` and searched for
    captions such as ``Table 1 Key system ...`` or ``TABLE A2. RESULTS ...``:
    the word "table" in any letter case, an optional period, an optional
    single capital-letter prefix, a number, an optional period, whitespace,
    and then text that starts with a capital letter and runs to the end of
    that line.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to read.

    Returns
    -------
    pandas.DataFrame
        One row per caption found, with columns ``pdf_path``, ``Table_title``
        (line breaks replaced by spaces) and ``Page`` (zero-based index of the
        page within the document).
    """
    # Deliberately a plain `\.?` after "table" rather than the possessive
    # `\.?+`: possessive quantifiers only compile on Python 3.11+ and raise
    # re.error ("multiple repeat") on older interpreters. Both forms match the
    # same captions because a digit must follow the optional period.
    table_title_regex = re.compile(r'[Tt][Aa][Bb][Ll][Ee]+\.?\s*[A-Z]?\d+\.?\s+[A-Z]+.*')

    data = []  # tuples of (pdf_path, title, page index)
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_index, page in enumerate(reader.pages):
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page (e.g. image-only page).
                continue
            for title in table_title_regex.findall(text):
                # `\s+` may span a line break between the table number and
                # the caption text, so flatten newlines to keep one-line titles.
                data.append((pdf_path, title.replace('\n', ' '), page_index))

    return pd.DataFrame(data, columns=['pdf_path', 'Table_title', 'Page'])

In [13]:
def extract_title_from_path(file_path):
    """Return the file name of ``file_path`` without its ``.pdf`` extension.

    Both ``/`` and ``\\`` are treated as directory separators so that paths
    built on Windows (e.g. via ``os.path.join``) are handled as well as
    forward-slash paths. Only a trailing ``.pdf`` (any letter case) is
    removed; a ``.pdf`` occurring elsewhere in the name is kept.

    Parameters
    ----------
    file_path : str
        Path to a file, absolute or relative.

    Returns
    -------
    str
        The last path component with a trailing ``.pdf`` stripped, or the
        last path component unchanged when it has no such extension.
    """
    file_name = re.split(r'[\\/]', file_path)[-1]
    # Strip the extension only at the end of the name; a blanket
    # str.replace would also eat ".pdf" inside names like "a.pdf.notes.pdf".
    if file_name.lower().endswith('.pdf'):
        file_name = file_name[:-len('.pdf')]
    return file_name

In [14]:
# Collect the PDF files located directly inside a specific directory.
directory = 'C:/Users/89751/OneDrive/desktop/Text-embedding classification/Document/'
files = os.listdir(directory)
# Keep only regular files with a .pdf extension (any letter case) so that
# stray files or sub-folders are never handed to the PDF reader; sort the
# result so the processing order does not depend on the file system.
pdf_files = sorted(
    os.path.join(directory, file)
    for file in files
    if file.lower().endswith('.pdf') and os.path.isfile(os.path.join(directory, file))
)
pdf_files

['C:/Users/89751/OneDrive/desktop/Text-embedding classification/Document/test.pdf']

In [15]:
# Extract the table captions of every PDF and stack the per-file frames into
# a single frame with a fresh 0..n-1 index.
df = pd.concat(
    (extract_tables_and_pages(path) for path in pdf_files),
    ignore_index=True,
)
df

Unnamed: 0,pdf_path,Table_title,Page
0,C:/Users/89751/OneDrive/desktop/Text-embedding...,Table 1 Key system design parameters.,1
1,C:/Users/89751/OneDrive/desktop/Text-embedding...,Table 2 Proximate and ultimate analyses of sam...,2
2,C:/Users/89751/OneDrive/desktop/Text-embedding...,Table 3 Composition of the crude syngas (vol %).,3
3,C:/Users/89751/OneDrive/desktop/Text-embedding...,Table 4 Input and output data of each unit of ...,5


In [16]:
# Replace the file paths with the article titles derived from the file names.
# The guard makes the cell safe to re-run: once 'pdf_path' has been renamed to
# 'title', running it again leaves the frame untouched instead of raising
# KeyError.
if 'pdf_path' in df.columns:
    df = (
        df.assign(pdf_path=df['pdf_path'].apply(extract_title_from_path))
          .rename(columns={'pdf_path': 'title'})
    )
df

Unnamed: 0,title,Table_title,Page
0,test,Table 1 Key system design parameters.,1
1,test,Table 2 Proximate and ultimate analyses of sam...,2
2,test,Table 3 Composition of the crude syngas (vol %).,3
3,test,Table 4 Input and output data of each unit of ...,5
