In [None]:
import os
import re
import PyPDF2
import pandas as pd
import numpy as np
from pathlib import Path
from natsort import natsorted
from joblib import load
from openai import OpenAI

In [None]:
# OpenAI API key used when requesting embeddings.
# Prefer the OPENAI_API_KEY environment variable so the secret is never saved
# inside the notebook; the placeholder string is only a fallback for local use.
openai_api_key = os.environ.get("OPENAI_API_KEY", "Add Your OpenAI API KEY Here.")

In [None]:
def extract_tables_and_pages(pdf_path):
    """Collect table-caption candidates from every page of a PDF.

    Parameters
    ----------
    pdf_path : str or path-like
        Location of the PDF file to scan.

    Returns
    -------
    pandas.DataFrame
        One row per caption found, with the columns ``PDF_path`` (the
        ``pdf_path`` argument as given), ``Table_title`` (the matched text,
        line breaks replaced by spaces) and ``Page`` (1-based page number).
        The frame has no rows when nothing matches.
    """
    # Caption pattern: "Table" in any letter case, an optional dot, an optional
    # capital-letter prefix before the number (e.g. "S2"), an optional dot,
    # whitespace, then text that starts with a capital letter and runs to the
    # end of the line.
    # The first optional dot is a plain greedy `\.?`. The previous `\.?+` is a
    # possessive quantifier, which `re` only accepts from Python 3.11 on and
    # rejects with "multiple repeat" on older interpreters; both spellings
    # match the same strings in this pattern.
    table_title_regex = re.compile(
        r'[Tt][Aa][Bb][Ll][Ee]+\.?\s*[A-Z]?\d+\.?\s+[A-Z]+.*'
    )

    data = []  # rows of (pdf_path, title, page)
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page.
                continue
            for title in table_title_regex.findall(text):
                # `\s` may span a line break between the number and the
                # caption text, so flatten newlines to spaces.
                data.append((pdf_path, title.replace('\n', ' '), page_number))

    return pd.DataFrame(data, columns=['PDF_path', 'Table_title', 'Page'])

def extract_file_name_from_path(file_path):
    """Return the last component of ``file_path`` without its ``.pdf`` extension.

    Parameters
    ----------
    file_path : str or path-like
        Path to a file; both ``/`` and ``\\`` are treated as separators.

    Returns
    -------
    str
        The file name with a trailing ``.pdf`` (any letter case) removed.
        Names that do not end in ``.pdf`` are returned unchanged.
    """
    # Split on either separator so Windows-style paths work as well.
    file_name = re.split(r'[\\/]', os.fspath(file_path))[-1]

    # Remove only the trailing extension; a blanket str.replace would also
    # delete ".pdf" occurring in the middle of the name.
    extension = '.pdf'
    if file_name.lower().endswith(extension):
        return file_name[:-len(extension)]
    return file_name

def generate_embeddings(text, model="text-embedding-3-small", batch_size=2048):
    """Request OpenAI embeddings for one string or a collection of strings.

    Parameters
    ----------
    text : str or iterable of str
        The text(s) to embed. Any iterable (list, pandas Series, ...) is
        materialised into a plain list before being sent.
    model : str, optional
        Embedding model name passed to the API.
    batch_size : int, optional
        Maximum number of strings sent per request. The default of 2048 is
        the per-request array limit stated in the OpenAI API reference --
        NOTE(review): confirm against the current documentation.

    Returns
    -------
    list
        The ``data`` entries of the API response(s), in input order. An
        empty iterable yields an empty list without contacting the API.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    client = OpenAI(api_key=openai_api_key)

    # A single string is one input; forward it as-is.
    if isinstance(text, str):
        return client.embeddings.create(input=text, model=model).data

    inputs = list(text)

    # Only lists of strings are safe to split into batches; anything else
    # (e.g. token-id input) is forwarded in a single request as before.
    if not all(isinstance(item, str) for item in inputs):
        return client.embeddings.create(input=inputs, model=model).data

    embeddings = []
    for start in range(0, len(inputs), batch_size):
        response = client.embeddings.create(
            input=inputs[start:start + batch_size],
            model=model,
        )
        embeddings.extend(response.data)
    return embeddings

def load_pdf_paths(pdf_directory):
    """Return the POSIX-style paths of the ``.pdf`` files in a directory.

    Entries of ``pdf_directory`` whose name ends with ``.pdf`` are kept,
    ordered with ``natsorted`` and joined onto the directory path.
    """
    pdf_names = []
    for entry in os.listdir(pdf_directory):
        if entry.endswith(".pdf"):
            pdf_names.append(entry)

    return [Path(pdf_directory, name).as_posix() for name in natsorted(pdf_names)]

def df_to_csv(df, file_path):
    """Save ``df`` as a CSV at ``file_path``, omitting the index column.

    A backslash is passed as the escape character to ``DataFrame.to_csv``.
    """
    csv_options = {"index": False, "escapechar": "\\"}
    df.to_csv(file_path, **csv_options)

In [None]:
# Folder holding the source PDF files to process (replace the placeholder).
pdf_directory = "Add your PDF directory path here"

# Paths of the PDFs found in that folder; shown as the cell output.
pdf_files = load_pdf_paths(pdf_directory)
pdf_files

In [None]:
# Scan every PDF for table titles and stack the per-file results into a
# single frame with a fresh 0..n-1 index.
df1 = pd.concat(
    [extract_tables_and_pages(pdf) for pdf in pdf_files],
    ignore_index=True,
)
df1.head()

In [None]:
# Embed every table title and keep just the embedding vectors.
text_embedding_list = [
    item.embedding for item in generate_embeddings(df1['Table_title'])
]

# Load the pre-trained random forest model.
# NOTE: joblib.load unpickles the file, so only use a model file from a
# trusted source.
clf_loaded = load('random_forest_model.pkl')

# Classify each title from its embedding.
embeddings_array = np.array(text_embedding_list)
predictions = clf_loaded.predict(embeddings_array)

# Put the predictions next to the titles they belong to; the prediction
# column (named 0 by default) gets a descriptive label.
df2 = pd.DataFrame(predictions)
combined_df = (
    pd.concat([df1, df2], axis=1)
    .fillna(False)
    .rename(columns={0: 'is LCI inventory table?'})
)
combined_df.head()

In [None]:
# Keep only the rows predicted to be LCI inventory tables.
df_LCI = combined_df.loc[combined_df["is LCI inventory table?"].eq(True)]
df_LCI

In [None]:
# Destination CSV for the LCI table titles (replace the placeholder).
# The saved file will be used in Step 3.2.
file_name = "Add your output CSV path here"
df_to_csv(df_LCI, file_path=file_name)