In [5]:
import os
import shutil
import ast
import pandas as pd
import pymupdf4llm
import time
from langchain_openai import ChatOpenAI
from langchain.prompts.prompt import PromptTemplate
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI
from PyPDF2 import PdfReader
from langchain_core.prompts import ChatPromptTemplate

In [6]:
#set API Key from OpenAI
openai_api_key= os.environ.get("OPENAI_API_KEY_SBR")

In [7]:
def load_the_document(pdf_path):
    """load the document based on the given page number"""

    #load the document
    docs = pymupdf4llm.to_markdown(pdf_path)
    
    return docs

In [8]:
start_time = time.time()

pdf_path = 'C:/Users/89751/OneDrive/Desktop/2.pdf'
data = load_the_document(pdf_path)

# End the timer
end_time = time.time()

# Calculate the run time
run_time = end_time - start_time
print(f"Run time: {run_time} seconds")

Run time: 63.25249195098877 seconds


In [9]:
import time
from PyPDF2 import PdfReader

# Start the timer
start_time = time.time()

# Your code
reader = PdfReader(pdf_path)
raw_text = ''
for i, page in enumerate(reader.pages):
    text = page.extract_text()
    if text:
        raw_text += text

# End the timer
end_time = time.time()

# Calculate the run time
run_time = end_time - start_time
print(f"Run time: {run_time} seconds")

Run time: 28.135517120361328 seconds


In [3]:
def LCA_information_extraction(pdf_path, persist_directory):
    """Extract the system boundary, functional unit, target product, LCIA methods, and impact category from the given paper"""
    
    #read the document
    reader = PdfReader(pdf_path)
    raw_text = ''
    for i, page in enumerate(reader.pages):
        text = page.extract_text()
        if text:
            raw_text += text
    
    #split the text into smaller chunks
    text_splitter = CharacterTextSplitter(        
        separator = "\n\n",
        chunk_size = 2000,
        chunk_overlap  = 200,
        length_function = len,
    )
    texts = text_splitter.split_text(raw_text)
    
    #create a Chroma vector store, specifying the persistence directory
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    
    #create a Chroma storage from texts
    docstorage = Chroma.from_texts(texts, embeddings, persist_directory=persist_directory)
    
    #initial the LLM model
    llm = ChatOpenAI(
        model_name="gpt-4-1106-preview",
        temperature=0,
        max_tokens=4000,
        openai_api_key=openai_api_key
    )
    
    system_prompt = (
        """
        You are a helpful assistant specializing in extracting life cycle assessment (LCA) data. 
        Based on the provided context, use the definitions below to answer the question at the end. 
        If you are unsure of the answer, state "I don't know" rather than conjecture.
        """
    )
    
    context = """
        **System boundary**: is defined as the scope of the LCA, which specifies the life cycle stages and processes that are considered in LCA studies. Common system boundaries are cradle-to-gate, gate-to-gate, and cradle-to-grave:
            - Cradle-to-gate: from the raw material extraction (cradle) up to the factory gate, includes raw material extraction, transport, and manufacturing.
            - Cradle-to-grave: covers the entire product lifecycle from raw material extraction (cradle) to the final disposal (grave). 
            - Gate-to-gate: only includes the production stage.
        **Functional unit (FU)**: is a quantified description of the performance of the product. The purpose of the functional unit is to provide a reference to which the inputs and outputs can be related.
        **Life Cycle Impact Assessment (LCIA) method**: is defined as the method to quantify the environmental impacts of the product system. It analyzes the data from the inventory phase and translates it into meaningful impact categories. The common LCIA method includes IPCC, ReCiPe, TRACI, CML, EF and Impact 2002+.
        **Impact category**: is a set of categories that represent a different type of potential environmental impact of a product system. These impact categories are selected based on the chosen impact assessment method. The common impact categories include global warming potentials, ozone depletion, eutrophication, acidification, human toxicity, and photochemical ozone formation. 
        
        ### Instructions for Data Extraction: 
        - Identify and list all information with minimum detail to understand their relationships. 
        - Is really IMPORTANT to extract all data from the given article and ensure completeness and precision in your extraction. 
        - Avoid extra explanations and format your response as follows:
            - "System boundary": [] # List format 
            - "Functional unit": [] # List format
            - "Impact assessment method": [] # List format
            - "Impact category": [] # List format
        - If you cannot find information about the system boundary, functional unit, allocation method, or impact assessment method, label them as "Not mentioned."
    
        ###Examples:
        1. Context: <The “cradle to grave” is defined as the system boundary of all the six comparison scenarios, which covers the material and energy production chain and all processes from the raw material extraction through the production, transportation, and use phase up to the product's end of life treatment. In this study, the functional unit is defined as 1 kg and 1 MJ of H2 carrier produced from coal, natural gas, and renewables. The socalled CML 2001 method is applied to LCIA calculation. >
        Answer: 
        - [“System boundary”: “Cradle to grave”]
        - [“Functional unit”: “1 kg and 1 MJ of H2 carrier produced from coal, natural gas, and renewables”]
        - [“Impact assessment method”: “CML 2001”]
        2. Context: <As indicated in Fig. 1, the scope of the study is from ‘cradle to gate’, with two main stages considered: biomass supply (cultivation, collection and transportation to the processing plant); and production of bio-ethylene and its co-products. The functional unit is defined as the production of 1 tonne of ethylene. The SimaPro v.8.3. software (Pré Consultants B.V., 2017) has been used for the life cycle modelling and the impacts have been calculated following the CML 2 method (Guinée et al., 2001), using the April 2016 update.>
        Answer:
        - [“System boundary”: “Cradle to gate”]
        - [“Functional unit”: “1 tonne of ethylene”]
        - [“Impact assessment method”: “CML 2”]
        3. Context: <The LCA model has a cradle-to-gate scope, and the system boundary includes sugarcane farming, bagasse transportation, size reduction, pretreatment, enzymatic hydrolysis, fermentation, and downstream separation, as shown in Fig. 2. 1 kg of lactic acid is the functional unit. OpenLCA 1.9 is used to create a product system. ReCiPe 2016 methodology with the hierarchist perspective, commonly used in LCA literature (Hiloidhari et al., 2020), is implemented.>
        - [“System boundary”: “Cradle-to-gate”]
        - [“Functional unit”: “1 kg of lactic acid”]
        - [“Impact assessment method”: “ReCiPe 2016”]
    
    """
    
    # Create a chat prompt template
    prompt_template = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", context),
            ("user", "Response to the question: {question}")
        ]
    )
    
    chain_type_kwargs = {
        "prompt": prompt_template,
        "document_variable_name": "question"
    }
    
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm, 
        retriever=docstorage.as_retriever(), 
        chain_type_kwargs=chain_type_kwargs
    )
    
    question = "Extract the system boundary, functional unit, impact assessment method, and impact category for me."
    result = qa_chain({"query": question})
    
    return result

In [4]:
def convert_to_dataframe (raw_text):
    """Convert results into the structured dataframe"""
    
    llm = ChatOpenAI(
        model_name="gpt-4-1106-preview",
        temperature=0,
        max_tokens=4000,
        openai_api_key=openai_api_key
    )
    
    template = """
    You are a helpful assistant specializing in converting life cycle assessment (LCA) data format. Based on the provided context, use the definitions below to answer the question at the end.

    ### Convert the provided string into a structured format as follows: 
    Data_list = [{{"System boundary": "", "Functional unit": "", "Impact assessment method": "", "Impact category": "[]"}}

    
    ###Examples:
    1. Question: <'- "System boundary": ["Cradle-to-gate"]\n- "Functional unit": ["1 kg of ethylene"]\n- "Impact assessment method": ["CML-impact assessment baseline methodology"]\n- "Impact category": ["Global warming potential (GWP)", "Ozone depletion potential", "Photochemical oxidant creation potential", "Acidification potential", "Eutrophication potential"]'>
    Answer: 
    data_list = [
    {{"System boundary": "Cradle-to-gate", "Functional unit": "1 kg of ethylene", "Impact assessment method": "CML-impact assessment baseline methodology", "Impact category": ["Global warming potential (GWP)", "Ozone depletion potential", "Photochemical oxidant creation potential", "Acidification potential", "Eutrophication potential"]}}
    ]
    2. Question: <'- "System boundary": ["Cradle to grave"]\n- "Functional unit": ["1 kg and 1 MJ of H2 carrier produced from coal, natural gas, and renewables"]\n- "Impact assessment method": ["CML 2001"]\n- "Impact category": ["Global warming potential", "Acidification potential", "Ozone depletion potential", "Photochemical ozone creation potential", "Eutrophication potential", "Abiotic depletion potential"]'>
    Answer: 
    data_list = [
    {{"System boundary": "Cradle to grave", "Functional unit": "1 kg and 1 MJ of H2 carrier produced from coal, natural gas, and renewables", "Impact assessment method": "CML 2001", "Impact category": ["Global warming potential", "Acidification potential", "Ozone depletion potential", "Photochemical ozone creation potential", "Eutrophication potential", "Abiotic depletion potential"]}}
    ]
    3. Question: <'- "System boundary": ["Cradle to gate"]\n- "Functional unit": ["1 ton of ethylene glycol product"]\n- "Impact assessment method": ["Not mentioned"]\n- "Impact category": ["Global warming potential (GWP)", "Acidification potential (AP)", "Photochemical ozone creation potential (POCP)", "Nutrient enrichment (NE)", "Soot and ashes (SA) expressed in terms of PM10 emissions"]'>
    Answer: 
    data_list = [
    {{"System boundary": "Cradle to gate", "Functional unit": "1 ton of ethylene glycol product", "Impact assessment method": "Not mentioned", "Impact category": ["Global warming potential (GWP)", "Acidification potential (AP)", "Photochemical ozone creation potential (POCP)", "Nutrient enrichment (NE)", "Soot and ashes (SA) expressed in terms of PM10 emissions"]}}
    ]
    """
        
    prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", template),
        ("human", "Respond to question: {question}")
    ])

    # Insert a question into the template and call the model
    
    query = f"Convert the list {raw_text} into defined data format"
    full_prompt = prompt_template.format_messages(question=query)
    result = llm.invoke(full_prompt)

    data_string = result.content
    expression = data_string.split('=')[1].strip()
    data_list = ast.literal_eval(expression)

    return data_list

In [5]:
def read_pdf_files (directory):
    """Load all file paths from the directory"""
    
    files = os.listdir(directory)
    all_files = [os.path.join(directory, file) for file in files]
    
    return all_files

In [6]:
def process_pdf (all_file_path, base_persist_directory, number_of_papers):
    """Processing each paper for data extraction"""
    
    base_persist_directory = base_persist_directory

    results = []

    for i in range(number_of_papers):  #iterate over all_files
        pdf_path = all_file_path[i]
        persist_directory = os.path.join(base_persist_directory, str(i + 1))
    
        result = LCA_information_extraction(pdf_path, persist_directory)
        raw_text = result["result"]
    
        data = convert_to_dataframe(raw_text)
        
        # Add the PDF path to the first column
        for entry in data:
            entry['PDF Path'] = pdf_path  # Adding pdf_path as the first column
    
        results.append(data)

    all_result = [item for sublist in results for item in sublist]
        
    return all_result

In [7]:
def remove_vectordb (base_persist_directory, number_of_papers):
    """Remove vectordb from the given directory"""

    for i in range(number_of_papers):  # Iterate over all files
        persist_directory = os.path.join(base_persist_directory, str(i + 1))
        
        #remove the persist_directory and all its contents
        if os.path.exists(persist_directory):
            shutil.rmtree(persist_directory)
            print(f"Removed directory: {persist_directory}")

In [8]:
def df_to_csv(df, file_name):
    """Write a DataFrame to a CSV file"""
    df.to_csv(file_name, index=False, escapechar='\\')

In [9]:
def csv_to_df(file_name):
    """Read a CSV file into a DataFrame."""
    return pd.read_csv(file_name)

In [10]:
#remove vectordb from the directory
number_of_papers = 50
base_persist_directory = 'C:/Users/89751/LangChain-Practise/embedding/chroma/'  #directory of the vector database
remove_vectordb (base_persist_directory, number_of_papers)

Removed directory: C:/Users/89751/LangChain-Practise/embedding/chroma/1
Removed directory: C:/Users/89751/LangChain-Practise/embedding/chroma/2
Removed directory: C:/Users/89751/LangChain-Practise/embedding/chroma/3
Removed directory: C:/Users/89751/LangChain-Practise/embedding/chroma/4
Removed directory: C:/Users/89751/LangChain-Practise/embedding/chroma/5
Removed directory: C:/Users/89751/LangChain-Practise/embedding/chroma/6
Removed directory: C:/Users/89751/LangChain-Practise/embedding/chroma/7
Removed directory: C:/Users/89751/LangChain-Practise/embedding/chroma/8
Removed directory: C:/Users/89751/LangChain-Practise/embedding/chroma/9


In [11]:
#load all pdf file paths
folder_directory = "C:/Users/89751/OneDrive/Desktop/Document2/"  #enter the paper folder path
all_file_path = read_pdf_files(folder_directory)
all_file_path

['C:/Users/89751/OneDrive/Desktop/Document2/1.pdf',
 'C:/Users/89751/OneDrive/Desktop/Document2/2.pdf',
 'C:/Users/89751/OneDrive/Desktop/Document2/3.pdf',
 'C:/Users/89751/OneDrive/Desktop/Document2/4.pdf',
 'C:/Users/89751/OneDrive/Desktop/Document2/5.pdf',
 'C:/Users/89751/OneDrive/Desktop/Document2/6.pdf',
 'C:/Users/89751/OneDrive/Desktop/Document2/7.pdf',
 'C:/Users/89751/OneDrive/Desktop/Document2/8.pdf',
 'C:/Users/89751/OneDrive/Desktop/Document2/9.pdf']

In [12]:
#extract LCA information
base_persist_directory = 'C:/Users/89751/LangChain-Practise/embedding/chroma/'  #directory of the vector database
number_of_papers = 9
all_file_path = all_file_path

all_result = process_pdf (all_file_path, base_persist_directory, number_of_papers)

  warn_deprecated(
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


In [15]:
df1 = pd.DataFrame(all_result)
df1

Unnamed: 0,System boundary,Functional unit,Impact assessment method,Impact category,PDF Path
0,Cradle to gate,100 tonne/day of FDCA,ReCiPe 1.1 hierarchist,"[Agricultural land occupation (ALO), Climate c...",C:/Users/89751/OneDrive/Desktop/Document2/1.pdf
1,Cradle-to-gate,1.0 kg of FDCA,ReCiPe 1.13,"[Climate change, Fossil depletion, Metal deple...",C:/Users/89751/OneDrive/Desktop/Document2/2.pdf
2,Cradle-to-gate,1.0 kg of TPA,ReCiPe 1.13,"[Climate change, Fossil depletion, Metal deple...",C:/Users/89751/OneDrive/Desktop/Document2/2.pdf
3,Cradle-to-gate,1.0 kg of FDCA,Recipe 1.13 Hierarchist,"[Climate change (CC), Fossil depletion (FD)]",C:/Users/89751/OneDrive/Desktop/Document2/3.pdf
4,Cradle to gate,1 kg of FDCA/h at the factory gate,ReCiPe 1.12 hierarchist,"[Climate change (CC), Ozone depletion (OD), Te...",C:/Users/89751/OneDrive/Desktop/Document2/4.pdf
5,Cradle-to-gate,1 kg/h of FDCA,ReCiPe 1.1 hierarchist,"[Global warming, Ozone depletion, Ozone format...",C:/Users/89751/OneDrive/Desktop/Document2/5.pdf
6,From raw material (corn) delivery to the plant...,1 kg of citric acid,TRACI 2.1,"[Global warming potential, Acidification poten...",C:/Users/89751/OneDrive/Desktop/Document2/6.pdf
7,Cradle to gate,1000 kg (1 ton) of sorbitol,ReCiPe,"[Climate change (CC), Ozone depletion (OD), Te...",C:/Users/89751/OneDrive/Desktop/Document2/7.pdf
8,Cradle-to-gate,1 metric ton of pX produced,ReCiPe v1.0,"[Ozone depletion, Human toxicity, Ecotoxicity,...",C:/Users/89751/OneDrive/Desktop/Document2/8.pdf
9,Cradle-to-gate,1 metric ton of p-Xylene,ReCiPe midpoint method,"[Climate change, Water depletion, Land occupat...",C:/Users/89751/OneDrive/Desktop/Document2/9.pdf


In [16]:
df2 = pd.read_csv("C:/Users/89751/OneDrive/Desktop/LCA ontology/Paper_output.csv", encoding='ISO-8859-1')
df2

Unnamed: 0,ID,Title,Author,PDF Path
0,1,Technoeconomic and Life-Cycle Assessment for E...,Patel et al.,C:/Users/89751/OneDrive/Desktop/Document2/1.pdf
1,2,Sustainable Production of Bioplastics from Lig...,Kim et al.,C:/Users/89751/OneDrive/Desktop/Document2/2.pdf
2,3,"Integrative technical, economic, and environme...",Kim et al.,C:/Users/89751/OneDrive/Desktop/Document2/3.pdf
3,4,Environmental sustainability assessment of HMF...,Bello et al.,C:/Users/89751/OneDrive/Desktop/Document2/4.pdf
4,5,Towards improving the sustainability of biopla...,Bello et al.,C:/Users/89751/OneDrive/Desktop/Document2/5.pdf
5,6,Techno-economic analysis and environmental imp...,Wang et al.,C:/Users/89751/OneDrive/Desktop/Document2/6.pdf
6,7,Life-cycle sustainability of biomass-derived s...,Moreno et al.,C:/Users/89751/OneDrive/Desktop/Document2/7.pdf
7,8,Life Cycle Assessment of Biobased p-Xylene Pro...,Lin et al.,C:/Users/89751/OneDrive/Desktop/Document2/8.pdf
8,9,Techno-economic and life cycle analysis of dif...,Athaley et al.,C:/Users/89751/OneDrive/Desktop/Document2/9.pdf


In [17]:
df = pd.merge(df2, df1, on='PDF Path', how='inner')
df

Unnamed: 0,ID,Title,Author,PDF Path,System boundary,Functional unit,Impact assessment method,Impact category
0,1,Technoeconomic and Life-Cycle Assessment for E...,Patel et al.,C:/Users/89751/OneDrive/Desktop/Document2/1.pdf,Cradle to gate,100 tonne/day of FDCA,ReCiPe 1.1 hierarchist,"[Agricultural land occupation (ALO), Climate c..."
1,2,Sustainable Production of Bioplastics from Lig...,Kim et al.,C:/Users/89751/OneDrive/Desktop/Document2/2.pdf,Cradle-to-gate,1.0 kg of FDCA,ReCiPe 1.13,"[Climate change, Fossil depletion, Metal deple..."
2,2,Sustainable Production of Bioplastics from Lig...,Kim et al.,C:/Users/89751/OneDrive/Desktop/Document2/2.pdf,Cradle-to-gate,1.0 kg of TPA,ReCiPe 1.13,"[Climate change, Fossil depletion, Metal deple..."
3,3,"Integrative technical, economic, and environme...",Kim et al.,C:/Users/89751/OneDrive/Desktop/Document2/3.pdf,Cradle-to-gate,1.0 kg of FDCA,Recipe 1.13 Hierarchist,"[Climate change (CC), Fossil depletion (FD)]"
4,4,Environmental sustainability assessment of HMF...,Bello et al.,C:/Users/89751/OneDrive/Desktop/Document2/4.pdf,Cradle to gate,1 kg of FDCA/h at the factory gate,ReCiPe 1.12 hierarchist,"[Climate change (CC), Ozone depletion (OD), Te..."
5,5,Towards improving the sustainability of biopla...,Bello et al.,C:/Users/89751/OneDrive/Desktop/Document2/5.pdf,Cradle-to-gate,1 kg/h of FDCA,ReCiPe 1.1 hierarchist,"[Global warming, Ozone depletion, Ozone format..."
6,6,Techno-economic analysis and environmental imp...,Wang et al.,C:/Users/89751/OneDrive/Desktop/Document2/6.pdf,From raw material (corn) delivery to the plant...,1 kg of citric acid,TRACI 2.1,"[Global warming potential, Acidification poten..."
7,7,Life-cycle sustainability of biomass-derived s...,Moreno et al.,C:/Users/89751/OneDrive/Desktop/Document2/7.pdf,Cradle to gate,1000 kg (1 ton) of sorbitol,ReCiPe,"[Climate change (CC), Ozone depletion (OD), Te..."
8,8,Life Cycle Assessment of Biobased p-Xylene Pro...,Lin et al.,C:/Users/89751/OneDrive/Desktop/Document2/8.pdf,Cradle-to-gate,1 metric ton of pX produced,ReCiPe v1.0,"[Ozone depletion, Human toxicity, Ecotoxicity,..."
9,9,Techno-economic and life cycle analysis of dif...,Athaley et al.,C:/Users/89751/OneDrive/Desktop/Document2/9.pdf,Cradle-to-gate,1 metric ton of p-Xylene,ReCiPe midpoint method,"[Climate change, Water depletion, Land occupat..."


In [18]:
file_name = 'C:/Users/89751/OneDrive/Desktop/LCA ontology/Paper_output2.csv'
df_to_csv(df, file_name)