In [None]:
import os
import pandas as pd
import json
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# Set the OpenAI API key.
# Prefer the OPENAI_API_KEY environment variable so the secret does not have to be
# saved inside the notebook; if it is unset or empty, fall back to the placeholder
# below (replace the placeholder with your key only for local, unshared use).
openai_api_key = os.environ.get("OPENAI_API_KEY") or "Add Your OpenAI API KEY Here."

In [None]:
def _parse_llm_json(content):
    """Turn the raw text returned by the LLM into a list of extraction records.

    Args:
        content: Raw model reply, optionally wrapped in a Markdown code fence.

    Returns:
        The decoded JSON value. A single JSON object is wrapped in a list so
        callers can always iterate over records.

    Raises:
        ValueError: If the reply is not valid JSON (for example "I don't know").
    """
    content = content.strip()

    # Remove triple backticks and optional 'json' specifier
    if content.startswith("```"):
        content = content.replace("```json", "").replace("```", "").strip()

    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to parse JSON: {e}\nRaw content: {content}") from e

    # The prompt asks for a JSON array, but a bare object is still a usable record.
    if isinstance(data, dict):
        data = [data]

    return data


def LCA_information_extraction(document_id, persist_directory):
    """Extract the system boundary, functional unit, target product, LCIA methods, impact category, LCIA results, and geography from the given paper.

    Args:
        document_id: Value matched against the ``document_id`` metadata field of the
            stored chunks; only chunks with this value are retrieved.
        persist_directory: Directory of the persisted Chroma vector store.

    Returns:
        The parsed JSON reply of the model as a list of records.

    Raises:
        ValueError: If the vector store is empty or the model reply is not valid JSON.
    """

    # Open the persisted Chroma vector store with the same embedding function
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

    docstorage = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    # k is the total number of entries in the collection, so the top-k cutoff can
    # never drop a chunk that passes the document_id filter.
    # NOTE: `_collection` is a private attribute of the Chroma wrapper.
    k = docstorage._collection.count()
    if k == 0:
        # A retriever with k=0 cannot return anything; fail with a clear message.
        raise ValueError(f"No chunks found in vector store: {persist_directory}")
    retriever = docstorage.as_retriever(search_kwargs={"filter": {"document_id": document_id}, "k": k})

    # Initialise the LLM
    llm = ChatOpenAI(
        model_name="gpt-4o",
        temperature=0,
        max_tokens=4000,
        openai_api_key=openai_api_key
    )

    system_prompt = (
        """
        You are a helpful assistant specializing in extracting life cycle assessment (LCA) data. 
        Based on the provided context, use the definitions below to answer the question at the end. 
        If you are unsure of the answer, state "I don't know" rather than conjecture.
        """
    )

    # Definitions, extraction rules and few-shot examples. Literal braces are doubled
    # because the text is parsed as a prompt template.
    task_instructions = """
        **System boundary**: is defined as the scope of the LCA, which specifies the life cycle stages and processes that are considered in LCA studies. Common system boundaries are cradle-to-gate, gate-to-gate, and cradle-to-grave:
            - Cradle-to-gate: from the raw material extraction (cradle) up to the factory gate, includes raw material extraction, transport, and manufacturing.
            - Cradle-to-grave: covers the entire product lifecycle from raw material extraction (cradle) to the final disposal (grave). 
            - Gate-to-gate: only includes the production stage.
        **Functional unit (FU)**: is a quantified description of the performance of a product. The purpose of the functional unit is to provide a reference to which the inputs and outputs can be related.
        **Reference product**: is the main output of a product system that delivers the function described by the functional unit. 
        **Life Cycle Impact Assessment (LCIA) method**: is defined as the method to quantify the environmental impacts of a product system. It analyzes the data from the inventory phase and translates it into meaningful impact categories. The common LCIA method includes IPCC, ReCiPe, TRACI, CML, EF and Impact 2002+.
        **Impact category**: is a set of categories that represent a different type of potential environmental impact of a product system. These impact categories are selected based on the chosen impact assessment method. The common impact categories include global warming potentials, ozone depletion, eutrophication, acidification, human toxicity, and photochemical ozone formation. 
        **Life cycle assessment (LCIA) results**: a quantitative environment impacts of a product. They can be presented as a specific value or a range, and are typically with units according to selected impact categories, such as global warming potential (kg CO₂-eq), acidification potential (kg SO₂-eq), and eutrophication potential (kg PO₄³⁻-eq).
        **Geography**: refers to the specific physical or regional context (e.g., country, continent, or specific site) in which the LCA study is conducted or modeled. 
        
        ### Instructions for Data Extraction: 
        - Identify and list all information with minimum detail to understand their relationships. 
        - Is really IMPORTANT to extract all data from the given article and ensure completeness and precision in your extraction. 
        - Extract all impact categories from the article, such as global warming potentials, ozone depletion, eutrophication, acidification, human toxicity, and photochemical ozone formation.
        - Extract only LCIA results related to the impact category of global warming potential (GWP), climate change (CC), or GHG emissions, which use CO₂-eq as units. Represent LCIA results in the format: ["LCIA results": "Reference products/Scenarios/Cases: Impact values + Units"].
        - Avoid extra explanations and format your response as a structured JSON format as follows:
          [{{"System boundary": "", "Functional unit": "", "Reference product": "", "Impact assessment method": "", "Impact category": [], "LCIA results": [], "Geography": ""}}]
        - If you cannot find information about the system boundary, functional unit, reference product, impact assessment method, LCIA results, or geography, label them as "Not mentioned."
    
        ###Examples:
        1. Context: <The "cradle to grave" is defined as the system boundary of all the six comparison scenarios, which covers the material and energy production chain and all processes from the raw material extraction through the production, transportation, and use phase up to the product's end of life treatment. In this study, the functional unit is defined as 1 kg and 1 MJ of H2 carrier produced from coal, natural gas, and renewables. The socalled CML 2001 method is applied to LCIA calculation. 
        LCA results and impact analysis\n\n\nGHG emissions\nProduction phase (This study) Reference\n\nCoal-CH 3 OH (kg 3.09 2.6 to 3.8 [39]\nCO 2 -eq/kg)\n\nNG-CH 3 OH 0.84 0.873 to 0.881 [40]\n(kg CO 2 -eq/kg)\n\n\nPV/CCU-CH 3 OH 1.04 0.99 [41]\n(kg CO 2 -eq/kg)\n\nCoal-NH 3 3.93 3.85 [22]\n(kg CO 2 -eq/kg)\n\nNG-NH 3 2.70 2.74 [13]\n(kg CO 2 -eq/kg)\n\nPV-NH 3 0.78 0.93 [42]\n(kg CO 2 -eq/kg)
        This work is based on the nation conditions of China.>
        Answer: 
            [{{"System boundary": "Cradle-to-grave",
              "Functional unit": "1 kg and 1 MJ of H2 carrier produced from coal, natural gas, and renewables",
              "Reference product": "Methanol and Ammonia",
              "Impact assessment method": "CML 2001",
              "Impact category": [
                "Global warming potential (GWP)",
                "Acidification potential",
                "Ozone depletion potential",
                "Photochemical oxidant creation potential",
                "Eutrophication potential",
                "Abiotic depletion potential"],
              "LCIA results": [
                "Coal-CH3OH: 3.09 kg CO2-eq/kg",
                "NG-CH3OH: 0.84 kg CO2-eq/kg",
                "PV/CCU-CH3OH: 1.04 kg CO2-eq/kg",
                "Coal-NH3: 3.93 kg CO2-eq/kg",
                "NG-NH3: 2.70 kg CO2-eq/kg",
                "PV/CCU-NH3: 0.78 kg CO2-eq/kg"],
                "Geography": "China"}}]
            
        2. Context: <As indicated in Fig. 1, the scope of the study is from ‘cradle to gate’, with two main stages considered: biomass supply (cultivation, collection and transportation to the processing plant); and production of bio-ethylene and its co-products. The functional unit is defined as the production of 1 tonne of ethylene. The SimaPro v.8.3. software (Pré Consultants B.V., 2017) has been used for the life cycle modelling and the impacts have been calculated following the CML 2 method (Guinée et al., 2001), using the April 2016 update.
        Case 3 is the best option\nwith the negative net values for these three categories: -62.4 GJ/t\n(ADP fossil ), -0.07 t CO 2 eq./kg (GWP), and -59 mg CFC-11 eq./t\n(ODP).
        The production plant is assumed to be based in the Duero Valley (Castilla y Leon, Spain) as there is extensive cultivation of poplar there due to favourable climatic conditions.>
        Answer:
        [{{"System boundary": "Cradle-to-gate",
            "Functional unit": "1 tonne of ethylene",
            "Reference product": "Ethylene",
            "Impact assessment method": "CML 2",
            "Impact category": [
              "Global warming potential (GWP)"],
            "LCIA results": [
              "Case 3: -0.07 t CO2-eq./kg (GWP)"],
            "Geography": "Duero Valley (Castilla y Leon, Spain)"}}]
        3. Context: <The LCA model has a cradle-to-gate scope, and the system boundary includes sugarcane farming, bagasse transportation, size reduction, pretreatment, enzymatic hydrolysis, fermentation, and downstream separation, as shown in Fig. 2. 1 kg of lactic acid is the functional unit. OpenLCA 1.9 is used to create a product system. ReCiPe 2016 methodology with the hierarchist perspective, commonly used in LCA literature (Hiloidhari et al., 2020), is implemented.
        The total life cycle climate change impact for production of 1 kg of lactic acid was 4.62 kg CO 2 eq.
        The goal of the LCA is to quantify the environmental impacts of bagasse based LA production facility annexed with an Indian sugar mill.>
        Answer:
        [{{"System boundary": "Cradle-to-gate",
            "Functional unit": "1 kg of lactic acid",
            "Reference product": "Lactic acid",
            "Impact assessment method": "ReCiPe 2016",
            "Impact category": [
              "Global warming potential (GWP)"],
            "LCIA results": [
              "Lactic acid: 4.62 kg CO2-eq./kg"],
            "Geography": "India"}}]
    """

    # Create a prompt template.
    # {context} receives the retrieved chunks and {question} the query. They must be
    # separate slots: the stuff chain overwrites whichever variable is named in
    # `document_variable_name`, so reusing "question" would discard the real question.
    prompt_template = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", task_instructions),
            ("user", "Context: <{context}>\n\nResponse to the question: {question}")
        ]
    )

    chain_type_kwargs = {
        "prompt": prompt_template,
        "document_variable_name": "context"
    }

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type_kwargs=chain_type_kwargs
    )

    question = "Extract the system boundary, functional unit, reference product, impact assessment method, impact category, LCIA results, or geography for me."
    # `invoke` replaces the deprecated direct call `qa_chain({...})`
    result = qa_chain.invoke({"query": question})

    # Parse the model reply to a Python object
    return _parse_llm_json(result.get("result", ""))

def process_document(df_vectordb):
    """Run the LCA data extraction for every paper listed in ``df_vectordb``.

    Args:
        df_vectordb: DataFrame with one row per paper. The columns
            ``Document_ID``, ``Vectordb_path``, ``PDF_path``, ``Author`` and
            ``Paper_title`` are read from each row.

    Returns:
        A flat list of extraction records (dicts) from all papers that were
        processed successfully. Each record also carries the ``Author``,
        ``Paper_title`` and ``PDF_path`` of its paper.

    Any exception raised while handling a paper is printed and that paper is
    skipped, so one failing document does not stop the whole run.
    """

    all_result = []

    for _, row in df_vectordb.iterrows():  # Iterate over all_files
        document_id = row["Document_ID"]
        persist_directory = row["Vectordb_path"]
        PDF_path = row["PDF_path"]

        try:
            # Extract information
            LCA_data = LCA_information_extraction(document_id, persist_directory)

            # A single JSON object is one record; wrap it so the loop below
            # iterates over records rather than over the dict's keys.
            if isinstance(LCA_data, dict):
                LCA_data = [LCA_data]

            # Add metadata columns
            for entry in LCA_data:
                entry["Author"] = row["Author"]  # Add Author from df_vectordb
                entry["Paper_title"] = row["Paper_title"]  # Add Paper title
                entry["PDF_path"] = PDF_path  # Add PDF path

            # Extend only after every record was tagged, so a paper that fails
            # half-way contributes nothing.
            all_result.extend(LCA_data)
            print(f"Successfully processed document: {PDF_path}")

        except Exception as e:
            print(f"Error processing processed document: {PDF_path}: {e}")
            continue  # Skip to the next document in case of an error

    return all_result

def csv_to_df(file_name, **read_csv_kwargs):
    """Read a CSV file into a DataFrame.

    Args:
        file_name: Path of the CSV file to read.
        **read_csv_kwargs: Optional keyword arguments forwarded unchanged to
            ``pandas.read_csv`` (for example ``encoding="ISO-8859-1"``).

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    return pd.read_csv(file_name, **read_csv_kwargs)

def df_to_csv(df, file_name):
    """Save ``df`` as a CSV file at ``file_name``.

    The row index is not written, and a backslash is used as the escape
    character.
    """
    writer_options = {"index": False, "escapechar": "\\"}
    df.to_csv(file_name, **writer_options)

In [None]:
# Input path to the vector DB index CSV generated in Step 1.
# The string below is a placeholder: replace it with the real path before running.
# process_document() reads the columns Document_ID, Vectordb_path, PDF_path,
# Author and Paper_title from this table.
file_path = "Add your output CSV path here (from Step 1) "
# The file is decoded as ISO-8859-1 (Latin-1).
df_vectordb = pd.read_csv(file_path, encoding='ISO-8859-1')
# Display the loaded index table as the cell output.
df_vectordb

In [None]:
# Process each paper for data extraction.
# Papers that raise an error are reported via print() and skipped, so
# `all_results` holds the records of the successfully processed papers only.
all_results = process_document(df_vectordb)

In [None]:
# Convert extracted results (a list of dicts, one per record) into a DataFrame
df_LCA = pd.DataFrame(all_results)
# Display the extracted table as the cell output.
df_LCA

In [None]:
# Output file path for saving the extracted results.
# The string below is a placeholder: replace it with the real destination path.
file_name = "Add your output CSV path here" 
# Write the table without the index column (see df_to_csv).
df_to_csv(df_LCA, file_name)