In [1]:
pip install openai




In [25]:
print(openai.__version__)

1.30.5


In [2]:
pip install PyPDF2

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import openai
import requests
import PyPDF2
import re
import os
import requests
import pandas as pd
import tiktoken
import time
from openai import OpenAI
from io import StringIO
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
import numpy as np
import ast

In [3]:
client = OpenAI(
     api_key=os.environ.get("OPENAI_API_KEY"),)

In [4]:
def count_tokens(text):
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(text))
    return num_tokens

In [5]:
def count_tokens1(text):
    """Counts the tokens in the given text."""
    return len(text.split())  # A simplistic approach, adjust based on your tokenization method.

In [6]:
def get_txt_from_pdf(pdf_files,filter_ref = True, combine=True):
    """Convert pdf files to dataframe"""
    # Create an empty list to store the data
    data = []
    # Iterate over the PDF
    for pdf in pdf_files:
        # Fetch the PDF content from the pdf
        with open(pdf, 'rb') as pdf_content:
            # Create a PDF reader object
            pdf_reader = PyPDF2.PdfReader(pdf_content)
            # Iterate over all the pages in the PDF
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num] # Extract the text from the current page
                page_text = page.extract_text()
                words = page_text.split() # Split the page text into individual words
                page_text_join = ' '.join(words) # Join the words back together with a single space between each word

                if filter_ref: #filter the reference at the end
                    page_text_join = remove_ref(page_text_join)

                page_len = len(page_text_join)
                div_len = page_len // 4 # Divide the page into 4 parts
                page_parts = [page_text_join[i*div_len:(i+1)*div_len] for i in range(4)]
            
                min_tokens = 40
                for i, page_part in enumerate(page_parts):
                    if count_tokens(page_part) > min_tokens:
                        # Append the data to the list
                        data.append({
                            'file name': pdf,
                            'page number': page_num + 1,
                            'page section': i+1,
                            'content': page_part,
                            'tokens': count_tokens(page_part)
                        })
    # Create a DataFrame from the data
    df = pd.DataFrame(data)
    if combine:
        df = combine_section(df)
    return df

def remove_ref(pdf_text):
    """This function removes reference section from a given PDF text. It uses regular expressions to find the index of the words to be filtered out."""
    # Regular expression pattern for the words to be filtered out
    pattern = r'(References|REFERENCES|Acknowledgment|ACKNOWLEDGMENT)'
    match = re.search(pattern, pdf_text)

    if match:
        # If a match is found, remove everything after the match
        start_index = match.start()
        clean_text = pdf_text[:start_index].strip()
    else:
        # Define a list of regular expression patterns for references
        reference_patterns = [
            r'\[\w{1,3}\].+?\d{3,5}\.', r'\[\w{1,3}\].+?\d{3,5};', r'\(\w{1,3}\).+?\d{3,5}\.', r'\[\w{1,3}\].+?\d{3,5},',
            r'\(\w{1,3}\).+?\d{3,5},', r'\[\w{1,3}\].+?\d{3,5}', r'\w{1,3}\).+?\d{3,5}\.', r'\w{1,3}\).+?\d{3,5}',
            r'\(\w{1,3}\).+?\d{3,5}', r'^[\w\d,\.– ;)-]+$',
        ]

        # Find and remove matches with the first eight patterns
        for pattern in reference_patterns[:8]:
            matches = re.findall(pattern, pdf_text, flags=re.S)
            pdf_text = re.sub(pattern, '', pdf_text) if len(matches) > 500 and matches.count('.') < 2 and matches.count(',') < 2 and not matches[-1].isdigit() else pdf_text

        # Split the text into lines
        lines = pdf_text.split('\n')

        # Strip each line and remove matches with the last two patterns
        for i, line in enumerate(lines):
            lines[i] = line.strip()
            for pattern in reference_patterns[7:]:
                matches = re.findall(pattern, lines[i])
                lines[i] = re.sub(pattern, '', lines[i]) if len(matches) > 500 and len(re.findall('\d', matches)) < 8 and len(set(matches)) > 10 and matches.count(',') < 2 and len(matches) > 20 else lines[i]

        # Join the lines back together, excluding any empty lines
        clean_text = '\n'.join([line for line in lines if line])

    return clean_text

def combine_section(df):
    """Merge sections, page numbers, add up content, and tokens based on the pdf name."""
    aggregated_df = df.groupby('file name').agg({
        'content': aggregate_content,
        'tokens': aggregate_tokens
    }).reset_index()

    return aggregated_df

def aggregate_content(series):
    """Join all elements in the series with a space separator. """
    return ' '.join(series)

def aggregate_tokens(series):
    """Sum all elements in the series."""
    return series.sum()

  lines[i] = re.sub(pattern, '', lines[i]) if len(matches) > 500 and len(re.findall('\d', matches)) < 8 and len(set(matches)) > 10 and matches.count(',') < 2 and len(matches) > 20 else lines[i]


In [7]:
def read_pdf(file_path):
    """read pdf files from a specific directory"""
    text = ""
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)
        for page_number in range(num_pages):
            page = reader.pages[page_number]
            text += page.extract_text() if page.extract_text() else ""
    return text

In [8]:
def combine_SI(df):
    """Combine the SI document with the main part"""
    df['normalized_name'] = df['file name'].str.replace('_SI', '', regex=False)
    
    grouped_df = df.groupby('normalized_name').agg({
        'content': ' '.join,  # Concatenates content
        'tokens': 'sum'       # Sums up the tokens
    }).reset_index()

    # Rename the columns
    grouped_df.rename(columns={'normalized_name': 'file name'}, inplace=True)
    
    return grouped_df

In [9]:
def extract_title_from_path(file_path):
    """Extract the title from the file name"""
    file_name_with_extension = file_path.split('/')[-1]
    # Remove the file extension
    title = file_name_with_extension.replace('.pdf', '')
    return title

In [10]:
def df_to_csv(df, file_name):
    """Write a DataFrame to a CSV file."""
    df.to_csv(file_name, index=False, escapechar='\\')

In [11]:
def csv_to_df(file_name):
    """Read a CSV file into a DataFrame."""
    return pd.read_csv(file_name)

In [12]:
def split_content(input_string, max_tokens):
    """Splits a string into chunks based on a maximum token count."""
    split_strings = []
    current_string = ""
    tokens_so_far = 0

    for word in input_string.split():
        if tokens_so_far + count_tokens(word) > max_tokens:
            # Add the current chunk to the split_strings and reset
            split_strings.append(current_string.strip())
            current_string = word  # Start new chunk with the current word
            tokens_so_far = count_tokens1(word)  # Reset token count
        else:
            current_string += (" " if current_string else "") + word
            tokens_so_far += count_tokens(word)

    if current_string:  # Add the remaining part if any
        split_strings.append(current_string.strip())

    return split_strings

In [13]:
def chunk_documents(df, max_tokens=10000):
    """Chunk the documents in the DataFrame based on token limits."""
    chunks = []
    
    for index, row in df.iterrows():
        document_title = row['title']  # Assuming 'title' column exists in the DataFrame.
        content = row['content']
        token_count = row.get('tokens', count_tokens(content))  # Default to count if not provided.
        
        if token_count > max_tokens:
            parts = split_content(content, max_tokens)
            for part_number, part in enumerate(parts, start=1):  # Start enumeration at 1.
                chunk_content = part
                token_count = count_tokens(part)
                chunks.append({
                    'title': f"{document_title}_part_{part_number}",
                    'content': chunk_content,
                    'tokens': token_count
                })
        else:
            chunks.append({
                'title': document_title,
                'content': content,
                'tokens': token_count
            })

    return pd.DataFrame(chunks)

In [14]:
def model_1(df):
    """Model 1 is to determine whether inventory data exists and their location in each article"""

    response_msgs = []
      
    for _, row in df.iterrows():
        title = row[df.columns[0]]
        context = row['content']
        
        
        system_msg = """
        Determine whether the provided context includes life cycle inventory data (or input-output data) related to LCA investigations. 
        """

        user_msg = """
        Context: 
        The main inputs and outputs of each subunit are listed in Table 4.
        Answer: Yes, in Table 4.

        Context: 
        In Table 3, Table 4 there is a summary of input and output data, respectively, for the whole methanol production process. These data refers to 1 kg of methanol produced (functional unit).
        Answer: Yes, in Table 3 and Table 4.

        Context: 
        For the researched route, the basic input and output energy consumption and specific pollution discharge data in the specific process are collected, such as raw material consumption, life cycle energy consumption, direct pollution discharge, etc. Taking the production of 1 ton of EG as the benchmark, Tables 2–4 list the energy consumption and pollutant emissions data at each stage of the life cycle of CtEG, OtEG, and BtEG routes.
        Answer: Yes, in Tables 2-4.

        Context: 
        Table S9 The inventory data for PV/CCU-CH3OH technical route.
        Answer: Yes, in Table S9.

        Context: 
        Information for biotechnologies for biomass to formic acid and methanol production are contained in Tables B1 – B3. 
        Answer: Yes, in Tables B1-B3.

        Context: 
        The inventories include wire for baling, diesel, and electricity at varying rates depending upon the waste stream entering the MRF (e.g. single stream, pre-sorted, mixed waste, dual stream) and the type of polymer being sorted (Table S2)
        Answer: Yes, in Table S2.
        
        Context:
        Table S1 details the input-output data of the four ethylene glycol production routes.
        Answer: Yes, in Table S1.
        
        Context: 
        The plant-level mass and energy balances for manufacturing ethylene via the three pathways on the ethylene production scale of 1000 kt/yr are summarized in Table 1.
        Answer: Yes, in Table 1.
        """

        attempts = 3
        while attempts > 0:
            question = """Determine whether the provided context includes life cycle inventory data (or input-output data) of LCA, 
        and answer with either "Yes" or "No" only.  If 'Yes,' specify the table number(s) where this data can be found (e.g., in Table 2). 
        If the data cannot be located, respond with 'cannot be located'. 
        """
            try:
                response = openai.chat.completions.create(
                    messages=[
                        {
                            "role": "system", 
                            "content": system_msg
                        },
                        {
                            "role": "user",
                            "content": user_msg + context + question
                        }
                    ],
                    model='gpt-3.5-turbo')
                
                answers = response.choices[0].message.content
                break

            except Exception as e:
                attempts -= 1
                if attempts > 0:
                    print(f"Error: {str(e)}. Retrying in 60 seconds. {attempts} attempts remaining. (model 1)")
                    time.sleep(60)
                else:
                    print(f"Error: Failed to process paper {title}. Skipping. (model 1)")
                    answers = "No"
                    break

        response_msgs.append(answers)
    df['results'] = response_msgs
    return df

In [15]:
def model_2(df):
    """Model 2 is to summarize the biorefinery configurations"""

    response_msgs = []
      
    for _, row in df.iterrows():
        title = row[df.columns[0]]
        context = row['content']
    
        attempts = 3
        while attempts > 0:
            question = """Answer the questions as truthfully as possible using the provided context.
            please summarize the main biorefinery configurations, such as temperatures, pressures, yield, and relative equipments, for each step"""
            try:
                response = openai.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": context + question
                        }
                    ],
                    model='gpt-3.5-turbo')
                
                answers = response.choices[0].message.content
                break

            except Exception as e:
                attempts -= 1
                if attempts > 0:
                    print(f"Error: {str(e)}. Retrying in 60 seconds. {attempts} attempts remaining. (model 1)")
                    time.sleep(60)
                else:
                    print(f"Error: Failed to process paper {title}. Skipping. (model 1)")
                    answers = "No"
                    break

        response_msgs.append(answers)
    df['results1'] = response_msgs
    return df

In [75]:
#read all pdf files from a specific directory
directory = 'C:/Users/89751/OneDrive/desktop/GNN/Text_mining/Document/'
files = os.listdir(directory)
pdf_files = [os.path.join(directory, file) for file in files]  #
pdf_files

['C:/Users/89751/OneDrive/desktop/GNN/Text_mining/Document/Techno-economic and environmental assessments for sustainable bio-methanol production as landfill gas valorization.pdf']

In [76]:
#extract the content for each article
df = get_txt_from_pdf(pdf_files)
df
#print(df['content'][0])

Unnamed: 0,file name,content,tokens
0,C:/Users/89751/OneDrive/desktop/GNN/Text_minin...,Waste Management 150 (2022) 90–97 Available on...,10171


In [77]:
#combine the main part with the supplementary document for each article
df = combine_SI(df)
df

Unnamed: 0,file name,content,tokens
0,C:/Users/89751/OneDrive/desktop/GNN/Text_minin...,Waste Management 150 (2022) 90–97 Available on...,10171


In [78]:
#replace file names with the article titles
df['file name'] = df['file name'].apply(extract_title_from_path)
df = df.rename(columns={'file name': 'title'})
df

Unnamed: 0,title,content,tokens
0,Techno-economic and environmental assessments ...,Waste Management 150 (2022) 90–97 Available on...,10171


In [79]:
#chunk the document if tokens > 10000
df1 = chunk_documents(df)
df1

Unnamed: 0,title,content,tokens
0,Techno-economic and environmental assessments ...,Waste Management 150 (2022) 90–97 Available on...,9699
1,Techno-economic and environmental assessments ...,"Farkhondehfal, M.A., Sastre, F., Makkee, M., S...",460


In [80]:
#get results from the LLM about whether inventory data exists and locations
result_df = model_1(df1)
result_df

Unnamed: 0,title,content,tokens,results
0,Techno-economic and environmental assessments ...,Waste Management 150 (2022) 90–97 Available on...,9699,"Yes, in Tables 2-4."
1,Techno-economic and environmental assessments ...,"Farkhondehfal, M.A., Sastre, F., Makkee, M., S...",460,"Yes, in Tables S2, S9, B1-B3, 1, 3, and 4."


In [74]:
print(df1['results'][0])

Yes, in the provided context, life cycle inventory data related to LCA investigations can be found.


In [52]:
result_df = model_2(df1)
result_df

Unnamed: 0,title,content,tokens,results,results1
0,LCA comparison analysis for two types of H2 ca...,RESEARCH ARTICLE LCA comparison analysis for t...,9378,"Yes, in Table 1, Table 2.",The main biorefinery configurations include di...
1,LCA comparison analysis for two types of H2 ca...,"Zhejiang University, Wiley Online Library on [...",6429,"Yes, in Tables S1, S7, S8, S9, and S10.",The main biorefinery configurations for the pr...


In [55]:
print(df1['results1'][0])

The main biorefinery configurations include different temperatures, pressures, yields, and relative equipment for each step of the process. The steps involved in biorefinery processes typically include feedstock preparation, pretreatment, enzymatic hydrolysis, fermentation, and product recovery.

1. Feedstock Preparation: The feedstock, such as biomass, is prepared for further processing by grinding, milling, or chopping to reduce particle size and increase surface area for better extraction and conversion efficiency.

2. Pretreatment: In the pretreatment step, the feedstock is treated with heat, chemicals, or enzymes to break down complex structures like lignin and hemicellulose, making the cellulose more accessible for enzymatic hydrolysis. Pretreatment conditions may include temperatures ranging from 120-200°C and pressures of 5-30 bar.

3. Enzymatic Hydrolysis: Enzymatic hydrolysis involves the use of enzymes to break down cellulose into fermentable sugars. The process operates at 

In [34]:
#export title and results as csv. file
df_selected = result_df[['title', 'results', 'results1']]
file_name = 'C:/Users/89751/OneDrive/desktop/test.csv'
df_to_csv(df_selected, file_name)