In [48]:
import pandas as pd
from tabula import read_pdf
import fitz  # PyMuPDF
import re

In [49]:
pdf_path = 'new_data/multiple_test.pdf'  # Input PDF handed to the extraction functions below; adjust this to the correct path

#### Extract the 'Parameter' and 'Resultat' columns from the first table in the PDF

In [50]:
def extract_specific_columns(pdf_path):
    """Return the 'Parameter' and 'Resultat' columns of the first table in a PDF.

    Tables are read from every page with tabula-py in stream mode, but only
    the first table returned is inspected.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A DataFrame holding just the 'Parameter' and 'Resultat' columns, or
        None (after printing a message) when no table is found, when the
        first table lacks either column, or when any exception is raised.
    """
    wanted = ['Parameter', 'Resultat']
    try:
        extracted = read_pdf(pdf_path, pages='all', multiple_tables=True, stream=True)

        # Guard clause: nothing came back from tabula.
        if not extracted:
            print("No tables found in the PDF.")
            return None

        # Only the first table is considered.
        first_table = extracted[0]
        if not all(name in first_table.columns for name in wanted):
            print("The specified columns were not found in the table.")
            return None

        return first_table[wanted]
    except Exception as err:
        # Best-effort: report the problem and signal failure with None.
        print(f"An error occurred: {err}")
        return None

In [52]:
# Run the table extraction; extract_specific_columns returns None on any failure,
# so guard before printing.
df_info_specific_columns = extract_specific_columns(pdf_path)
if df_info_specific_columns is not None:
    # Preview at most the first 12 Parameter/Resultat rows.
    print(df_info_specific_columns.head(12))

Aug 23, 2024 1:13:26 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Aug 23, 2024 1:13:26 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Aug 23, 2024 1:13:26 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Aug 23, 2024 1:13:26 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Aug 23, 2024 1:13:26 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Aug 23, 2024 1:13:26 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Aug 23, 2024 1:13:26 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Aug 23, 2024 1:13:26 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Aug 23, 2024 1:13:26 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile
Aug 23, 2024 1:13:26 PM org.apache.pdfbox.pdmodel.graphics.color.PDICCBased ensureDisplayProfile


                    Parameter Resultat
0                 Tørstof, TS       85
1         Kulbrinter C6H6-C10       <2
2         Kulbrinter >C10-C15       <5
3         Kulbrinter >C15-C20       <5
4         Kulbrinter >C20-C35      <20
5   Totalkulbrinter, sum af 4        #
6               Benzo(a)pyren   0,0072
7        Dibenz(a,h)anthracen   <0,005
8         Sum af PAH (7 stk.)    0,057
9                         Bly       20
10                    Cadmium     0,39
11               Chrom, Total       29


#### Transpose the table so each parameter becomes a column

In [30]:
def transpose_table_clean(df):
    """Transpose a table so the 'Parameter' values become the column headers.

    The 'Parameter' column is used as the index, the frame is transposed, and
    the names of both axes are cleared. The result is also printed (without
    its index column) as a side effect.

    Args:
        df: DataFrame containing a 'Parameter' column, or None when the
            upstream extraction step failed.

    Returns:
        The transposed DataFrame, or None (after printing a message) when
        `df` is None or has no 'Parameter' column.
    """
    # The extraction step signals failure by returning None; without this
    # guard the column check below would raise AttributeError.
    if df is None:
        print("No table was provided.")
        return None

    if 'Parameter' not in df.columns:
        print("The 'Parameter' column was not found.")
        return None

    # 'Parameter' values become the column labels after the transpose.
    df_transposed = df.set_index('Parameter').T

    # Clear the axis names so no 'Parameter' label is left on the columns.
    df_transposed.index.name = None
    df_transposed.columns.name = None

    # The index is kept on the returned frame; it is only hidden when printing.
    print("Cleaned Transposed Table without resetting the index:")
    print(df_transposed.to_string(index=False))

    return df_transposed




In [32]:
# Build the transposed table in this cell so it also runs on a fresh kernel;
# the transpose is skipped when the extraction step returned None.
# Note: transpose_table_clean prints the table itself as well.
transposed_cleaned_table = (
    transpose_table_clean(df_info_specific_columns)
    if df_info_specific_columns is not None
    else None
)
if transposed_cleaned_table is not None:
    print(transposed_cleaned_table.to_string(index=False))


Tørstof, TS Kulbrinter C6H6-C10 Kulbrinter >C10-C15 Kulbrinter >C15-C20 Kulbrinter >C20-C35 Totalkulbrinter, sum af 4 Benzo(a)pyren Dibenz(a,h)anthracen Sum af PAH (7 stk.) Bly Cadmium Chrom, Total Kobber Nikkel Zink
         85                  <2                  <5                  <5                 <20                         #        0,0072               <0,005               0,057  20    0,39           29     15     19   54


In [43]:
def extract_text_info(pdf_path, info_keys=None):
    """Read "<key>: <value>" fields from the text of a PDF into a one-row DataFrame.

    The text of all pages is concatenated and, for each key, the first
    occurrence of "<key>:" is located; the value is whatever follows the
    colon (after optional whitespace) up to the end of that line.

    Args:
        pdf_path: Path of the PDF file to read.
        info_keys: Field labels to look for. Defaults to the five labels
            this notebook was written for.

    Returns:
        A DataFrame with a single row and one column per key that was found.
        Keys that do not occur in the text are left out, so each value always
        sits under its own key.
    """
    if info_keys is None:
        info_keys = ['Labnr.', 'Prøver modtaget den', 'Analyse påbegyndt den', 'Dybde', 'Rekvirent prøve ID']

    # Close the document even if text extraction fails part-way through.
    doc = fitz.open(pdf_path)
    try:
        text = "".join(page.get_text() for page in doc)
    finally:
        doc.close()

    # Build key -> value pairs directly. Filtering the values first and then
    # zipping them against the full key list would shift every value after a
    # missing key onto the wrong label.
    extracted_info = {}
    for key in info_keys:
        # re.escape keeps characters such as the '.' in 'Labnr.' literal.
        match = re.search(rf"{re.escape(key)}:\s*(.*)", text)
        if match:
            extracted_info[key] = match.group(1)

    return pd.DataFrame([extracted_info])


In [44]:
# Pull the labelled header fields out of the PDF text and print them as a one-row table.
df_info = extract_text_info(pdf_path)
# NOTE(review): extract_text_info as written above always returns a DataFrame,
# so this None check looks purely defensive.
if df_info is not None:
    print(df_info.to_string(index=False))


        Labnr. Prøver modtaget den Analyse påbegyndt den Dybde Rekvirent prøve ID
JO23250145-001          20-06-2023            21-06-2023 0-0,5                M18


#### Combine the DataFrames

In [45]:


# NOTE(review): in the cells shown, `df_transposed` is only a local variable inside
# transpose_table_clean; nothing visible assigns it at notebook level (the earlier
# print cell uses `transposed_cleaned_table`). Confirm which name is intended,
# otherwise this cell raises NameError on a fresh kernel.
if df_transposed is not None and df_info is not None:
    # Resetting the index to avoid any potential issues with index misalignment
    df_transposed_reset = df_transposed.reset_index(drop=True)
    df_info_reset = df_info.reset_index(drop=True)

    # Concatenating the two DataFrames horizontally (axis=1): the text-info columns
    # come first, followed by the transposed table's columns.
    combined_df = pd.concat([df_info_reset, df_transposed_reset], axis=1)
else:
    # `combined_df` is only assigned in the branch above; it stays undefined
    # (or keeps a value from an earlier run) when this branch is taken.
    print("One of the DataFrames is not defined. Please check the extraction steps.")


In [46]:
# Output the combined DataFrame
# Print the merged row if the combine cell created it; otherwise report that it is missing.
print(
    combined_df.to_string(index=False)
    if 'combined_df' in locals()
    else "The combined DataFrame is not available."
)


        Labnr. Prøver modtaget den Analyse påbegyndt den Dybde Rekvirent prøve ID Tørstof, TS Kulbrinter C6H6-C10 Kulbrinter >C10-C15 Kulbrinter >C15-C20 Kulbrinter >C20-C35 Totalkulbrinter, sum af 4 Benzo(a)pyren Dibenz(a,h)anthracen Sum af PAH (7 stk.) Bly Cadmium Chrom, Total Kobber Nikkel Zink
JO23250145-001          20-06-2023            21-06-2023 0-0,5                M18          85                  <2                  <5                  <5                 <20                         #        0,0072               <0,005               0,057  20    0,39           29     15     19   54
