In [2]:
import re
import os
import json
import fitz

def pdf_text(pdf_path, coordinates=(10, 75, 550, 750)):
    """Collect the text found inside one rectangle on every page of a PDF.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF (``fitz.open``).
        coordinates: Rectangle handed to ``page.get_textbox`` for each page.
            Defaults to ``(10, 75, 550, 750)``, the box this notebook has
            always used (presumably chosen to skip page headers/footers --
            TODO confirm).

    Returns:
        The non-empty per-page results joined with ``"\\n"``; an empty string
        when no page yields any text.
    """
    doc = fitz.open(pdf_path)
    try:
        all_text = []
        for page_number in range(doc.page_count):
            text = doc.load_page(page_number).get_textbox(coordinates)
            if text:
                all_text.append(text)
    finally:
        # Release the document even when a page fails to load or extract.
        doc.close()
    return "\n".join(all_text)

def extract_sections(text):
    """Slice the named sections out of the extracted document text.

    The search is restricted to the span that starts right after
    "INFORMATION AS PROVIDED IN THE CONSOLIDATED LIST" and runs through the
    first following "APPENDICES". Inside that span the function looks for:

    * the block beginning at "Scientific substantiation of the claimed effect"
      and stopping before the next "comments on the proposed wording",
      "Conditions and possible restrictions of use" or "CONCLUSIONS";
    * the block from "Conditions and possible restrictions of use" up to
      "CONCLUSIONS" (optional);
    * the block from "CONCLUSIONS" up to "DOCUMENTATION PROVIDED TO EFSA";
    * the block from "REFERENCES" up to "APPENDICES" (optional).

    Returns:
        A 4-tuple ``(scientific_subs, conditions_restrictions, conclusions,
        references)``. When the outer span, the scientific block or the
        conclusions block is missing, all four items are ``None``. A missing
        conditions block is reported as the string ``"None"`` and a missing
        references block as ``""``.
    """
    nothing = (None, None, None, None)

    def grab(pattern, haystack):
        # First capture group of the first DOTALL match, or None if no match.
        hit = re.search(pattern, haystack, re.DOTALL)
        return hit.group(1) if hit else None

    body = grab(r'INFORMATION AS PROVIDED IN THE CONSOLIDATED LIST(.*?APPENDICES)', text)
    if body is None:
        return nothing

    scientific = grab(
        r'(Scientific substantiation of the claimed effect.*?)(comments on the proposed wording|Conditions and possible restrictions of use|CONCLUSIONS)',
        body,
    )
    if scientific is None:
        return nothing

    conclusions = grab(r'(CONCLUSIONS.*?)DOCUMENTATION PROVIDED TO EFSA', body)
    if conclusions is None:
        return nothing

    conditions = grab(r'(Conditions and possible restrictions of use.*?)CONCLUSIONS', body)
    if conditions is None:
        conditions = "None"

    references = grab(r'(REFERENCES.*?)APPENDICES', body)
    if references is None:
        references = ""

    return scientific, conditions, conclusions, references

def save_to_json(data, output_directory, filename):
    """Write ``data`` as indented JSON to ``output_directory/filename``.

    Args:
        data: JSON-serialisable object to dump.
        output_directory: Target directory; created (with parents) if absent.
            An empty string means the current working directory.
        filename: Name of the file to create inside ``output_directory``.
    """
    # os.makedirs('') raises FileNotFoundError, and os.path.dirname() returns
    # '' for a bare filename, so only create the directory when one is named.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
    file_path = os.path.join(output_directory, filename)
    # Explicit utf-8 so the file is read back with the same encoding it was
    # written with, regardless of the platform default.
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

def process_document(pdf_path):
    """Extract the sections of one PDF and store them as JSON beside it.

    The JSON file is written to the PDF's own directory and shares the PDF's
    base name with a ``.json`` extension. When ``extract_sections`` reports no
    scientific-substantiation text (first item ``None``), a notice is printed
    and nothing is written.
    """
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    target_dir = os.path.dirname(pdf_path)
    target_name = stem + '.json'

    raw_text = pdf_text(pdf_path)
    scientific_subs, conditions_restrictions, conclusions, references = extract_sections(raw_text)

    if scientific_subs is None:
        print("Scientific substantiation section not found.")
        return

    save_to_json(
        {
            "scientific_substantiation": scientific_subs,
            "conditions_restrictions": conditions_restrictions,
            "conclusions": conclusions,
            "references": references,
        },
        target_dir,
        target_name,
    )

def preprocess_all_pdfs(input_dir):
    """Run ``process_document`` on every PDF found anywhere under ``input_dir``.

    Files are matched by extension, ignoring case, so ``report.PDF`` is
    processed as well as ``report.pdf``. A progress line is printed before
    each document.
    """
    for root, _dirs, files in os.walk(input_dir):
        for file in files:
            # Compare on the lower-cased name so upper/mixed-case extensions
            # are not silently skipped.
            if not file.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(root, file)
            print(f"Processing {pdf_path}...")
            process_document(pdf_path)

def main():
    """Preprocess every PDF below the configured root directory."""
    preprocess_all_pdfs('RootDirectoryOfPatentData')

# Run the PDF preprocessing pass when this cell is executed.
main()

Processing /Users/AliTarik/Documents/LastAttempt/2010_1814/2010_1814.pdf...
Processing /Users/AliTarik/Documents/LastAttempt/2010_1815/2010_1815.pdf...
Processing /Users/AliTarik/Documents/LastAttempt/2011_2266/2011_2266.pdf...
Processing /Users/AliTarik/Documents/LastAttempt/2010_1757/2010_1757.pdf...
Processing /Users/AliTarik/Documents/LastAttempt/2010_1759/2010_1759.pdf...
Processing /Users/AliTarik/Documents/LastAttempt/2011_2062/2011_2062.pdf...
Processing /Users/AliTarik/Documents/LastAttempt/2011_2203/2011_2203.pdf...
Scientific substantiation section not found.
Processing /Users/AliTarik/Documents/LastAttempt/2010_1732/2010_1732.pdf...
Processing /Users/AliTarik/Documents/LastAttempt/2010_1760/2010_1760.pdf...
Processing /Users/AliTarik/Documents/LastAttempt/2010_1758/2010_1758.pdf...
Processing /Users/AliTarik/Documents/LastAttempt/2011_2052/2011_2052.pdf...
Processing /Users/AliTarik/Documents/LastAttempt/2011_2258/2011_2258.pdf...
Processing /Users/AliTarik/Documents/LastAt

In [3]:
def clean_text(text):
    """Return ``text`` with every newline and every backslash removed.

    Both characters are deleted outright (nothing is inserted in their
    place). The two replacements are chained so that the newline removal is
    kept; previously the second replacement restarted from the original
    string and discarded it.
    """
    return text.replace('\n', '').replace('\\', '')


def extract_sections_with_ids(text):
    """Split ``text`` into (heading, body) pairs keyed on "(ID ...)" headings.

    A heading is the part of a line up to and including a parenthesised
    "(ID ...)" marker. The body is everything after it up to the line break
    preceding the next such heading, or the end of the text.

    Returns:
        The list of ``(heading, body)`` tuples produced by ``findall``. When
        no heading is present, a single-entry list
        ``[["One Claim", text]]`` holding the whole input.
    """
    claim_re = re.compile(r'([^\n]*\(ID [^\)]+\))(.*?)(?=\n[^\n]*\(ID |$)', re.DOTALL)
    found = claim_re.findall(text)
    if found:
        return found
    return [["One Claim" , text]]

def create_dictionary(matches):
    """Build a ``{heading: "heading  body"}`` mapping from (heading, body) pairs.

    Each heading and body is stripped and passed through ``clean_text``. The
    cleaned heading is the key; the value is the cleaned heading, two spaces,
    and the cleaned body. Later pairs with the same cleaned heading overwrite
    earlier ones.

    Args:
        matches: Iterable of two-item sequences ``(heading, body)``.

    Returns:
        The resulting dict (empty when ``matches`` is empty).
    """
    dictionary = {}
    for title_id, content in matches:
        try:
            title = clean_text(title_id.strip())
            body = clean_text(content.strip())
        except AttributeError:
            # Fallback for items without .strip(): use their first element.
            # Presumably meant for heading/body wrapped in a sequence --
            # TODO confirm this input shape actually occurs.
            # Only AttributeError is caught so that unrelated failures (and
            # KeyboardInterrupt) are no longer swallowed, and the original
            # values are indexed rather than half-processed ones.
            title = clean_text(title_id[0])
            body = clean_text(content[0])

        dictionary[title] = title + "  " + body

    return dictionary
def save_output_to_file(directory,key, scientific_text, conditions_text, conclusion_text):
    """Write one claim's three text sections to ``<directory>/claims/<key>.txt``.

    The ``claims`` sub-directory is created when missing. Spaces, slashes and
    colons in ``key`` are replaced by underscores to form the file name. The
    file is written as UTF-8 and holds the three sections under fixed
    headings, each followed by a blank line.
    """
    claims_directory = os.path.join(directory, 'claims')
    os.makedirs(claims_directory, exist_ok=True)

    safe_key = key
    for unsafe in (' ', '/', ':'):
        safe_key = safe_key.replace(unsafe, '_')
    filename = os.path.join(claims_directory, f"{safe_key}.txt")

    with open(filename, 'w', encoding='utf-8') as file:
        parts = [
            f"Scientific Substantiation:\n{scientific_text}\n\n",
            f"Conditions and Restrictions:\n{conditions_text}\n\n",
            f"Conclusions:\n{conclusion_text}\n\n",
        ]
        file.write("".join(parts))

def process_folder(folder_path):
    """Split each section JSON in ``folder_path`` into per-claim text files.

    Every ``*.json`` file whose name does not end in ``data.json`` is loaded.
    Its "scientific_substantiation" and "conclusions" texts are split into
    per-heading entries, and one file per scientific entry is written through
    ``save_output_to_file``.

    Conclusion selection for each claim:
      * if the conclusions produced exactly one entry, that entry's text is
        used for every claim;
      * otherwise the entry with the same heading is used, or the literal
        "No conclusion data available" when there is none.
    """
    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.json') or file_name.endswith('data.json'):
            continue
        print(f"Processing {file_name}...")
        json_file_path = os.path.join(folder_path, file_name)
        data = load_text_from_json(json_file_path)

        output_scientific = create_dictionary(extract_sections_with_ids(data["scientific_substantiation"]))
        output_conclusion = create_dictionary(extract_sections_with_ids(data["conclusions"]))
        # The conditions text is the same for every claim in this file.
        conditions_text = data['conditions_restrictions']

        single_conclusion = len(output_conclusion) == 1
        if single_conclusion:
            # Take the text of the sole entry. Passing the dict itself would
            # write its repr ("{'...': '...'}") into the claim file.
            shared_conclusion = next(iter(output_conclusion.values()))

        for key, scientific_text in output_scientific.items():
            if single_conclusion:
                conclusion_text = shared_conclusion
            else:
                conclusion_text = output_conclusion.get(key, "No conclusion data available")
            save_output_to_file(folder_path, key, scientific_text, conditions_text, conclusion_text)

def load_text_from_json(json_file):
    """Read the UTF-8 JSON file at ``json_file`` and return the parsed object."""
    with open(json_file, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def main():
    """Run ``process_folder`` on each immediate sub-directory of the root.

    Only the first level below the root directory is visited. When the root
    directory does not exist (``os.walk`` then yields nothing), the function
    returns without doing anything instead of raising ``StopIteration``.
    """
    root_directory = 'RootDirectoryOfPatentData'
    # os.walk() yields nothing for a missing/unreadable root; the default
    # keeps next() from raising StopIteration in that case.
    _top, subdirs, _files = next(os.walk(root_directory), (root_directory, [], []))
    for subdir in subdirs:
        process_folder(os.path.join(root_directory, subdir))
# Run the per-claim splitting pass when this cell is executed.
main()

Processing 2010_1814.json...
Processing 2010_1815.json...
Processing 2011_2266.json...
Processing 2010_1757.json...
Processing 2010_1759.json...
Processing 2011_2062.json...
Processing 2010_1732.json...
Processing 2010_1760.json...
Processing 2010_1758.json...
Processing 2011_2052.json...
Processing 2011_2258.json...
Processing 2010_1756.json...
Processing 2011_2205.json...
Processing 2010_1734.json...
Processing 2011_2229.json...
Processing 2010_1727.json...
Processing 2011_2024.json...
Processing 2011_2211.json...
Processing 2011_2079.json...
Processing 2010_1728.json...
Processing 2011_2226.json...
Processing 2011_2040.json...
Processing 2011_2078.json...
Processing 2011_2071.json...
Processing 2011_2076.json...
Processing 2009_1272.json...
Processing 2009_1210.json...
Processing 2009_1217.json...
Processing 2010_1808.json...
Processing 2009_1228.json...
Processing 2009_1221.json...
Processing 2010_1466.json...
Processing 2009_1226.json...
Processing 2010_1806.json...
Processing 200