In [7]:
import os
import re
import json
import openai


In [9]:
def extract_from_raw_text(raw_text):
    """Pull the claim fields out of an ``extracted_object`` text section.

    Each field is located with a regular expression anchored on the
    ``has_*:`` labels that surround it:

    * ``Food``              - text between ``has_food_constituent:`` and ``has_dose:``
    * ``Phenotype``         - everything after ``has_phenotype:`` up to the end
                              of ``raw_text`` (the pattern is greedy and is
                              matched with ``re.DOTALL``)
    * ``Target_population`` - text between ``has_target_population:`` and
                              ``has_evidence:``
    * ``Citations``         - text between ``has_citations:`` and ``has_phenotype:``

    Every captured value is cleaned: the characters ``" ' \\ / ? - _ + =`` are
    deleted, runs of whitespace (including line breaks) collapse to a single
    space, and the result is stripped.

    Args:
        raw_text: The text to search.

    Returns:
        dict: Keys ``Food``, ``Phenotype``, ``Target_population``,
        ``Citations`` (each the cleaned string, or ``""`` when the pattern
        does not match) plus ``DOI``, which is always the placeholder ``"-"``.
    """
    field_patterns = {
        "Food": r"has_food_constituent:\s*(.+?)\s*has_dose:",
        # NOTE(review): greedy + DOTALL means this runs to the end of raw_text,
        # so anything following the phenotype value is captured too - confirm
        # that has_phenotype is always the last thing in the input.
        "Phenotype": r"has_phenotype:\s*(.+)",
        "Target_population": r"has_target_population:\s*(.+?)\s*has_evidence:",
        "Citations": r"has_citations:\s*(.+?)\s*has_phenotype:",
    }

    # Characters to delete outright. A translation table is used instead of a
    # regex character class: the previous class contained `\\-_`, which the
    # regex engine read as the *range* backslash..underscore, so it removed
    # `]` and `^` by accident and never removed `-`.
    noise_table = str.maketrans('', '', "\"'\\/?-_+=")

    extracted_data = {}
    for key, field_pattern in field_patterns.items():
        match = re.search(field_pattern, raw_text, re.DOTALL)
        if match:
            without_noise = match.group(1).translate(noise_table)
            # Newlines are handled here (turned into a space) rather than
            # deleted, so words on adjacent lines are not glued together.
            extracted_data[key] = re.sub(r'\s+', ' ', without_noise).strip()
        else:
            extracted_data[key] = ""

    # Add DOI field with placeholder value
    extracted_data["DOI"] = "-"

    return extracted_data

def process_json_file(file_path):
    """Read one claim file and extract its fields.

    The file is read as plain UTF-8 text (it is not parsed as JSON). Everything
    that follows the first ``extracted_object:`` line is handed to
    ``extract_from_raw_text``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The dict produced by ``extract_from_raw_text``, or ``None`` (after
        printing a message) when the file has no ``extracted_object:`` line.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()

    # Capture the remainder of the file after the section header line.
    section = re.search(r"extracted_object:\n([\s\S]+)", contents)
    if section is None:
        print(f"No extracted_object section found in file: {file_path}")
        return None

    return extract_from_raw_text(section.group(1))

def process_claims_folder(claims_folder_path):
    """Combine the extracted fields of every claim file in a folder.

    Each ``*.json`` file in ``claims_folder_path`` is passed to
    ``process_json_file``; every truthy result is collected, and the list is
    written to ``combined_data.json`` inside the same folder.

    Args:
        claims_folder_path: Folder containing the claim files. The combined
            output is written into this folder as well.

    Returns:
        None. The result is written to disk and a confirmation is printed.
    """
    output_name = 'combined_data.json'
    combined_data = []

    # sorted(): os.listdir yields names in arbitrary order, which would make
    # the order of records in the output differ between runs/machines.
    for file_name in sorted(os.listdir(claims_folder_path)):
        if not file_name.endswith('.json'):
            continue
        # The output lives in the folder being scanned; without this check a
        # re-run would feed the previous combined file back in as an input.
        if file_name == output_name:
            continue
        file_path = os.path.join(claims_folder_path, file_name)
        extracted_data = process_json_file(file_path)
        if extracted_data:
            combined_data.append(extracted_data)

    # Save the combined data to a new JSON file in the Claims folder
    combined_file_path = os.path.join(claims_folder_path, output_name)
    with open(combined_file_path, 'w', encoding='utf-8') as combined_file:
        json.dump(combined_data, combined_file, indent=4)
    print(f"Combined data saved to {combined_file_path}")

def process_directory(directory):
    """Run ``process_claims_folder`` on each ``<directory>/<sub>/Claims`` path.

    Only immediate sub-directories of ``directory`` are considered, and a
    sub-directory is skipped when it has no ``Claims`` entry.

    Args:
        directory: Root folder whose sub-directories are scanned.
    """
    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)
        if not os.path.isdir(candidate):
            continue  # plain files at the root are ignored

        claims_dir = os.path.join(candidate, 'Claims')
        if not os.path.exists(claims_dir):
            continue  # nothing to combine for this sub-directory

        process_claims_folder(claims_dir)

def main(root_directory='RootDirectoryOfPatentData'):
    """Entry point: process every ``Claims`` folder under ``root_directory``.

    Args:
        root_directory: Folder passed to ``process_directory``. The default is
            a placeholder; replace it with (or pass in) the path to your
            directory.
    """
    process_directory(root_directory)

# Run the whole pipeline when this cell is executed.
main()

Combined data saved to /Users/AliTarik/Documents/LastAttempt/2010_1814/Claims/combined_data.json
Combined data saved to /Users/AliTarik/Documents/LastAttempt/2010_1815/Claims/combined_data.json
Combined data saved to /Users/AliTarik/Documents/LastAttempt/2011_2266/Claims/combined_data.json
Combined data saved to /Users/AliTarik/Documents/LastAttempt/2010_1757/Claims/combined_data.json
Combined data saved to /Users/AliTarik/Documents/LastAttempt/2010_1759/Claims/combined_data.json
Combined data saved to /Users/AliTarik/Documents/LastAttempt/2011_2062/Claims/combined_data.json
Combined data saved to /Users/AliTarik/Documents/LastAttempt/2010_1732/Claims/combined_data.json
Combined data saved to /Users/AliTarik/Documents/LastAttempt/2010_1760/Claims/combined_data.json
Combined data saved to /Users/AliTarik/Documents/LastAttempt/2010_1758/Claims/combined_data.json
Combined data saved to /Users/AliTarik/Documents/LastAttempt/2011_2052/Claims/combined_data.json
Combined data saved to /Users/