In [24]:
import re
import json
import pandas as pd

def read_data_from_json(filename):
    """Load a JSON file and return its parsed contents.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (list, dict, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON interchange files are UTF-8; decode explicitly instead of relying on
    # the platform's locale encoding, which is not UTF-8 everywhere.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)
    
def convert_to_bio(data):
    """Convert one annotated record into a list of (token, BIO label) pairs.

    Args:
        data: Dict with the raw text under ``"value"`` and a list of span
            annotations under ``"metrics"``. Each annotation supplies the
            character offsets ``'start'`` / ``'end'`` into the text and a
            ``'labels'`` list whose first item is used as the label type.

    Returns:
        A list of ``(token, label)`` tuples, one per token of the text, where
        label is ``'O'``, ``'B-<type>'`` or ``'I-<type>'``.

    Annotations are aligned to tokens by character span: every token that
    overlaps ``[start, end)`` is labelled, the first with ``B-`` and the rest
    with ``I-``. This stays correct when an offset falls inside a token (for
    example ``kg/m`` inside the single token ``(kg/m)``), where counting the
    tokens of a text prefix would label the wrong tokens.
    """
    text = data.get("value", "")
    labels = data.get("metrics", [])

    # Numbers with thousands separators / decimals, "(unit/unit)" groups, and words.
    # The pattern can never produce a bare '.' or '/' token, so no post-filter is needed.
    token_pattern = r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?|\b\d+\.\d+|\b\d+|(?:\(\w+/\w+\))|(?:\w+(?:\w+)?)'

    # Keep the match objects so every token carries its character span.
    matches = list(re.finditer(token_pattern, text))
    tokens = [match.group() for match in matches]
    bio_labels = ['O'] * len(tokens)

    for entry in labels:
        start = entry['start']
        end = entry['end']

        # An annotation without any label cannot be turned into a BIO tag.
        entry_labels = entry.get('labels') or []
        if not entry_labels:
            print(f"Skipping entry without labels: {entry}")
            continue
        label_type = entry_labels[0]

        # Validate start and end indices
        if start < 0 or end > len(text):
            print(f"Skipping entry with invalid indices: {entry}")
            continue

        # Indices of all tokens whose span intersects the annotated span.
        covered = [
            idx for idx, match in enumerate(matches)
            if match.start() < end and match.end() > start
        ]
        if not covered:
            # Span covers only whitespace/punctuation, or is empty/inverted.
            print(f"Skipping entry with no overlapping tokens: {entry}")
            continue

        first, *rest = covered
        bio_labels[first] = f'B-{label_type}'
        for idx in rest:
            bio_labels[idx] = f'I-{label_type}'

    return list(zip(tokens, bio_labels))

# Process each JSON object and convert to BIO format
def process_json_file(filename):
    """Convert every usable record of a JSON file into BIO pairs.

    The file is loaded with ``read_data_from_json`` and iterated. Dict records
    that have both a 'value' and a 'metrics' key are passed to
    ``convert_to_bio``; the resulting (token, label) pairs of all records are
    concatenated into one flat list, which is returned. Progress and skip
    messages are printed along the way.
    """
    collected = []

    for position, record in enumerate(read_data_from_json(filename)):
        print(f"Processing entry {position}: {record}")  # full record dump, for debugging

        # Non-dict items are passed over without any further message.
        if not isinstance(record, dict):
            continue

        print(f"Keys found: {list(record.keys())}")  # which fields this record carries

        has_required_keys = 'value' in record and 'metrics' in record
        if not has_required_keys:
            print(f"Skipping entry due to missing 'value' or 'metrics': {record}")
            continue

        collected.extend(convert_to_bio(record))

    if len(collected) == 0:
        print("No valid entries found for BIO conversion.")

    return collected

# Read and process the data from the JSON file.
# NOTE(review): this is a hard-coded absolute path on the author's machine, so the
# cell only runs where that exact file exists — consider a configurable data directory.
filename = '/Users/roselynnnn/Downloads/project-5 copy.json'
# bio_format is the flat list of (token, label) pairs returned by process_json_file.
bio_format = process_json_file(filename)

# Save the BIO format to CSV and text files
def save_to_csv(bio_format, filename):
    """Write (token, label) pairs to ``filename`` as CSV.

    The file gets a 'Token' and a 'BIO Label' column and no index column.
    """
    pd.DataFrame(bio_format, columns=['Token', 'BIO Label']).to_csv(filename, index=False)

def save_to_text(bio_format, filename):
    """Write (token, label) pairs to ``filename``, one ``token<TAB>label`` per line.

    Args:
        bio_format: Iterable of (token, label) pairs.
        filename: Path of the text file to create or overwrite.
    """
    # Tokens come from a \w-based regex and may contain non-ASCII characters;
    # write UTF-8 explicitly so the result does not depend on the locale encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{token}\t{label}\n" for token, label in bio_format)

# Save to files. Both names are relative paths, so the outputs are created in the
# kernel's current working directory: a headered CSV and a tab-separated text file.
save_to_csv(bio_format, 'bio_output_6.csv')
save_to_text(bio_format, 'bio_output_6.txt')

# Confirmation message only; it is printed unconditionally after the two calls return.
print("BIO format saved to bio_output_6.csv and bio_output_6.txt.")


Processing entry 0: {'value': 'The SSC is led by the Chief have not sought external assurance on this report but may Executive Officer and Executive Director, and is responsible consider doing so in the future.', 'id': 5847, 'metrics': [{'start': 44, 'end': 62, 'text': 'external assurance', 'labels': ['metric']}, {'start': 33, 'end': 36, 'text': 'not', 'labels': ['value']}], 'annotator': 1, 'annotation_id': 49, 'created_at': '2024-11-06T18:56:45.247110Z', 'updated_at': '2024-11-06T18:56:45.247110Z', 'lead_time': 12.429}
Keys found: ['value', 'id', 'metrics', 'annotator', 'annotation_id', 'created_at', 'updated_at', 'lead_time']
Processing entry 1: {'value': 'weforum.org/agenda/2022/10/cop27-how-healthcare-can-reduce-carbon-footprint/ ANNUAL REPORT 2023 55 SUSTAINABILITY REPORT The breakdown of the energy consumption for OUEH s businesses are as follows: COUNTRY SINGAPORE CHINA OUEH Singapore 12 Clinics Pharmaceutical Business Xi Nan Hospital Corporate Office under O2HG Business Office 

In [25]:
import re
import json
import pandas as pd

def read_data_from_json(filename):
    """Load a JSON file and return its parsed contents.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (list, dict, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON interchange files are UTF-8; decode explicitly instead of relying on
    # the platform's locale encoding, which is not UTF-8 everywhere.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)
    
def convert_to_bio(data):
    """Convert one annotated record into a list of (token, BIO label) pairs.

    Args:
        data: Dict with the raw text under ``"value"`` and a list of span
            annotations under ``"metrics"``. Each annotation supplies the
            character offsets ``'start'`` / ``'end'`` into the text and a
            ``'labels'`` list whose first item is used as the label type.

    Returns:
        A list of ``(token, label)`` tuples, one per token of the text, where
        label is ``'O'``, ``'B-<type>'`` or ``'I-<type>'``.

    Annotations are aligned to tokens by character span: every token that
    overlaps ``[start, end)`` is labelled, the first with ``B-`` and the rest
    with ``I-``. This stays correct when an offset falls inside a token (for
    example ``kg/m`` inside the single token ``(kg/m)``), where counting the
    tokens of a text prefix would label the wrong tokens.
    """
    text = data.get("value", "")
    labels = data.get("metrics", [])

    # Numbers with thousands separators / decimals, "(unit/unit)" groups, and words.
    # The pattern can never produce a bare '.' or '/' token, so no post-filter is needed.
    token_pattern = r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?|\b\d+\.\d+|\b\d+|(?:\(\w+/\w+\))|(?:\w+(?:\w+)?)'

    # Keep the match objects so every token carries its character span.
    matches = list(re.finditer(token_pattern, text))
    tokens = [match.group() for match in matches]
    bio_labels = ['O'] * len(tokens)

    for entry in labels:
        start = entry['start']
        end = entry['end']

        # An annotation without any label cannot be turned into a BIO tag.
        entry_labels = entry.get('labels') or []
        if not entry_labels:
            print(f"Skipping entry without labels: {entry}")
            continue
        label_type = entry_labels[0]

        # Validate start and end indices
        if start < 0 or end > len(text):
            print(f"Skipping entry with invalid indices: {entry}")
            continue

        # Indices of all tokens whose span intersects the annotated span.
        covered = [
            idx for idx, match in enumerate(matches)
            if match.start() < end and match.end() > start
        ]
        if not covered:
            # Span covers only whitespace/punctuation, or is empty/inverted.
            print(f"Skipping entry with no overlapping tokens: {entry}")
            continue

        first, *rest = covered
        bio_labels[first] = f'B-{label_type}'
        for idx in rest:
            bio_labels[idx] = f'I-{label_type}'

    return list(zip(tokens, bio_labels))

# Process each JSON object and convert to BIO format
def process_json_file(filename):
    """Convert every usable record of a JSON file into BIO pairs.

    The file is loaded with ``read_data_from_json`` and iterated. Dict records
    that have both a 'value' and a 'metrics' key are passed to
    ``convert_to_bio``; the resulting (token, label) pairs of all records are
    concatenated into one flat list, which is returned. Progress and skip
    messages are printed along the way.
    """
    collected = []

    for position, record in enumerate(read_data_from_json(filename)):
        print(f"Processing entry {position}: {record}")  # full record dump, for debugging

        # Non-dict items are passed over without any further message.
        if not isinstance(record, dict):
            continue

        print(f"Keys found: {list(record.keys())}")  # which fields this record carries

        has_required_keys = 'value' in record and 'metrics' in record
        if not has_required_keys:
            print(f"Skipping entry due to missing 'value' or 'metrics': {record}")
            continue

        collected.extend(convert_to_bio(record))

    if len(collected) == 0:
        print("No valid entries found for BIO conversion.")

    return collected

# Read and process the data from the JSON file.
# NOTE(review): this is a hard-coded absolute path on the author's machine, so the
# cell only runs where that exact file exists — consider a configurable data directory.
filename = '/Users/roselynnnn/Downloads/project-1.json'
# bio_format is the flat list of (token, label) pairs returned by process_json_file.
bio_format = process_json_file(filename)

# Save the BIO format to CSV and text files
def save_to_csv(bio_format, filename):
    """Write (token, label) pairs to ``filename`` as CSV.

    The file gets a 'Token' and a 'BIO Label' column and no index column.
    """
    pd.DataFrame(bio_format, columns=['Token', 'BIO Label']).to_csv(filename, index=False)

def save_to_text(bio_format, filename):
    """Write (token, label) pairs to ``filename``, one ``token<TAB>label`` per line.

    Args:
        bio_format: Iterable of (token, label) pairs.
        filename: Path of the text file to create or overwrite.
    """
    # Tokens come from a \w-based regex and may contain non-ASCII characters;
    # write UTF-8 explicitly so the result does not depend on the locale encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{token}\t{label}\n" for token, label in bio_format)

# Save to files. Both names are relative paths, so the outputs are created in the
# kernel's current working directory: a headered CSV and a tab-separated text file.
save_to_csv(bio_format, 'bio_output_1.csv')
save_to_text(bio_format, 'bio_output_1.txt')

# Confirmation message only; it is printed unconditionally after the two calls return.
print("BIO format saved to bio_output_1.csv and bio_output_1.txt.")


Processing entry 0: {'value': '1 ', 'id': 1, 'annotator': 1, 'annotation_id': 27, 'created_at': '2024-10-30T12:29:28.924502Z', 'updated_at': '2024-10-30T12:29:28.924502Z', 'lead_time': 2.359}
Keys found: ['value', 'id', 'annotator', 'annotation_id', 'created_at', 'updated_at', 'lead_time']
Skipping entry due to missing 'value' or 'metrics': {'value': '1 ', 'id': 1, 'annotator': 1, 'annotation_id': 27, 'created_at': '2024-10-30T12:29:28.924502Z', 'updated_at': '2024-10-30T12:29:28.924502Z', 'lead_time': 2.359}
Processing entry 1: {'value': 'Net profit (S$ million) ', 'id': 44, 'metrics': [{'start': 0, 'end': 10, 'text': 'Net profit', 'labels': ['metric']}, {'start': 12, 'end': 22, 'text': 'S$ million', 'labels': ['unit']}], 'annotator': 1, 'annotation_id': 1, 'created_at': '2024-10-30T11:36:08.419250Z', 'updated_at': '2024-10-30T11:36:08.419250Z', 'lead_time': 120.997}
Keys found: ['value', 'id', 'metrics', 'annotator', 'annotation_id', 'created_at', 'updated_at', 'lead_time']
Processin

In [None]:
import re
import json
import pandas as pd
import os

def read_data_from_json(filename):
    """Load a JSON file and return its parsed contents.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (list, dict, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON interchange files are UTF-8; decode explicitly instead of relying on
    # the platform's locale encoding, which is not UTF-8 everywhere.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)
    
def convert_to_bio(data):
    """Convert one annotated record into a list of (token, BIO label) pairs.

    Args:
        data: Dict with the raw text under ``"value"`` and a list of span
            annotations under ``"metrics"``. Each annotation supplies the
            character offsets ``'start'`` / ``'end'`` into the text and a
            ``'labels'`` list whose first item is used as the label type.

    Returns:
        A list of ``(token, label)`` tuples, one per token of the text, where
        label is ``'O'``, ``'B-<type>'`` or ``'I-<type>'``.

    Annotations are aligned to tokens by character span: every token that
    overlaps ``[start, end)`` is labelled, the first with ``B-`` and the rest
    with ``I-``. This stays correct when an offset falls inside a token (for
    example ``kg/m`` inside the single token ``(kg/m)``), where counting the
    tokens of a text prefix would label the wrong tokens.
    """
    text = data.get("value", "")
    labels = data.get("metrics", [])

    # Numbers with thousands separators / decimals, "(unit/unit)" groups, and words.
    # The pattern can never produce a bare '.' or '/' token, so no post-filter is needed.
    token_pattern = r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?|\b\d+\.\d+|\b\d+|(?:\(\w+/\w+\))|(?:\w+(?:\w+)?)'

    # Keep the match objects so every token carries its character span.
    matches = list(re.finditer(token_pattern, text))
    tokens = [match.group() for match in matches]
    bio_labels = ['O'] * len(tokens)

    for entry in labels:
        start = entry['start']
        end = entry['end']

        # An annotation without any label cannot be turned into a BIO tag.
        entry_labels = entry.get('labels') or []
        if not entry_labels:
            print(f"Skipping entry without labels: {entry}")
            continue
        label_type = entry_labels[0]

        # Validate start and end indices
        if start < 0 or end > len(text):
            print(f"Skipping entry with invalid indices: {entry}")
            continue

        # Indices of all tokens whose span intersects the annotated span.
        covered = [
            idx for idx, match in enumerate(matches)
            if match.start() < end and match.end() > start
        ]
        if not covered:
            # Span covers only whitespace/punctuation, or is empty/inverted.
            print(f"Skipping entry with no overlapping tokens: {entry}")
            continue

        first, *rest = covered
        bio_labels[first] = f'B-{label_type}'
        for idx in rest:
            bio_labels[idx] = f'I-{label_type}'

    return list(zip(tokens, bio_labels))

# Process each JSON object and convert to BIO format
def process_json_file(filename):
    """Convert every usable record of a JSON file into BIO pairs.

    The file is loaded with ``read_data_from_json`` and iterated. Dict records
    that have both a 'value' and a 'metrics' key are passed to
    ``convert_to_bio``; the resulting (token, label) pairs of all records are
    concatenated into one flat list, which is returned. Progress and skip
    messages are printed along the way.
    """
    collected = []

    for position, record in enumerate(read_data_from_json(filename)):
        print(f"Processing entry {position}: {record}")  # full record dump, for debugging

        # Non-dict items are passed over without any further message.
        if not isinstance(record, dict):
            continue

        print(f"Keys found: {list(record.keys())}")  # which fields this record carries

        has_required_keys = 'value' in record and 'metrics' in record
        if not has_required_keys:
            print(f"Skipping entry due to missing 'value' or 'metrics': {record}")
            continue

        collected.extend(convert_to_bio(record))

    if len(collected) == 0:
        print("No valid entries found for BIO conversion.")

    return collected

# Save to JSON file
def save_to_json(bio_format, filename):
    """Write (token, label) pairs to ``filename`` as a JSON array.

    Each pair becomes an object with the keys "Token" and "BIO Label"; the
    array is pretty-printed with a four-space indent.
    """
    # Build one record per pair before the output file is opened.
    records = []
    for token, label in bio_format:
        records.append({"Token": token, "BIO Label": label})

    with open(filename, 'w') as handle:
        json.dump(records, handle, indent=4)

# Process all JSON files in the input folder
def process_folder(input_folder, output_folder):
    """Convert every ``.json`` file of ``input_folder`` and store the results.

    ``output_folder`` is created when missing. For each directory entry whose
    name ends in '.json', the file is run through ``process_json_file`` and the
    result is written with ``save_to_json`` to ``output_folder`` under the
    original name prefixed with 'bio_'. Other entries are ignored.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The directory is listed once, up front; only '.json' names are kept.
    json_names = [name for name in os.listdir(input_folder) if name.endswith('.json')]

    for name in json_names:
        source_path = os.path.join(input_folder, name)
        print(f"Processing file: {source_path}")

        target_path = os.path.join(output_folder, f'bio_{name}')
        save_to_json(process_json_file(source_path), target_path)

        print(f"SAVED: {target_path}")

# Define input and output folders.
# NOTE(review): both are hard-coded absolute paths on the author's machine; the cell
# only runs where the input folder exists — consider a configurable base directory.
input_folder = '/Users/roselynnnn/Downloads/Raw_Json'  # Replace with your input folder path
output_folder = '/Users/roselynnnn/Downloads/Raw_Json_Bio'  # Replace with your output folder path

# Process the folder: process_folder creates output_folder if needed and writes one
# 'bio_<name>' JSON file per '.json' file found in input_folder.
process_folder(input_folder, output_folder)

Processing file: /Users/roselynnnn/Downloads/Raw_Json/project-2-at-2024-11-07-21-01-a53c81a7.json
Processing entry 0: {'value': 'In 2023, there were no workers with high risk of diseases related to their occupation.', 'id': 1692, 'metrics': [{'start': 31, 'end': 85, 'text': 'with high risk of diseases related to their occupation', 'labels': ['metric']}, {'start': 20, 'end': 22, 'text': 'no', 'labels': ['value']}], 'annotator': 1, 'annotation_id': 96, 'created_at': '2024-11-06T11:37:53.895741Z', 'updated_at': '2024-11-06T11:37:53.895741Z', 'lead_time': 11.289}
Keys found: ['value', 'id', 'metrics', 'annotator', 'annotation_id', 'created_at', 'updated_at', 'lead_time']
Processing entry 1: {'value': 'Percentage of employees at the manufacturing division of Haw Par Healthcare Singapore covered by a collective bargaining agreement: 2023 2022 2021 67% 65% 77% Supply Chain To guide and   Committed to an ethical and accountable procurement process that maintains encourage integrity and fairnes