In [3]:
def extract_information_from_json(json_data=""):
    """
    Group the text elements of an extraction result into titled sections.

    Elements are read in order from ``json_data["elements"]``. The ``"Path"``
    of each element decides its role:

    * paths containing ``/Footnote`` are skipped;
    * ``/Title`` and ``/H1`` start a new section keyed by the element text
      (an ``/H1`` without text is named ``"Untitled Section"``);
    * ``/H2`` (outside a ``/Figure``) starts a sub-section of the current
      section (``"Untitled Subsection"`` when it has no text);
    * any other non-empty text is content of the current section, or is
      collected under ``"Other"`` when no section has started yet.

    Args:
        json_data (dict): Parsed JSON holding an ``"elements"`` list of dicts
            with optional ``"Path"`` and ``"Text"`` entries. Empty input
            (including the ``""`` default) yields ``{"Other": []}``.

    Returns:
        dict: ``"Other"`` maps to a list with the texts seen before any
        section, followed by the titles of sections that stayed empty. Every
        other key is a section title mapped to a list of plain strings and
        ``{sub_header: text}`` dicts. A section whose only content is a
        trailing sub-header without text maps to that sub-header string.

    Raises:
        KeyError: If a non-empty ``json_data`` has no ``"elements"`` key.
    """
    # Nothing to process; this also makes the "" default argument usable.
    if not json_data:
        return {"Other": []}

    elements = json_data["elements"]

    header = None          # title of the section currently being filled
    sub_header = None      # title of the current sub-section, if any
    pending = []           # paragraphs collected since the last heading
    structured_data = {}
    unmatched_texts = []   # paragraphs seen before any section started

    def flush_pending():
        """Move the collected paragraphs into the current section."""
        if not pending:
            return
        joined = " ".join(pending)
        structured_data[header].append(
            {sub_header: joined} if sub_header else joined)
        pending.clear()

    for element in elements:
        # Tolerate explicit nulls as well as missing keys.
        path = element.get("Path") or ""
        text = (element.get("Text") or "").strip()

        # Ignore references and footnotes
        if "/Footnote" in path:
            continue

        if "/Title" in path:
            # Close the previous section first so its paragraphs are not
            # credited to the new title.
            flush_pending()
            header = text
            sub_header = None
            structured_data.setdefault(header, [])

        elif "/H1" in path or ("/H2" in path and "/Figure" not in path):
            flush_pending()
            if "/H1" in path:
                header = text if text else "Untitled Section"
                # setdefault keeps earlier content when a heading repeats.
                structured_data.setdefault(header, [])
                sub_header = None
            if "/H2" in path:
                sub_header = text if text else "Untitled Subsection"

        # Handle paragraphs and other text
        elif text:
            if header and not structured_data.get(header, []):
                # The first paragraph of an empty section is stored directly
                # in its list, even when a sub-header is already pending.
                structured_data[header] = [text]
            elif header:
                pending.append(text)
            else:
                unmatched_texts.append(text)

    # Add the last accumulated data
    if header:
        if pending:
            flush_pending()
        elif sub_header:
            section = structured_data[header]
            if section:
                # Keep what the section already holds; record the dangling
                # sub-header as a plain entry instead of overwriting.
                section.append(sub_header)
            else:
                # Nothing to lose: keep the established string form.
                structured_data[header] = sub_header

    # Combine unmatched texts and the titles of empty sections into "Other"
    combined_key = "Other"
    other_items = unmatched_texts
    combined_data = {combined_key: other_items}

    for title, section in structured_data.items():
        if isinstance(section, list) and not section:  # Empty lists
            other_items.append(title)
        elif title == combined_key:
            # A section literally titled "Other" is merged with the unmatched
            # texts rather than replacing them.
            if isinstance(section, list):
                other_items.extend(section)
            else:
                other_items.append(section)
        else:
            combined_data[title] = section

    return combined_data

In [4]:
import json

# Path of the structuredData.json to parse, relative to the working directory.
file_path = "../backend/output/ExtractTextInfoFromPDF/extract2024-12-13T01-19-16/structuredData.json"

# Decode explicitly as UTF-8: the file contains non-ASCII text (e.g. bullet
# characters), and open() otherwise falls back to a locale-dependent encoding.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)


# Group the extracted elements into sections for the cells below.
output = extract_information_from_json(data)

In [6]:
# Display the sections returned by extract_information_from_json.
output

{'Other': ['ABHISHEK PANDEY',
  'abhi526691shek@gmail.com',
  'Linkedin',
  'Github',
  'Hackerrank',
  '20 doerr rd, M1P 3A1, ON (5*)'],
 'Experience': ['Jan 2022 - Sept 2023',
  {'FullStack Generative AI Developer': 'Dimensionless Technologies, Remote ● AUSHADHAI Built and deployed the reconciliation feature within , automating data alignment and discrepancy resolution across pharmacy systems, reducing reconciliation time by 40%. ● Developed scalable backend APIs using Django, integrated with AWS and Azure databases, resulting in a 35% improvement in system performance and data processing efficiency. ● PROPELPRO Led the automation of document processing and image recognition tasks in with AI-driven solutions, achieving a 40% increase in processing efficiency and contributing to 20% revenue growth through improved client deliverables. ● Designed pipelines for Q&A and summarization,leveraging Generative AI to improve large document analysis, enhancing user productivity by 30% and ensur

In [9]:
def flatten_values_to_string(data):
    """
    Collapse every value of a dictionary into one plain string.

    Nested dictionaries are rendered as "key: value" pairs separated by "; ",
    lists are rendered as their items separated by ", ", and anything else is
    passed through ``str``. Nesting is handled recursively.

    Parameters:
    - data (dict): Mapping whose values may be scalars, lists, or dicts.

    Returns:
    - dict: A new dictionary with the same keys and string values.
    """

    def to_text(node):
        # Containers recurse into their members; leaves are stringified.
        if isinstance(node, dict):
            pairs = [f"{name}: {to_text(child)}" for name, child in node.items()]
            return "; ".join(pairs)
        if isinstance(node, list):
            return ", ".join(map(to_text, node))
        return str(node)

    return {key: to_text(item) for key, item in data.items()}

# Preview the extracted sections with every value flattened to one string.
flatten_values_to_string(output)

{'Other': 'ABHISHEK PANDEY, abhi526691shek@gmail.com, Linkedin, Github, Hackerrank, 20 doerr rd, M1P 3A1, ON (5*)',
 'Experience': 'Jan 2022 - Sept 2023, FullStack Generative AI Developer: Dimensionless Technologies, Remote ● AUSHADHAI Built and deployed the reconciliation feature within , automating data alignment and discrepancy resolution across pharmacy systems, reducing reconciliation time by 40%. ● Developed scalable backend APIs using Django, integrated with AWS and Azure databases, resulting in a 35% improvement in system performance and data processing efficiency. ● PROPELPRO Led the automation of document processing and image recognition tasks in with AI-driven solutions, achieving a 40% increase in processing efficiency and contributing to 20% revenue growth through improved client deliverables. ● Designed pipelines for Q&A and summarization,leveraging Generative AI to improve large document analysis, enhancing user productivity by 30% and ensuring scalability for data solut