##### JSON parsing and Processing

In [1]:
from langchain_community.document_loaders import JSONLoader
import json

###### Method:1 JSONLoader with jq_schema

In [3]:
employee_loader = JSONLoader(
    file_path='data/json_files/company_data.json',
    jq_schema='.employees[]',
    text_content=False
)    


employee_docs = employee_loader.load()
print(f"Loaded {len(employee_docs)} employee documents")
print(f"First employee: {employee_docs[0].page_content[:200]}")

Loaded 2 employee documents
First employee: {"id": 1, "name": "John Doe", "role": "Software Engineer", "skills": ["Python", "JavaScript", "React"], "projects": [{"name": "RAG System", "status": "In Progress"}, {"name": "Data Pipeline", "status"


In [4]:
print(employee_docs)

[Document(metadata={'source': '/home/bhupen/Python_Learning/RAG/DataIngestParsing/data/json_files/company_data.json', 'seq_num': 1}, page_content='{"id": 1, "name": "John Doe", "role": "Software Engineer", "skills": ["Python", "JavaScript", "React"], "projects": [{"name": "RAG System", "status": "In Progress"}, {"name": "Data Pipeline", "status": "Completed"}]}'), Document(metadata={'source': '/home/bhupen/Python_Learning/RAG/DataIngestParsing/data/json_files/company_data.json', 'seq_num': 2}, page_content='{"id": 2, "name": "Jane Smith", "role": "Data Scientist", "skills": ["Python", "Machine Learning", "SQL"], "projects": [{"name": "ML Model", "status": "In Progress"}, {"name": "Analytics Dashboard", "status": "Planning"}]}')]


###### Method:2 Custom JSON parsing for Complex structures

In [5]:
from langchain_core.documents import Document

In [15]:
def process_json_intelligently(file_path:str):
    with open(file_path,'r') as f:
        data = json.load(f)

    documents = []

    for emp in data.get('employees',[]):
        content = f"""Employee Profile:
        Name : {emp['name']}
        Role: {emp['role']}
        Skills: {','.join(emp['skills'])}
        
        Projects:"""
        for proj in emp.get('projects',[]):
            content += f"\n- {proj['name']} (Status: {proj['status']})"

        doc = Document(
            page_content=content,
            metadata={
                'source':file_path,
                'data_type':'employee_profile',
                'employee_id':emp['id'],
                'employee_name':emp['name'],
                'role':emp['role']
            }
        )

        documents.append(doc)
    return documents
    

In [16]:
process_json_intelligently('data/json_files/company_data.json')

[Document(metadata={'source': 'data/json_files/company_data.json', 'data_type': 'employee_profile', 'employee_id': 1, 'employee_name': 'John Doe', 'role': 'Software Engineer'}, page_content='Employee Profile:\n        Name : John Doe\n        Role: Software Engineer\n        Skills: Python,JavaScript,React\n\n        Projects:\n- RAG System (Status: In Progress)\n- Data Pipeline (Status: Completed)'),
 Document(metadata={'source': 'data/json_files/company_data.json', 'data_type': 'employee_profile', 'employee_id': 2, 'employee_name': 'Jane Smith', 'role': 'Data Scientist'}, page_content='Employee Profile:\n        Name : Jane Smith\n        Role: Data Scientist\n        Skills: Python,Machine Learning,SQL\n\n        Projects:\n- ML Model (Status: In Progress)\n- Analytics Dashboard (Status: Planning)')]