# JSON


https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/json/

In [1]:
import sys
# Extend the module search path two directories up so that the project-local
# `document_loaders` package imported below can be resolved
# (presumably the repository root when run from this examples folder - confirm).
sys.path.append('../../')

import json
from pathlib import Path
from pprint import pprint

# Project-local loading helpers demonstrated throughout this notebook.
from document_loaders.load_document import load_document, load_document_lazy, DocumentLoader

## Using load_document

In [14]:
# Preview the raw JSON file so the loader results below can be compared with it.
file_path = './files/some_json.json'
# JSON text is UTF-8 by specification; decode it explicitly rather than relying
# on the platform's default locale encoding (which differs across systems).
pprint(json.loads(Path(file_path).read_text(encoding='utf-8')))

{'web-app': {'servlet': [{'init-param': {'cachePackageTagsRefresh': 60,
                                         'cachePackageTagsStore': 200,
                                         'cachePackageTagsTrack': 200,
                                         'cachePagesDirtyRead': 10,
                                         'cachePagesRefresh': 10,
                                         'cachePagesStore': 100,
                                         'cachePagesTrack': 200,
                                         'cacheTemplatesRefresh': 15,
                                         'cacheTemplatesStore': 50,
                                         'cacheTemplatesTrack': 100,
                                         'configGlossary:adminEmail': 'ksm@pobox.com',
                                         'configGlossary:installationAt': 'Philadelphia, '
                                                                          'PA',
                                         'configGlossary:

In [3]:
# Load the JSON file into a list of documents in one call.
docs = load_document(
    './files/some_json.json',
    text_splitter="auto",
)

# Number of documents returned by the loader.
print(len(docs))

# Inspect the second document: its metadata first, then its content parsed back into JSON.
second_doc = docs[1]
pprint(second_doc.metadata)
pprint(json.loads(second_doc.page_content))

2
{'seq_num': 1,
 'source': '/Users/antonio/Desktop/DataScience/MyCode/langchain-document_db/examples/document_loaders/files/some_json.json'}
{'web-app': {'servlet-mapping': {'cofaxAdmin': '/admin/*',
                                 'cofaxCDS': '/',
                                 'cofaxEmail': '/cofaxutil/aemail/*',
                                 'cofaxTools': '/tools/*',
                                 'fileServlet': '/static/*'},
             'taglib': {'taglib-location': '/WEB-INF/tlds/cofax.tld',
                        'taglib-uri': 'cofax.tld'}}}


### Customizing the JSON parsing and loading

See the jq documentation for more information on which jq schemas are supported:
https://jqlang.github.io/jq/

In [13]:
# Preview the raw chat export so the jq-based extraction below can be compared with it.
file_path = './files/facebook_chat.json'
# JSON text is UTF-8 by specification; decode it explicitly rather than relying
# on the platform's default locale encoding (which differs across systems).
pprint(json.loads(Path(file_path).read_text(encoding='utf-8')))

{'image': {'creation_timestamp': 1675549016, 'uri': 'image_of_the_chat.jpg'},
 'is_still_participant': True,
 'joinable_mode': {'link': '', 'mode': 1},
 'magic_words': [],
 'messages': [{'content': 'Bye!',
               'sender_name': 'User 2',
               'timestamp_ms': 1675597571851},
              {'content': 'Oh no worries! Bye',
               'sender_name': 'User 1',
               'timestamp_ms': 1675597435669},
              {'content': 'No Im sorry it was my mistake, the blue one is not '
                          'for sale',
               'sender_name': 'User 2',
               'timestamp_ms': 1675596277579},
              {'content': 'I thought you were selling the blue one!',
               'sender_name': 'User 1',
               'timestamp_ms': 1675595140251},
              {'content': 'Im not interested in this bag. Im interested in the '
                          'blue one!',
               'sender_name': 'User 1',
               'timestamp_ms': 1675595109305},
   

In [5]:
# Use a jq expression so that each message's `content` field becomes one document.
docs = load_document('./files/facebook_chat.json', jq_schema='.messages[].content')

# Number of documents returned by the loader.
print(len(docs))

# Inspect the document at index 10 (the last of the 11 documents in the recorded run).
sample_doc = docs[10]
pprint(sample_doc.metadata)
pprint(sample_doc.page_content)

11
{'seq_num': 11,
 'source': '/Users/antonio/Desktop/DataScience/MyCode/langchain-document_db/examples/document_loaders/files/facebook_chat.json'}
('Hi! Im interested in your bag. Im offering $50. Let me know if you are '
 'interested. Thanks!')


#### JSON Lines file

If you want to load documents from a JSON Lines file, pass `json_lines=True` and specify `jq_schema` to extract `page_content` from a single JSON object (one object per line).

In [6]:
# Preview the raw JSON Lines file (one JSON object per line) before loading it.
file_path = './files/facebook_chat_messages.jsonl'
# Decode explicitly as UTF-8 rather than relying on the platform's default
# locale encoding, so the preview is the same on every system.
pprint(Path(file_path).read_text(encoding='utf-8'))

('{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}\n'
 '{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no '
 'worries! Bye"}\n'
 '{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im '
 'sorry it was my mistake, the blue one is not for sale"}\n')


In [7]:
# Load a JSON Lines file: `jq_schema` is applied to every line's JSON object.
docs = load_document(
    './files/facebook_chat_messages.jsonl',
    json_lines=True,
    jq_schema='.content',
)

# Number of documents returned by the loader.
print(len(docs))

# Inspect the third document (the last of the 3 documents in the recorded run).
third_doc = docs[2]
pprint(third_doc.metadata)
pprint(third_doc.page_content)

3
{'seq_num': 3,
 'source': '/Users/antonio/Desktop/DataScience/MyCode/langchain-document_db/examples/document_loaders/files/facebook_chat_messages.jsonl'}
'No Im sorry it was my mistake, the blue one is not for sale'


## Using load_document_lazy

In [15]:
# Iterate over the documents yielded by load_document_lazy. Enumerating from 1
# leaves `count` equal to the number of documents seen (0 if none were yielded)
# and `doc` bound to the last document produced.
count = 0
for count, doc in enumerate(
    load_document_lazy('./files/some_json.json', text_splitter="auto"),
    start=1,
):
    pass

print(count)

# Show the last document: metadata first, then its content parsed back into JSON.
pprint(doc.metadata)
pprint(json.loads(doc.page_content))

2
{'seq_num': 1,
 'source': '/Users/antonio/Desktop/DataScience/MyCode/langchain-document_db/examples/document_loaders/files/some_json.json'}
{'web-app': {'servlet-mapping': {'cofaxAdmin': '/admin/*',
                                 'cofaxCDS': '/',
                                 'cofaxEmail': '/cofaxutil/aemail/*',
                                 'cofaxTools': '/tools/*',
                                 'fileServlet': '/static/*'},
             'taglib': {'taglib-location': '/WEB-INF/tlds/cofax.tld',
                        'taglib-uri': 'cofax.tld'}}}


#### JSON Lines file

In [16]:
# Iterate over the documents yielded for the JSON Lines file. Enumerating from 1
# leaves `count` equal to the number of documents seen (0 if none were yielded)
# and `doc` bound to the last document produced.
count = 0
for count, doc in enumerate(
    load_document_lazy(
        './files/facebook_chat_messages.jsonl',
        json_lines=True,
        jq_schema='.content',
    ),
    start=1,
):
    pass

print(count)

# Show the last document's metadata and text content.
pprint(doc.metadata)
pprint(doc.page_content)

3
{'seq_num': 3,
 'source': '/Users/antonio/Desktop/DataScience/MyCode/langchain-document_db/examples/document_loaders/files/facebook_chat_messages.jsonl'}
'No Im sorry it was my mistake, the blue one is not for sale'


## Using DocumentLoader

In [9]:
# Build a loader object for the JSON file; documents are pulled from it in the next cell.
loader = DocumentLoader(
    './files/some_json.json',
    text_splitter="auto",
)

In [10]:
# Walk through the documents yielded by the loader. Enumerating from 1 leaves
# `count` equal to the number of documents seen (0 if none were yielded) and
# `doc` bound to the last document produced.
count = 0
for count, doc in enumerate(loader.lazy_load(), start=1):
    pass

print(count)

# Show the last document: metadata first, then its content parsed back into JSON.
pprint(doc.metadata)
pprint(json.loads(doc.page_content))

2
{'seq_num': 1,
 'source': '/Users/antonio/Desktop/DataScience/MyCode/langchain-document_db/examples/document_loaders/files/some_json.json'}
{'web-app': {'servlet-mapping': {'cofaxAdmin': '/admin/*',
                                 'cofaxCDS': '/',
                                 'cofaxEmail': '/cofaxutil/aemail/*',
                                 'cofaxTools': '/tools/*',
                                 'fileServlet': '/static/*'},
             'taglib': {'taglib-location': '/WEB-INF/tlds/cofax.tld',
                        'taglib-uri': 'cofax.tld'}}}


#### JSON Lines file

In [11]:
# Build a loader for the JSON Lines file; `jq_schema` selects the `content`
# field of each line's JSON object.
loader = DocumentLoader(
    './files/facebook_chat_messages.jsonl',
    json_lines=True,
    jq_schema='.content',
)

In [12]:
# Walk through the documents yielded by the loader. Enumerating from 1 leaves
# `count` equal to the number of documents seen (0 if none were yielded) and
# `doc` bound to the last document produced.
count = 0
for count, doc in enumerate(loader.lazy_load(), start=1):
    pass

print(count)

# Show the last document's metadata and text content.
pprint(doc.metadata)
pprint(doc.page_content)

3
{'seq_num': 3,
 'source': '/Users/antonio/Desktop/DataScience/MyCode/langchain-document_db/examples/document_loaders/files/facebook_chat_messages.jsonl'}
'No Im sorry it was my mistake, the blue one is not for sale'
