In [4]:
import os
import json
import pandas as pd

path = "database/qwen06b_chunks_export.jsonl"

print("directory listing:", os.listdir("database"))
print("path exists:", os.path.exists(path))
if os.path.exists(path):
    print("file size bytes:", os.path.getsize(path))

# Show raw first non-empty lines (as bytes) to detect BOM / non-json content / array starts
if os.path.exists(path):
    with open(path, "rb") as f:
        shown = 0
        while shown < 5:
            raw = f.readline()
            if not raw:
                break
            if not raw.strip():
                continue
            print(f"RAW LINE {shown+1}:", repr(raw[:200]))
            shown += 1

# Try parsing the first non-empty line with json.loads to get the exact error
if os.path.exists(path):
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
                print("First non-empty line parsed OK. Keys:", list(obj.keys())[:10])
            except Exception as e:
                print("json.loads error on first non-empty line:", repr(e))
            break


# Robust JSONL reader: falls back to per-line json.loads and skips malformed lines
def read_jsonl(path, nrows=None, skip_bad=True):
    rows = []
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except Exception as e:
                if not skip_bad:
                    raise
            if nrows and len(rows) >= nrows:
                break
    return pd.DataFrame.from_records(rows)


# Use the robust reader (nrows=10). If this works, the file contains valid NDJSON lines but pandas.read_json failed due to e.g. stray characters.
try:
    df = read_jsonl(path, nrows=10)
    print("Loaded with fallback reader. shape=", df.shape)
    print(df.head().to_dict())
except Exception as e:
    print("Fallback reader failed:", repr(e))

# If you prefer pandas.read_json and the file is valid NDJSON, this should work:
try:
    df2 = pd.read_json(path, lines=True, nrows=10)
    print("pandas.read_json succeeded. shape=", df2.shape)
except Exception as e:
    print("pandas.read_json error:", repr(e))

directory listing: ['.DS_Store', 'milvusdb.py', '__init__.py', '__pycache__', 'create_collections.sh', 'pg', 'logs', 'milvus.yaml', 'qwen06b_chunks_export.jsonl', 'milvus']
path exists: True
file size bytes: 11137830167
RAW LINE 1: b"{'text': 'Detection of Abundant CO<SUB>2</SUB> Ice in the Quiescent Dark Cloud Medium toward Elias 16', 'doi': '10.1086/311318', 'citation_count': 154, 'pubdate': 19980501, 'vector': [-0.0254653915762"
RAW LINE 2: b"{'text': 'Abstract: We report the first detection of solid carbon dioxide (CO<SUB>2</SUB>) in quiescent regions of a dark cloud in the solar neighborhood, a result that has important implications for "
RAW LINE 3: b"{'text': '<P />Based on observations with Infrared Space Observatory, a European Space Agency (ESA) project, with instruments funded by ESA Member States (especially the PI countries France, Germany, "
RAW LINE 4: b'{\'text\': "1. INTRODUCTION Icy mantles are an important component of the interstellar dust in molecular clouds (see C

In [7]:
with open('database/qwen06b_chunks_export.jsonl', 'r') as file:
    line = file.readline()
    print(line)
    entity = json.loads(line)
    print(entity.keys())


{'text': 'Detection of Abundant CO<SUB>2</SUB> Ice in the Quiescent Dark Cloud Medium toward Elias 16', 'doi': '10.1086/311318', 'citation_count': 154, 'pubdate': 19980501, 'vector': [-0.02546539157629013, -0.03687753528356552, -0.01394884567707777, 0.022668389603495598, 0.005709933582693338, 0.048855461180210114, -0.03132718428969383, -0.06980938464403152, -0.04449668154120445, -0.049390338361263275, 0.02322133630514145, 0.00044834346044808626, -0.038608938455581665, -0.0142024215310812, -0.05647306516766548, 0.09589236229658127, 0.01706922985613346, 0.08217199891805649, 0.01686396449804306, -0.012182929553091526, -0.07618838548660278, 0.005816952791064978, -0.048858750611543655, 0.10654010623693466, 0.07222149521112442, -0.023019568994641304, -0.009670156054198742, 0.03942923992872238, 0.0009249498252756894, -0.007472061552107334, 0.06820102781057358, 0.03146359324455261, -0.020698653534054756, -0.014889907091856003, 0.019375456497073174, -0.01727404072880745, -0.0510011725127697, 0.

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)