In [55]:
import os
import json
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from dotenv import load_dotenv
 
load_dotenv()

True

In [102]:
json_dir = './json/content'
section_names = [x.replace('.json','') for x in os.listdir(json_dir) if '.json' in x]

docs = []
for i,section_name in enumerate(section_names):
    with open(f'{json_dir}/{section_name}.json', 'r') as f:
        # Parsing the JSON file into a Python dictionary
        content_list = json.load(f)
        for j,content in enumerate(content_list):
            text_content = f"Section: {section_name}\n\nTitle: {content['title']}\n\nContent: {content['content']}"
            metadata = {
                "section": section_name,
                "title": content['title'],
                "doc_id": f"section{i}-title{j}"
            }
            doc = Document(
                                page_content=text_content,
                                metadata=metadata
                            )
            docs.append(doc)

In [103]:
recursive_splitter = RecursiveCharacterTextSplitter(separators=['\n\n','\n','. ','?','; ',':',', '], 
                                                    chunk_size=100,
                                                    chunk_overlap=0)

In [106]:
chunks = recursive_splitter.split_documents(docs)
print(len(chunks))

15840


In [105]:
for i, chunk in enumerate(chunks[100:110]):
    print(f"Chunk {i}: len: {len(chunk.page_content)}")
    print(chunk.page_content)
    print(chunk.metadata)

Chunk 0: len: 16
. Lock-up period
{'section': 'Copy Trading', 'title': 'How to Lead Trade on Binance Futures?', 'doc_id': 'section0-title1'}
Chunk 1: len: 103
: The feature will be automatically unlocked after the portfolio has been fully copied with 200 copiers
{'section': 'Copy Trading', 'title': 'How to Lead Trade on Binance Futures?', 'doc_id': 'section0-title1'}
Chunk 2: len: 78
. You can click the edit button to set your lock-up period to 7D, 14D and 30D.
{'section': 'Copy Trading', 'title': 'How to Lead Trade on Binance Futures?', 'doc_id': 'section0-title1'}
Chunk 3: len: 50
How does risk management work for lead portfolios?
{'section': 'Copy Trading', 'title': 'How to Lead Trade on Binance Futures?', 'doc_id': 'section0-title1'}
Chunk 4: len: 72
1. Portfolio Risk Management for Public Portfolios:
Leverage Limitation:
{'section': 'Copy Trading', 'title': 'How to Lead Trade on Binance Futures?', 'doc_id': 'section0-title1'}
Chunk 5: len: 93
When a portfolio’s AUM (Assets Under M

In [107]:
"""Create and persist ChromDB vector store"""
persist_directory = "db/Chroma_db"
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Create vector store Chroma-DB
print("-- Creating Vector Store")
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space":"cosine"}
)
print("-- Finish Creating Vector Store")
print(f"Vector Store created and saved to {persist_directory}")

-- Creating Vector Store
-- Finish Creating Vector Store
Vector Store created and saved to db/Chroma_db
