In [1]:
import os
import json
import pandas as pd

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.vectorstores.utils import DistanceStrategy

from transformers import AutoTokenizer

from src.process_html import (
    load_documents_from_jsonl,
)

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


# 1. Import Documents

Import cleaned Documents from notebook `1. data_cleaning.ipynb`.

In [2]:
data_dir = os.path.join('.', 'data')

In [3]:
# Reload saved files if not running from the beginning
filename = os.path.join(data_dir, "documents_{}.jsonl")

documents_sports = load_documents_from_jsonl(input_path=filename.format("sports"))
documents_finance = load_documents_from_jsonl(input_path=filename.format("finance"))
documents_movie = load_documents_from_jsonl(input_path=filename.format("movie"))

In [8]:
# A sample of Document
print(documents_sports[2])

page_content='PPG
14.3
RPG
3.0
APG
8.5
HEIGHT
6'3" (1.91m)
WEIGHT
178lb (81kg)
COUNTRY
Canada
LAST ATTENDED
Santa Clara
BIRTHDATE
February 7, 1974
DRAFT
1996 R1 Pick 15
EXPERIENCE
19 Years
6'3" | 178lb
DRAFT
1996 R1 Pick 15
BIRTHDATE
February 7, 1974
COUNTRY
Canada
LAST ATTENDED
Santa Clara
EXPERIENCE
19 Years
| Stat | Career |' metadata={'source': 'https://www.nba.com/player/959/steve-nash/', 'title': 'Steve Nash | Phoenix Suns', 'description': 'Steve Nash Stats and news - NBA stats and news on Phoenix Suns Guard Steve Nash', 'interaction_id': '7bb29eb4-12f9-45f9-bf8a-66832b3c8962'}


# 2. Setup Embedding Model

Might need to login Huggingface by runing `huggingface-cli login` in terminal.

In [4]:
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)

  embedding_model = HuggingFaceEmbeddings(


In [5]:
# get tokenizer
embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)

# 3. Split Documents into Chunks

In [6]:
# Config
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

CHUNK_SIZE = 512 # max input length of EMBEDDING_MODEL
CUNK_OVERLAP = 64

In [7]:
spliter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer=embedding_tokenizer,
    chunk_size=CHUNK_SIZE, 
    chunk_overlap=CUNK_OVERLAP,
    add_start_index=True,
    strip_whitespace=True,
    separators=MARKDOWN_SEPARATORS,
)

In [8]:
sport_chunks = spliter.split_documents(documents_sports)
finance_chunks = spliter.split_documents(documents_finance)
movie_chunks = spliter.split_documents(documents_movie)

In [19]:
# A sample of chunked Document
print(finance_chunks[4])

page_content='Microsoft Stock Filter Stocks by Fundamentals
| MSFT Stock | USD 402.65 12.27 2.96% |
Microsoft fundamentals help investors to digest information that contributes to Microsoft's financial success or failures. It also enables traders to predict the movement of Microsoft Stock. The fundamental analysis module provides a way to measure Microsoft's intrinsic value by examining its available economic and financial indicators, including the cash flow records, the balance sheet account changes, the income statement patterns, and various microeconomic indicators and financial ratios related to Microsoft stock.
Microsoft | Debt to Equity |
Microsoft Debt to Equity Analysis
| 2021 | 2022 | 2023 | 2024 (projected) | Capital Expenditures | 23.9B | 28.1B | 32.3B | 33.9B | Depreciation | 14.5B | 13.9B | 15.9B | 16.7B |
Microsoft Debt to Equity Driver Correlations
Understanding the fundamental principles of building solid financial models for Microsoft is extremely important. It helps t

In [9]:
# Create a unified chunk for evaluation usage
unified_chunk = sport_chunks + finance_chunks + movie_chunks

# 4. Create Vector Database

Create vector database for each of the three selected domains.

In [21]:
sports_vector_db = FAISS.from_documents(
    documents=sport_chunks,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

finance_vector_db = FAISS.from_documents(
    documents=finance_chunks,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

movie_vector_db = FAISS.from_documents(
    documents=movie_chunks,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

In [10]:
# create a unified vector db for evaluation usage
unified_vector_db = FAISS.from_documents(
    documents=unified_chunk,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

In [None]:
# save vector dbs to disk
db_dir = os.path.join(data_dir, "faiss")

sports_vector_db.save_local(
    os.path.join(db_dir, "sports")
)

finance_vector_db.save_local(
    os.path.join(db_dir, "finance")
)

movie_vector_db.save_local(
    os.path.join(db_dir, "movie")
)

In [11]:
unified_vector_db.save_local(
    os.path.join(db_dir, "unified")
)

NameError: name 'db_dir' is not defined