# Experiments

## Imports

In [4]:
# Imports
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [1]:
# Langchain
from langchain_core.runnables import  RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
    Docx2txtLoader,
    UnstructuredWordDocumentLoader,
    JSONLoader,
    UnstructuredMarkdownLoader
)
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
### ChatModels (https://python.langchain.com/docs/integrations/chat/)
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic

### Embeddings
# import chromadb
# from langchain_ollama import OllamaEmbeddings
# from langchain_chroma import Chroma
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Suppress warnings
# Installs a global 'ignore' filter. It is registered after the imports above,
# so warnings emitted while those modules were being imported are not affected.
import warnings
warnings.filterwarnings('ignore')

# Jupyter-specific imports
from IPython.display import display, Markdown

# Set environment variable for protobuf
# NOTE(review): presumably this has to be set before protobuf is first imported
# to have any effect — confirm none of the imports above already pulled it in.
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
# load_dotenv() (python-dotenv) reads key/value pairs from a .env file into the
# process environment; LLM_TYPE is read from the environment further down.
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Show where the kernel is running; the relative data/ paths below resolve against it.
print(f"Current working directory: {os.getcwd()}")

Current working directory: z:\Git_PhD\ChatDocumentRAG


## Data Loaders

- Documentation at https://python.langchain.com/docs/integrations/document_loaders/
- Download data from https://huggingface.co/datasets

### Text file

In [3]:
# Read the sample text file into LangChain documents.
loader = TextLoader("data/sample_docs/sample.txt")
docs = loader.load()

# Split the loaded text into chunks (chunk_size=250, chunk_overlap=24).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=24,
    chunk_size=250,
)
documents = text_splitter.split_documents(docs)

# Report the container type, then display the chunks themselves.
print(type(documents))
documents

<class 'list'>


[Document(metadata={'source': 'data/sample_docs/sample.txt'}, page_content='Hello,'),
 Document(metadata={'source': 'data/sample_docs/sample.txt'}, page_content='On 22 Jan, 2024 we had a flight from Kolkata (CCU) to Stockholm (ARN) (PNR  JV418P), with connections in Mumbai (BOM) and Istanbul (IST). The flight from Mumbai to Istanbul was delayed 8 hours. It was supposed to leave around 06:55 a.m., however it'),
 Document(metadata={'source': 'data/sample_docs/sample.txt'}, page_content='06:55 a.m., however it left at 14:30 on 23 Jan, 2024. This delay cost us numerous damages, including financial, academic, and work-related, which are described below.'),
 Document(metadata={'source': 'data/sample_docs/sample.txt'}, page_content='1. We missed our flight to Stockholm (ARN) on January 23, 2024 from Istanbul.\n2. We also missed our pre-booked train from Stockholm to our home on the same day (cost: 1626 SEK squandered).'),
 Document(metadata={'source': 'data/sample_docs/sample.txt'}, page_cont

### PDF file

In [4]:
# Load the sample PDF with PyPDFLoader.
loader1 = PyPDFLoader("data/sample_docs/sample.pdf")
docs = loader1.load()

# Same splitter settings as for the text file: chunk_size=250, chunk_overlap=24.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=24,
    chunk_size=250,
)
documents = text_splitter.split_documents(docs)

# Report the container type, then display the chunks (metadata included).
print(type(documents))
documents

<class 'list'>


[Document(metadata={'producer': 'pdfTeX', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-11-01T07:58:19+00:00', 'gts_pdfa1version': 'PDF/A-1b:2005', 'moddate': '2024-11-01T07:58:19+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'trapped': '/False', 'source': 'data/sample_docs/sample.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='MARISSA MAYER\nBusiness Woman & Proud Geek\n@ mmayer@yahoo-inc.com /envel⌢peAddress, Street, 00000 County ♂¶ap-¶arkerSunnyvale, CA\n/gl⌢bemarissamayr.tumblr.com /xicon@marissamayer /linkedinmarissamayer\nEXPERIENCE\nPresident & CEO\nYahoo!'),
 Document(metadata={'producer': 'pdfTeX', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-11-01T07:58:19+00:00', 'gts_pdfa1version': 'PDF/A-1b:2005', 'moddate': '2024-11-01T07:58:19+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'trapped': '/False

### JSON file
- https://www.restack.io/docs/langchain-knowledge-json-loader-schema-cat-ai

In [5]:
import json
from pathlib import Path
from pprint import pprint


# Inspect the raw JSON so the structure is known before choosing a jq schema.
# The file is decoded as UTF-8 explicitly: Path.read_text() would otherwise use
# the platform's locale encoding, which can mis-decode non-ASCII characters.
file_path = 'data/sample_docs/sample.json'
data = json.loads(Path(file_path).read_text(encoding='utf-8'))
data

[{'state': 'Alabama',
  'slug': 'alabama',
  'code': 'AL',
  'nickname': 'Yellowhammer State',
  'website': 'http://www.alabama.gov',
  'admission_date': '1819-12-14',
  'admission_number': 22,
  'capital_city': 'Montgomery',
  'capital_url': 'http://www.montgomeryal.gov',
  'population': 4833722,
  'population_rank': 23,
  'constitution_url': 'http://alisondb.legislature.state.al.us/alison/default.aspx',
  'state_flag_url': 'https://cdn.civil.services/us-states/flags/alabama-large.png',
  'state_seal_url': 'https://cdn.civil.services/us-states/seals/alabama-large.png',
  'map_image_url': 'https://cdn.civil.services/us-states/maps/alabama-large.png',
  'landscape_background_url': 'https://cdn.civil.services/us-states/backgrounds/1280x720/landscape/alabama.jpg',
  'skyline_background_url': 'https://cdn.civil.services/us-states/backgrounds/1280x720/skyline/alabama.jpg',
  'twitter_url': 'https://twitter.com/alabamagov',
  'facebook_url': 'https://www.facebook.com/alabamagov'},
 {'state':

In [7]:
# The jq schema has to match the file's structure (see the raw dump in the
# previous cell). '.[]' selects every element of the top-level array; the
# original note also suggests '.[] | {state, code}' as a narrower alternative.
loader3 = JSONLoader(
    file_path='data/sample_docs/sample.json',
    jq_schema='.[]',
    text_content=False,
)
docs = loader3.load()

# Split with chunk_size=1500 and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1500,
)
documents = text_splitter.split_documents(docs)

# Report the container type, then display the resulting documents.
print(type(documents))
documents

<class 'list'>


[Document(metadata={'source': 'Z:\\Git_PhD\\ChatDocumentRAG\\data\\sample_docs\\sample.json', 'seq_num': 1}, page_content='{"state": "Alabama", "slug": "alabama", "code": "AL", "nickname": "Yellowhammer State", "website": "http://www.alabama.gov", "admission_date": "1819-12-14", "admission_number": 22, "capital_city": "Montgomery", "capital_url": "http://www.montgomeryal.gov", "population": 4833722, "population_rank": 23, "constitution_url": "http://alisondb.legislature.state.al.us/alison/default.aspx", "state_flag_url": "https://cdn.civil.services/us-states/flags/alabama-large.png", "state_seal_url": "https://cdn.civil.services/us-states/seals/alabama-large.png", "map_image_url": "https://cdn.civil.services/us-states/maps/alabama-large.png", "landscape_background_url": "https://cdn.civil.services/us-states/backgrounds/1280x720/landscape/alabama.jpg", "skyline_background_url": "https://cdn.civil.services/us-states/backgrounds/1280x720/skyline/alabama.jpg", "twitter_url": "https://twitt

## Ollama LLM Models (Local)

- Download models from https://ollama.com/library or https://huggingface.co/models
- Run `ollama list` to see available models
- Run `ollama pull llama3.2` to download the `llama3.2` model (could be deepseek, mistral, etc.)
- Run `ollama run llama3.2` to start the model

In [8]:
# Backend selector taken from the LLM_TYPE environment variable; "ollama" when unset.
llm_type = os.environ.get("LLM_TYPE", "ollama")
llm_type

'ollama'

In [5]:
# Shell out to the Ollama CLI to list the models available locally.
!ollama list

NAME                        ID              SIZE      MODIFIED     
mxbai-embed-large:latest    468836162de7    669 MB    24 hours ago    
deepseek-r1:8b              28f8fd6cdc67    4.9 GB    9 days ago      
nomic-embed-text:latest     0a109f422b47    274 MB    3 weeks ago     
llama3.2:latest             a80c4f17acd5    2.0 GB    3 weeks ago     


In [17]:
# Instantiate the chat model selected by `llm_type`. Every backend is created
# with temperature=0, and each branch prints its own greeting so the cell
# output shows which backend is actually active.
if llm_type == "ollama":
    llm = ChatOllama(model="llama3.2", temperature=0)
    print("Llama the wise")
elif llm_type == "openai":
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    # Distinct from the Anthropic greeting below, so the two cannot be confused.
    print("Hi, GPT here")
else:
    # Any other LLM_TYPE value falls through to Anthropic.
    llm = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0)
    print("Hi, Claude Shannon here")

# Build the LLMGraphTransformer on the selected model; the conversion call
# below is left disabled.
llm_transformer = LLMGraphTransformer(llm=llm)
# graph_documents = llm_transformer.convert_to_graph_documents(documents)

Llama the wise


## Embeddings

- [Chroma Vector Store](https://python.langchain.com/docs/integrations/vectorstores/chroma/)

In [27]:
# import chromadb
# # from langchain_community.vectorstores import Chroma
# from langchain_chroma import Chroma
# from langchain_ollama import OllamaEmbeddings

In [9]:
# Candidate embedding models, each paired with the dimension count listed for it.
EMBEDDING_MODELS = [
    dict(name='mxbai-embed-large', dimensions=1024),
    dict(name='nomic-embed-text', dimensions=768),
    dict(name='bge-m3', dimensions=1024),
]

# Print one "<name> <dimensions>" line per model.
for model in EMBEDDING_MODELS:
    print(f"{model['name']} {model['dimensions']}")

mxbai-embed-large 1024
nomic-embed-text 768
bge-m3 1024


In [11]:
from langchain_ollama import OllamaEmbeddings

# Embedding client for the mxbai-embed-large model; all other options keep their defaults.
embeddings = OllamaEmbeddings(model="mxbai-embed-large")
embeddings

OllamaEmbeddings(model='mxbai-embed-large', base_url=None, client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None)

In [12]:
from langchain_chroma import Chroma

# Chroma collection "example_collection", embedded with `embeddings`.
vector_store = Chroma(
    embedding_function=embeddings,
    collection_name="example_collection",
    # Where to save data locally, remove if not necessary
    persist_directory="data/vector_store_1",
)

In [13]:
import chromadb
# Create a ChromaDB client with explicit settings (persistent, reset allowed).
chroma_client = chromadb.PersistentClient(
    settings=chromadb.Settings(allow_reset=True, is_persistent=True),
    path="data/vector_store_client",
)

# Seed a collection directly through the client with three placeholder documents.
# NOTE(review): these are added through the raw collection, not through
# `vector_store_from_client` — presumably they are embedded by the collection's
# own default embedder rather than `embeddings`; confirm the dimensions agree
# before querying through the wrapper.
collection = chroma_client.get_or_create_collection("collection_name")
collection.add(documents=["a", "b", "c"], ids=["1", "2", "3"])

# Expose the same named collection through the LangChain Chroma wrapper.
vector_store_from_client = Chroma(
    collection_name="collection_name",
    embedding_function=embeddings,
    client=chroma_client,
)

In [14]:
from uuid import uuid4
# One freshly generated UUID string per entry in `documents`.
uuids = [str(uuid4()) for _ in documents]
uuids

['9e5096cd-268b-497d-995e-25ce61b78374',
 'bebaa33a-c6d6-480e-875e-044102d9aded',
 'f970c691-fe5b-4b7b-ac48-087e013bbf79',
 '87799f0a-8e2a-49c2-aaeb-80ec577c3a8c',
 'd0ab46b1-e775-40a3-aa94-1c3ffd764904',
 '2be2ae0d-6b7e-4fac-ad3e-f88c62233b4b',
 'dd52a686-427f-4655-b82b-ae584e1f50a3',
 '5907ac56-4823-477d-92af-49d21cbb789a',
 'bb44414b-6f8a-43d9-8ddd-9773fdbbb557',
 '7ee7a3f5-7457-41dd-9f63-4b12cf98bf88',
 'db1c1663-1843-4893-95ff-6c733910d43d',
 '7fff1242-9b5e-4a54-9dde-41d47b3cc5d6',
 '03bf3051-56fb-44de-b391-7aa75cf2d5c6',
 '0dc81223-cca2-44dc-9ca6-01d6ec0780e3',
 '282714f6-d554-4b2f-b283-5341aaa6fdf3',
 'fcac49af-b88c-4738-916d-fa1e02ff2c53',
 '28da72f3-a344-4f38-99c6-c0d94ac0d6ae',
 '4e841751-7134-445b-9105-e8e5e44e6392',
 '10c7299f-6bea-4d4c-8a72-000224ca07ff',
 '746f4a55-df53-4bdd-b8a6-c57b5ca03a0f',
 '1b43a003-f39e-4068-8afd-67ed05b0c817',
 'fc2aec64-ef79-4eff-aed3-743db7c27d3d',
 'c66399f6-f9a6-4389-ab2e-60287815f1ad',
 'df9b3890-330d-4c19-a0b4-97b55655265c',
 '6743877c-1285-