In [1]:
!pip install -U chromadb
!pip install -U jsonlines
!pip install -U langchain
!pip install -U langchain-chroma
!pip install -U langchain-community
!pip install -U langchain-huggingface
!pip install -U sentence-transformers

Collecting chromadb
  Downloading chromadb-1.0.7-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.0.1-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downlo

In [2]:
import re
import json
import shutil
import jsonlines
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List 
from langchain_chroma import Chroma
from pydantic import BaseModel, Field
from transformers import AutoTokenizer
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import DirectoryLoader, TextLoader

# Loading the necessary files

In [3]:
# Read the car-variant table, then normalise its text columns:
#   * car_name -> lower-case, with '-' and '_' turned into spaces
#   * brand    -> '-' and '_' turned into spaces (case is left untouched)
cars = pd.read_csv("/kaggle/input/carchatbot/cln_car_var.csv").assign(
    car_name=lambda frame: frame["car_name"].str.lower().str.replace(r"[-_]", " ", regex=True),
    brand=lambda frame: frame["brand"].str.replace(r"[-_]", " ", regex=True),
)

In [4]:
def metadata_tagger(docs, tag):
    """Replace each document's metadata with ``type``, ``brand`` and ``car_model``.

    The brand and model are parsed from the lower-cased stem of the file named
    in ``doc.metadata["source"]``. Stems that start with one of the known
    multi-word brands use that brand's mapped value; any other stem is split on
    whitespace, with the first word taken as the brand and the rest as the model.

    Args:
        docs: Iterable of objects exposing a mutable ``metadata`` dict that
            contains a ``"source"`` path.
        tag: Value stored under ``metadata["type"]``.

    Returns:
        None. The documents are modified in place; all previous metadata
        (including ``"source"``) is discarded.

    Raises:
        KeyError: If a document's metadata has no ``"source"`` entry.
    """
    # Brand prefix as it appears in file names -> brand value stored in metadata.
    multi_word_brands = {
        "aston martin": "aston martin",
        "land rover": "land rover",
        "mercedes benz": "mercedes",
        "rolls royce": "rolls royce"
    }

    for doc in docs:
        filename = Path(doc.metadata["source"]).stem.lower()

        # Default values
        brand = None
        car_model = None

        # Handle multi-word brands
        for mw_brand, search_friendly in multi_word_brands.items():
            if filename.startswith(mw_brand):
                brand = search_friendly
                # Drop only the leading brand. str.replace() would also delete
                # any later occurrence of the same words inside the model name.
                car_model = filename[len(mw_brand):].strip()
                break

        # Fallback to regular brand split
        if brand is None:
            parts = filename.split()
            # An empty or whitespace-only stem yields no parts; use empty
            # strings rather than raising IndexError on parts[0].
            brand = parts[0] if parts else ""
            car_model = " ".join(parts[1:])

        doc.metadata.clear()  # wipe all existing metadata including 'source'
        doc.metadata["type"] = tag
        doc.metadata["brand"] = brand
        doc.metadata["car_model"] = car_model

# The review and specification folders are read with identical loader settings;
# only the directory and the metadata tag differ.
txt_loader_options = dict(glob="**/*.txt", loader_cls=TextLoader, show_progress=True)

# Reviews: one text file per car.
review_loader = DirectoryLoader(path="/kaggle/input/carchatbot/review", **txt_loader_options)
review_docs = review_loader.load()
metadata_tagger(review_docs, "review")

# Specifications: one text file per car.
spec_loader = DirectoryLoader(path="/kaggle/input/carchatbot/spec", **txt_loader_options)
specification_docs = spec_loader.load()
metadata_tagger(specification_docs, "specification")

# General facts live in a single file; they carry no brand/model metadata.
general_loader = TextLoader("/kaggle/input/carchatbot/general.txt")
general_docs = general_loader.load()

for general_doc in general_docs:
    general_doc.metadata.pop("source", None)  # drop the file path if present
    general_doc.metadata["type"] = "general"

100%|██████████| 137/137 [00:00<00:00, 178.52it/s]
100%|██████████| 253/253 [00:01<00:00, 209.65it/s]


# Text Splitting

In [5]:
# Running id shared by every splitter in this notebook so chunk ids are unique
# across all document types.
chunk_counter = 0


def simple_line_splitter(docs: List[Document]) -> List[Document]:
    """Turn every non-blank line of every document into its own chunk.

    Each chunk holds the stripped line as ``page_content`` and a copy of the
    parent document's metadata extended with a ``chunk_id`` taken from the
    module-level ``chunk_counter``, which is incremented once per chunk.

    Args:
        docs: Documents whose ``page_content`` is split on ``'\\n'``.

    Returns:
        The list of per-line chunks, in document order then line order.
    """
    global chunk_counter
    line_docs = []

    for parent in docs:
        for raw_line in parent.page_content.strip().split('\n'):
            text = raw_line.strip()
            if not text:
                continue  # skip blank / whitespace-only lines

            line_docs.append(
                Document(page_content=text, metadata={**parent.metadata, "chunk_id": chunk_counter})
            )
            chunk_counter += 1

    return line_docs

# Specification and general documents are chunked line by line, specifications
# first so they receive the lower chunk ids.
spec_chunks, general_chunks = (
    simple_line_splitter(source_docs)
    for source_docs in (specification_docs, general_docs)
)

In [6]:
# Tokenizer used only to measure chunk sizes in tokens.
tokenizer_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def split_into_sentences(text: str) -> List[str]:
    """Split text after each '.', '!' or '?' that is followed by whitespace.

    The input is stripped first; the terminating punctuation stays attached to
    its sentence. An empty input yields ``['']``.
    """
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')
    trimmed = text.strip()
    return sentence_boundary.split(trimmed)

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level ``tokenizer`` produces for ``text``.

    Special tokens are not added, so only the text's own tokens are counted.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)

def chunk_document(doc: Document, max_tokens: int = 512) -> List[Document]:
    """Split a document into paragraph-based chunks within a token budget.

    Paragraphs are separated by blank lines. A paragraph whose token count
    (per ``count_tokens``) is at most ``max_tokens`` becomes a single chunk.
    A larger paragraph is split on sentence boundaries:

    * If it has several lines, the first line is treated as a heading. The
      first chunk starts with ``"<heading>\\n"`` and every later chunk with
      ``"More on: <heading>\\n"``; the heading's tokens count against the
      budget.
    * If it is a single line, there is no heading and the line itself is
      split into sentences without any prefix.

    Every chunk copies ``doc.metadata`` and adds a ``chunk_id`` taken from the
    module-level ``chunk_counter``, which is incremented once per chunk.

    Limitation: sentences are never split, so a single sentence longer than
    the budget still produces one over-sized chunk.

    Args:
        doc: Document whose ``page_content`` is chunked.
        max_tokens: Token budget per chunk.
            NOTE(review): the all-MiniLM-L6-v2 model card says inputs longer
            than 256 word pieces are truncated when embedding; the default of
            512 is kept for backward compatibility - confirm it is intended.

    Returns:
        The chunks in reading order.
    """
    global chunk_counter
    new_docs = []

    for para in doc.page_content.strip().split('\n\n'):
        para = para.strip()
        if not para:
            continue

        # Small paragraphs are kept whole.
        if count_tokens(para) <= max_tokens:
            new_docs.append(Document(page_content=para, metadata={**doc.metadata, "chunk_id": chunk_counter}))
            chunk_counter += 1
            continue

        heading, _, body = para.partition('\n')
        heading, body = heading.strip(), body.strip()
        if not body:
            # Single-line paragraph: there is no real heading. Split the line
            # itself instead of emitting it unsplit as one oversized "heading".
            heading, body = "", heading

        first_prefix = f"{heading}\n" if heading else ""
        later_prefix = f"More on: {heading}\n" if heading else ""
        later_prefix_tokens = count_tokens(f"More on: {heading}") if heading else 0

        # Greedily pack sentences into groups that fit the budget.
        groups = []
        chunk = []
        current_tokens = count_tokens(heading) if heading else 0

        for sentence in split_into_sentences(body):
            sentence_tokens = count_tokens(sentence)

            # Flush only a non-empty group; otherwise a first sentence that is
            # over budget on its own would produce a heading-only chunk.
            if chunk and current_tokens + sentence_tokens > max_tokens:
                groups.append(chunk)
                chunk = []
                current_tokens = later_prefix_tokens

            chunk.append(sentence)
            current_tokens += sentence_tokens

        if chunk:
            groups.append(chunk)

        for position, group in enumerate(groups):
            prefix = first_prefix if position == 0 else later_prefix
            content = prefix + ' '.join(group)
            new_docs.append(Document(page_content=content, metadata={**doc.metadata, "chunk_id": chunk_counter}))
            chunk_counter += 1

    return new_docs


# Chunk every review document and flatten the per-document results into one
# list, keeping document order.
review_chunks = [
    piece
    for review_doc in review_docs
    for piece in chunk_document(review_doc)
]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2225 > 512). Running this sequence through the model will result in indexing errors


# Creating the Vector Store

In [7]:
# Full corpus in a fixed order: reviews, then specifications, then general facts.
docs = [*review_chunks, *spec_chunks, *general_chunks]

# Embed every chunk and persist the Chroma collection to ./chroma_db.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(
    documents=docs, embedding=embedding_model, persist_directory="chroma_db"
)
embedding_retriever = vectorstore.as_retriever()

# Zip the persisted directory so the vector db can be downloaded from the
# notebook output; the archive path is shown as the cell result.
shutil.make_archive(base_name='chroma_db', format='zip', root_dir='chroma_db')

2025-05-01 14:20:46.320256: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746109246.520290      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746109246.576542      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

'/kaggle/working/chroma_db.zip'

# Saving the chunks and names of the car

In [8]:
# Distinct car names, in order of first appearance in the table.
car_names = cars["car_name"].unique().tolist()
docs = review_chunks + spec_chunks + general_chunks

with open("car_names.json", "w") as names_file:
    json.dump(car_names, names_file)

# One plain-dict record per chunk, ready to be rebuilt into documents for BM25.
bm25_data_to_save = [
    {"page_content": chunk.page_content, "metadata": chunk.metadata}
    for chunk in docs
]

# JSON Lines: one serialized record per line.
with open("bm25_search_docs.jsonl", "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in bm25_data_to_save)


# Test

In [9]:
# Spot-check the first saved car name and the first saved chunk record.
print(car_names[0], bm25_data_to_save[0], sep="\n")

dzire
{'page_content': 'The Maruti WagonR is on sale in India at a price of between Rs. 4.20 - 5.70 lakhs (ex-Delhi).', 'metadata': {'type': 'review', 'brand': 'maruti', 'car_model': 'wagon r', 'chunk_id': 40341}}


In [10]:
# Smoke test: fetch the top matches for a sample question and print each one.
sample_query = "How many cars does mahindra has in india ?"
docs = embedding_retriever.invoke(sample_query, k=5)

for hit in docs:
    print(hit)

page_content='Mahindra has 13 cars present in India: Scorpio N, Thar Roxx, Bolero, Xuv700, Scorpio, Thar, Xuv 3Xo, Be 6, Xev 9E, Bolero Neo, Bolero Camper, Xuv400 Ev & Bolero Neo Plus.' metadata={'chunk_id': 40311, 'type': 'general'}
page_content='Mahindra Be 6 variant Pack One has a driving range of 535 km.' metadata={'brand': 'mahindra', 'car_model': 'be 6', 'chunk_id': 6226, 'type': 'specification'}
page_content='All Mahindra Be 6 variants have a Rear-Wheel Drive (RWD) drive type.' metadata={'brand': 'mahindra', 'car_model': 'be 6', 'chunk_id': 6194, 'type': 'specification'}
page_content='Mahindra Be 6 variant Pack Three 79kWh has a driving range of 682 km.' metadata={'brand': 'mahindra', 'car_model': 'be 6', 'chunk_id': 6227, 'type': 'specification'}
page_content='Maserati has 6 cars present in India: Levante, Ghibli, Gran Turismo, Quattroporte, Grecale & Gran Cabrio.' metadata={'chunk_id': 40333, 'type': 'general'}


# ********** COMPLETED **********