In [1]:
import os
from langchain.document_loaders import CSVLoader, DirectoryLoader, JSONLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter


# Root folder that the DirectoryLoader cells below walk recursively.
directory = "../data/google_drive_sahaj/All/Projects/ClearChannel"

# Set before any tokenizer-backed model is created.
# NOTE(review): presumably this silences the HuggingFace tokenizers parallelism warning - confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [2]:
# Load every file under `directory` except JSON / CSV / Markdown, one Document per file
# (mode="single").
#
# The previous glob "**/*[!(.json|.csv|.md)]" did not exclude extensions: `[!...]` is a
# character class, so it rejected any name whose LAST CHARACTER is one of
# `( . j s o n | c v m d )`. That silently dropped files such as *.doc, *.xls or *.tsv.
# Match everything and exclude by extension instead.
# NOTE(review): `exclude` is a keyword-only DirectoryLoader argument; upgrade langchain if the
# installed release predates it.
docs = DirectoryLoader(
    path=directory,
    glob="**/*",
    exclude=["*.json", "*.csv", "*.md"],
    recursive=True,
    show_progress=True,
    silent_errors=True,  # files that fail to load are skipped rather than aborting the run
    loader_kwargs={"autodetect_encoding": True, "mode": "single"},
).load()

# Spot-check one loaded document.
docs[3]

  0%|                                                                                                                                                                                                                                                  | 0/14 [00:00<?, ?it/s]

  7%|████████████████▋                                                                                                                                                                                                                         | 1/14 [00:06<01:23,  6.43s/it]

 14%|█████████████████████████████████▍                                                                                                                                                                                                        | 2/14 [00:06<00:34,  2.90s/it]

 21%|██████████████████████████████████████████████████▏                                                                                                                                                                                       | 3/14 [00:07<00:18,  1.65s/it]

 36%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                      | 5/14 [00:08<00:09,  1.02s/it]

 43%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                     | 6/14 [00:08<00:06,  1.24it/s]

 50%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                     | 7/14 [00:08<00:05,  1.39it/s]

 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 8/14 [00:09<00:03,  1.72it/s]

 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                   | 9/14 [00:09<00:02,  2.16it/s]

 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                  | 10/14 [00:09<00:01,  2.77it/s]

 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                  | 10/14 [00:09<00:03,  1.06it/s]




Document(page_content='About Out of Home Industry (OOH):\n\nBasics of OOH advertising\n\nOOH advertising guide\n\nDSPs and SSPs in OOH\n\nProgrammatic OOH\n\nAn informative video on Programmatic and Direct OOH\n\nAbout Clearchannel:\n\nClearchannel website\n\nClearchannel post COVID\n\nLaunchPAD noise in the market\n\nOutcomes from our discovery workshop in July 2022:\n\nDiscovery July 2022\n\nProject related links:\n\nConfluence Home\n\nLaunchPAD confluence landing page - https://cci-clearchannel.atlassian.net/wiki/spaces/LAUNCHPAD/pages/3338108953/Welcome+to+the+LaunchPAD+workspace\n\nOld documentation on confluence:\n\nMacroEpics\n\nMarket Goals Mapping\n\nHeader Bidder ', metadata={'source': '../data/google_drive_sahaj/All/Projects/ClearChannel/Discovery July 2022/Useful Links.docx'})

In [3]:
def avg_doc_length(documents):
    """Return the mean `page_content` length (floor division) of `documents`.

    Returns 0 for an empty list instead of raising ZeroDivisionError, which can happen
    when every file fails to load and the loader runs with silent_errors=True.
    """
    if not documents:
        return 0
    return sum(len(doc.page_content) for doc in documents) // len(documents)


def max_doc_length(documents):
    """Return the longest `page_content` length in `documents`, or 0 for an empty list."""
    return max((len(doc.page_content) for doc in documents), default=0)


# Split on paragraph, line, word and finally character boundaries into chunks of at most
# 500 characters with a 50-character overlap.
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", " ", ""], chunk_size=500, chunk_overlap=50)
all_splits = text_splitter.split_documents(docs)
print(f"Succesfully split {len(docs)} documents into {len(all_splits)} chunks...")

avg_char_count_pre = avg_doc_length(docs)
avg_char_count_post = avg_doc_length(all_splits)
print(f'Average length among {len(docs)} documents loaded is {avg_char_count_pre} characters.')
# The old wording ("N documents more than the original M") reported the total as if it were
# the increase; report both the total and the actual difference.
print(f'After the split we have {len(all_splits)} chunks, {len(all_splits) - len(docs)} more than the original {len(docs)} documents.')
print(f'Average length among {len(all_splits)} documents (after split) is {avg_char_count_post} characters.')
print(f'Max length of split is {max_doc_length(all_splits)}.')

# Spot-check one chunk.
all_splits[2]

Succesfully split 10 documents into 254 chunks...
Average length among 10 documents loaded is 9723 characters.
After the split we have 254 documents more than the original 10.
Average length among 254 documents (after split) is 383 characters.
Max length of split is 499.


Document(page_content='1.What is LaunchPAD Programmatic ?\n\nLaunchPAD Programmatic is about enabling a new digital channel for sales of Digital OOH inventory, where customers (Media Agency, Specialists) can purchase inventory/ ad space with CCI through real time-buying with Brands, DSPs & SSPs with little or no negotiations.', metadata={'source': '../data/google_drive_sahaj/All/Projects/ClearChannel/Programmatic Vision Jan 2023/CCI_AlignmentWorkshop_Programmatic_15Feb23.pptx'})

In [24]:
import datetime
from typing import Optional
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores.utils import filter_complex_metadata
from langchain_core.vectorstores import VectorStoreRetriever


# Embedding model used for both indexing and querying.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Chroma collection persisted under ../chromadb/; the collection metadata records the
# local time at which this cell was executed.
docstore = Chroma(
    collection_name="personal_assistant",
    persist_directory="../chromadb/",
    embedding_function=embedding,
    collection_metadata=dict(timestamp=datetime.datetime.now().isoformat()),
)


def store_docs(documents):
    """Add `documents` to the module-level `docstore`, but only if the store is empty.

    Args:
        documents: Documents to store. They are passed through
            `filter_complex_metadata` before being added.

    Returns:
        None. Prints whether the documents were stored or skipped.
    """
    # Only emptiness matters here, so probe for a single id instead of pulling every
    # stored document (and its metadata) back out of the collection.
    if docstore.get(limit=1, include=[])['ids']:
        print("Skipping storing documents since vectore store already has documents.")
        return

    # Filter only when we are actually going to store something.
    filtered_docs = filter_complex_metadata(documents)
    if not filtered_docs:
        # NOTE(review): adding an empty batch is at best a no-op and may be rejected by the
        # vector store; skip it explicitly.
        print("No documents to store.")
        return

    print(f"Storing {len(filtered_docs)} documents into db")
    docstore.add_documents(filtered_docs, show_progress=True)


def delete_all_docs():
    """Delete every entry currently held by the module-level `docstore`.

    Entries are removed by id; the collection itself is kept. Nothing is sent to the
    store when it is already empty.

    Returns:
        None.
    """
    print("Deleting collection")
    # Only the ids are needed, so ask the store not to return documents or metadata.
    stored_ids = docstore.get(include=[])['ids']
    if stored_ids:
        docstore.delete(stored_ids)


In [5]:
import pprint

# Index the chunks (store_docs skips this when the store already holds documents).
store_docs(all_splits)

# Plain similarity search returning the two closest chunks.
retriever = docstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)
retrieved_docs = retriever.get_relevant_documents(
    "What is the Technology used(stack)?"
)

# Show only the text of each hit.
pprint.pprint([hit.page_content for hit in retrieved_docs])

Storing 254 documents into db


['Performance\n'
 '\n'
 'enhancements, work together to make execution as streamlined and seamless as '
 'possible.',
 '#  High level user '
 'story                                                                                                             '
 'Existing?    In?\n'
 '  1  New container that is designed, tested and operates reliably across all '
 'of our tech stacks                                                     Y\n'
 '  2  New container that is well document (internal and external) and is '
 'simple for a “semi tech” person to connect a demand source to               '
 'Y']


In [6]:
# Display the full Document objects (page_content plus metadata) returned by the similarity search.
retrieved_docs

[Document(page_content='Performance\n\nenhancements, work together to make execution as streamlined and seamless as possible.', metadata={'source': '../data/google_drive_sahaj/All/Projects/ClearChannel/Discovery July 2022/Resources/Clear Channel H1 Tech Review (1).pdf'}),
 Document(page_content='#  High level user story                                                                                                             Existing?    In?\n  1  New container that is designed, tested and operates reliably across all of our tech stacks                                                     Y\n  2  New container that is well document (internal and external) and is simple for a “semi tech” person to connect a demand source to               Y', metadata={'source': '../data/google_drive_sahaj/All/Projects/ClearChannel/Discovery July 2022/Resources/220725 LaunchPAD Priorities (1).pptx'})]

In [7]:
# Reload the same folder in "elements" mode, replacing the `docs` loaded earlier in
# "single" mode.
#
# The previous glob "**/*[!(.json|.csv|.md)]" did not exclude extensions: `[!...]` is a
# character class, so it rejected any name whose LAST CHARACTER is one of
# `( . j s o n | c v m d )`. That silently dropped files such as *.doc, *.xls or *.tsv.
# Match everything and exclude by extension instead.
# NOTE(review): `exclude` is a keyword-only DirectoryLoader argument; upgrade langchain if the
# installed release predates it.
docs = DirectoryLoader(
    path=directory,
    glob="**/*",
    exclude=["*.json", "*.csv", "*.md"],
    recursive=True,
    show_progress=True,
    silent_errors=True,  # files that fail to load are skipped rather than aborting the run
    loader_kwargs={"autodetect_encoding": True, "mode": "elements"},
).load()

  0%|                                                                                                                                                                                                                                                  | 0/14 [00:00<?, ?it/s]

  7%|████████████████▋                                                                                                                                                                                                                         | 1/14 [00:00<00:01,  8.04it/s]

 14%|█████████████████████████████████▍                                                                                                                                                                                                        | 2/14 [00:00<00:03,  3.21it/s]

 21%|██████████████████████████████████████████████████▏                                                                                                                                                                                       | 3/14 [00:00<00:02,  4.01it/s]

 36%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                      | 5/14 [00:01<00:03,  2.93it/s]

 43%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                     | 6/14 [00:01<00:02,  3.01it/s]

 50%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                     | 7/14 [00:02<00:02,  2.48it/s]

 57%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                    | 8/14 [00:02<00:02,  2.67it/s]

 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                   | 9/14 [00:02<00:01,  3.14it/s]

 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                  | 10/14 [00:03<00:01,  3.87it/s]

 71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                  | 10/14 [00:03<00:01,  3.27it/s]




In [25]:
# Empty the store first: store_docs() only adds documents when the store holds none.
delete_all_docs()
# Store whatever `docs` is currently bound to.
store_docs(docs)

Deleting collection
Storing 1615 documents into db


In [26]:
from langchain.chains.query_constructor.base import AttributeInfo
# Metadata attributes described to the self-query retriever. Every attribute is declared
# as a string, so the list is built from (name, description) pairs.
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in (
        (
            "source",
            "The full path of file for the document.",
        ),
        (
            "filename",
            "The name of the file.",
        ),
        (
            "file_directory",
            "The name of the directory in which this document resides.",
        ),
        (
            "parent_id",
            "element hierarchy- may be used to infer where an element resides within the overall hierarchy of a document. for instance, a narrativetext element may have a title element as a parent (a “sub-title”), which in turn may have another title element as its parent (a “title).",
        ),
        (
            "category",
            "The different components of a document- FigureCaption, NarrativeText, ListItem, Title, Address, Table, PageBreak, Header, Footer, UncategorizedText, Image and Formula",
        ),
    )
]

# One-line description of what the stored documents contain.
document_content_description = "Information about projects in Sahaj."

In [27]:
from langchain.llms import LlamaCpp
# Local Llama 2 13B chat model (Q4_K_M GGUF) served through llama.cpp.
llm = LlamaCpp(
    model_path="../models/llama-cpp/llama-2-13b-chat.Q4_K_M.gguf",
    # Context window and generation budget.
    # NOTE(review): max_tokens equals n_ctx, which leaves no headroom for the prompt - confirm intended.
    n_ctx=3000,
    max_tokens=3000,
    # Sampling settings.
    temperature=0.5,
    top_p=1,
    # Output handling.
    # NOTE(review): no callback manager is passed in this call, so confirm that
    # streaming/verbose are still wanted.
    streaming=True,
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ../models/llama-cpp/llama-2-13b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  5120,  5120,   

llama_new_context_with_model: kv self size  = 2343.75 MiB
llama_build_graph: non-view tensors processed: 924/924
llama_new_context_with_model: compute buffer total size = 7.29 MiB
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


In [28]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
# Rebind `retriever` to a self-querying retriever built on the same store.
# The recorded run of this cell failed with an ImportError asking for `pip install lark`,
# so that package must be installed in the kernel's environment before this can succeed.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=docstore,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
)

ImportError: Cannot import lark, please install it with 'pip install lark'.