Install the required dependencies

In [1]:
!pip install --upgrade langchain openai  -q
!pip install sentence_transformers -q
!pip install unstructured -q
!pip install unstructured[local-inference] -q
!pip install detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2 -q
!apt-get install poppler-utils
!pip install pinecone-client -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## Directory Loader of Langchain

A directory loader loads the files in the given directory as documents. Here we also count how many documents were loaded.

In [6]:
from langchain.document_loaders import DirectoryLoader

# Folder holding the source files that get loaded, split and indexed below.
directory = '/content/data'

def load_docs(directory):
  """Load the contents of a folder with LangChain's `DirectoryLoader`.

  Args:
    directory: Path of the folder passed to `DirectoryLoader`.

  Returns:
    The value returned by the loader's `load()` call.
  """
  return DirectoryLoader(directory).load()

# Load everything under `directory`; the cell then displays how many documents were read.
documents = load_docs(directory)
len(documents)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


1

## Text Splitter

This employs the `langchain.text_splitter` module to split a list of documents into smaller chunks. The function `split_docs` takes three arguments: `documents` (the list of documents to be split), `chunk_size` (determining the maximum size of each chunk, defaulting to 500 characters), and `chunk_overlap` (specifying the overlap between adjacent chunks, defaulting to 20 characters).

An instance of `RecursiveCharacterTextSplitter` is created with the specified `chunk_size` and `chunk_overlap` values, and the `split_documents` method of this instance is used to split the input documents into smaller chunks. The resulting chunks are returned, and the code prints the number of generated chunks using `print(len(docs))`.

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=500, chunk_overlap=20):
  """Split documents into smaller chunks with `RecursiveCharacterTextSplitter`.

  Args:
    documents: The documents to split.
    chunk_size: `chunk_size` setting forwarded to the splitter (default 500).
    chunk_overlap: `chunk_overlap` setting forwarded to the splitter (default 20).

  Returns:
    The value returned by the splitter's `split_documents(documents)` call.
  """
  splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
  return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

# Split the loaded documents using the default chunk settings and print the chunk count.
docs = split_docs(documents)
print(len(docs))

8


In [9]:
# Print the text of one chunk (index 6) to eyeball what the splitter produced.
print(docs[6].page_content)

**Challenges and Resilience:** Nepal's journey towards progress and development has not been without challenges. Political transitions, earthquakes, and economic struggles have shaped the nation's recent history. However, the resilience of the Nepalese people is evident in their ability to rebuild and preserve their cultural heritage in the face of adversity.


## Generating embeddings

It utilizes the `SentenceTransformerEmbeddings` module to create sentence embeddings using the "all-MiniLM-L6-v2" pre-trained model from the SentenceTransformer library.

In [10]:
# Alternative: OpenAI embeddings (left commented out; not used in this notebook).
# import openai
# from langchain.embeddings.openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings(model_name="ada")

# Embeddings come from the "all-MiniLM-L6-v2" sentence-transformers model,
# whose files are downloaded the first time this cell runs.
from langchain.embeddings import SentenceTransformerEmbeddings
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [12]:
# Sanity check: embed a sample query and display the length of the resulting vector.
query_result = embeddings.embed_query("Large Language Model")
len(query_result)

384

## Pinecone Indexing

This section uses Pinecone to store and search the vector embeddings. It initializes Pinecone with an API key and an environment, then defines an `index_name` for the vector index. Finally, it builds the "langchain-chatbot" vector store in Pinecone from the provided documents (`docs`) and their embeddings (`embeddings`).


https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/pinecone.html

In [14]:
import os
from getpass import getpass

import pinecone
from langchain.vectorstores import Pinecone

# Credentials are read from the environment, with an interactive prompt as a
# fallback, so that no secret is ever saved inside the notebook.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT") or input("Pinecone environment: ")

# initialize pinecone
pinecone.init(
    api_key=pinecone_api_key,
    environment=pinecone_environment,
)

index_name = "langchain-chatbot"

# Embed the chunks in `docs` with `embeddings` and store them under `index_name`.
# NOTE(review): this appears to require that the index already exists in the
# Pinecone project with a dimension matching the embedding model — confirm.
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [15]:
def get_similiar_docs(query, k=1, score=False):
  """Look up the stored chunks most similar to `query` in the global `index`.

  Args:
    query: The text to search for.
    k: Number of results requested from the vector store (default 1).
    score: When truthy, call `similarity_search_with_score` instead of
      `similarity_search`.

  Returns:
    The value returned by the selected search method.
  """
  search = index.similarity_search_with_score if score else index.similarity_search
  return search(query, k=k)

# Example: retrieve the single closest chunk (k defaults to 1) for a sample question
# and display it as the cell output.
query = "How is Nepal's progress"
similar_docs = get_similiar_docs(query)
similar_docs


[Document(page_content="**Challenges and Resilience:** Nepal's journey towards progress and development has not been without challenges. Political transitions, earthquakes, and economic struggles have shaped the nation's recent history. However, the resilience of the Nepalese people is evident in their ability to rebuild and preserve their cultural heritage in the face of adversity.", metadata={'source': '/content/data/nepal.txt'})]