In [None]:
# Install the notebook's dependencies into the environment of the running kernel.
# `%pip` is used explicitly so the cell does not depend on IPython's automagic.
# docx2txt is included because Docx2txtLoader (used when loading .docx files)
# imports it at load time.
%pip install -q sentence-transformers transformers faiss-cpu langchain chromadb langchain-community pypdf docx2txt


Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting langchain
  Downloading langchain-0.2.16-py3-none-any.whl.metadata (7.1 kB)
Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.2.16-py3-none-any.whl.metadata (2.7 kB)
Collecting pypdf
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Collecting langchain-core<0.3.0,>=0.2.38 (from langchain)
  Downloading langchain_core-0.2.39-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.4-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.118-py3-none-any.whl.metadata (13 kB)
Collec

In [None]:
# Provides `notebook_login`, used by the Hugging Face login cell of this notebook.
# `%pip` installs into the running kernel's environment.
%pip install -q huggingface_hub



In [None]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader

# Make sure the input folder exists so the later os.listdir("docs") call does
# not fail on a fresh checkout; exist_ok=True makes re-running this cell safe.
os.makedirs('docs', exist_ok=True)

In [None]:
import faiss

In [None]:
# Build the vector store: load every supported file in ./docs, split the text
# into overlapping chunks, embed the chunks, and persist both the FAISS index
# ('faiss_index.index') and the chunk texts ('document_chunks.npy').

documents = []
# sorted() keeps the chunk order (and therefore the FAISS ids) reproducible
# across runs and platforms; os.listdir() order is arbitrary.
for file in sorted(os.listdir("docs")):
    file_path = os.path.join("docs", file)
    lowered_name = file.lower()  # accept ".PDF", ".Txt", ... as well
    if lowered_name.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif lowered_name.endswith(".docx") or lowered_name.endswith(".doc"):
        # NOTE(review): Docx2txtLoader needs the `docx2txt` package and is
        # presumably limited to .docx archives; legacy .doc may fail - confirm.
        loader = Docx2txtLoader(file_path)
    elif lowered_name.endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        continue  # unsupported file type: ignore it
    documents.extend(loader.load())

document_splitter = CharacterTextSplitter(separator='\n', chunk_size=500, chunk_overlap=100)
document_chunks = document_splitter.split_documents(documents)
document_texts = [doc.page_content for doc in document_chunks]

# Without this guard an empty 'docs' folder produces a 1-D embeddings array and
# the index construction below dies with an opaque IndexError on shape[1].
if not document_texts:
    raise ValueError(
        "No text could be extracted from 'docs'. "
        "Add at least one .pdf, .docx or .txt file and re-run this cell."
    )

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# A single batched encode() call replaces one model call per chunk; the result
# is a 2-D (n_chunks, dim) array. FAISS stores float32, so make that explicit.
document_embeddings = np.asarray(embedding_model.encode(document_texts), dtype="float32")

# Exact (brute-force) L2 index sized to the embedding dimension.
index = faiss.IndexFlatL2(document_embeddings.shape[1])
index.add(document_embeddings)
faiss.write_index(index, 'faiss_index.index')

# Row i of the index corresponds to document_texts[i]; the query cells rely on
# this alignment when mapping FAISS ids back to text.
with open('document_chunks.npy', 'wb') as f:
    np.save(f, document_texts)

print("Vector database created and stored successfully.")


Vector database created and stored successfully.


In [None]:
# Optional alternative build of the same vector store, using LangChain's
# HuggingFaceEmbeddings wrapper instead of calling SentenceTransformer directly.
# It writes 'faiss_index.index' and 'document_chunks.npy', i.e. it overwrites
# the files produced by the previous build cell, and additionally saves the
# raw embeddings to 'document_embeddings.npy'.
document = []
# sorted() keeps chunk order (and FAISS ids) reproducible across runs.
for file in sorted(os.listdir("docs")):
    file_path = os.path.join("docs", file)
    lowered_name = file.lower()  # accept upper-case extensions too
    if lowered_name.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif lowered_name.endswith(".docx") or lowered_name.endswith(".doc"):
        # NOTE(review): Docx2txtLoader needs the `docx2txt` package and is
        # presumably limited to .docx archives; legacy .doc may fail - confirm.
        loader = Docx2txtLoader(file_path)
    elif lowered_name.endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        continue  # unsupported file type: ignore it
    document.extend(loader.load())

document_splitter = CharacterTextSplitter(separator='\n', chunk_size=500, chunk_overlap=100)
document_chunks = document_splitter.split_documents(document)
document_texts = [doc.page_content for doc in document_chunks]

# An empty corpus would yield a 1-D array and an IndexError on shape[1] below.
if not document_texts:
    raise ValueError(
        "No text could be extracted from 'docs'. "
        "Add at least one .pdf, .docx or .txt file and re-run this cell."
    )

embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
document_embeddings = embeddings.embed_documents(document_texts)
# embed_documents returns nested Python lists; np.array() on those defaults to
# float64, whereas FAISS stores float32 - request that dtype explicitly.
document_embeddings_array = np.asarray(document_embeddings, dtype="float32")

# Save document chunks and embeddings for later use
np.save('document_chunks.npy', document_texts)
np.save('document_embeddings.npy', document_embeddings_array)

# Exact (brute-force) L2 index; row i corresponds to document_texts[i].
index = faiss.IndexFlatL2(document_embeddings_array.shape[1])
index.add(document_embeddings_array)
faiss.write_index(index, 'faiss_index.index')

  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [None]:
# Interactive Hugging Face Hub login; in a notebook this renders a widget.
# NOTE(review): the only models loaded in this notebook are
# 'sentence-transformers/all-MiniLM-L6-v2' and "google/flan-t5-base"; if both
# are public, this step is presumably optional - confirm before removing.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load the persisted vector store and build the retrieval + generation models.
index = faiss.read_index('faiss_index.index')

# The chunk file is written from a list of str, so it is a plain string array
# and does not need pickle support. Leaving allow_pickle at its default (False)
# prevents a tampered .npy file from executing code on load.
document_texts = np.load('document_chunks.npy')

# FAISS ids are positions into document_texts; if the two files come from
# different builds, lookups would return the wrong text or raise IndexError.
if len(document_texts) != index.ntotal:
    raise ValueError(
        f"'document_chunks.npy' has {len(document_texts)} entries but "
        f"'faiss_index.index' holds {index.ntotal} vectors; rebuild the vector store."
    )

# Must be the same embedding model that was used to build the index.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def query_knowledge_base(query, top_n=3):
    """Answer ``query`` from the indexed document chunks.

    The query is embedded with the module-level ``embedding_model``, the
    ``top_n`` nearest chunks are fetched from the module-level FAISS ``index``,
    printed with their distances, and passed as context to the module-level
    text2text ``pipe``.

    Args:
        query: The user's question as a string.
        top_n: Number of nearest chunks to request from the index.

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    query_embedding = embedding_model.encode(query).reshape(1, -1)
    distances, indices = index.search(query_embedding, top_n)

    # FAISS pads the result with id -1 when the index holds fewer than top_n
    # vectors. Those slots must be skipped: document_texts[-1] would otherwise
    # silently inject the *last* chunk into the prompt.
    hits = [
        (document_texts[idx], distances[0][i])
        for i, idx in enumerate(indices[0])
        if idx >= 0
    ]
    # Smaller L2 distance means a closer match, so sort ascending.
    results = sorted(hits, key=lambda x: x[1])

    print("Top relevant document chunks:")
    for i, (doc, score) in enumerate(results):
        print(f"Rank {i+1}:\nDocument: {doc}\nScore: {score}\n")

    top_documents = " ".join([doc for doc, _ in results])
    prompt = f"Based on the following information:\n{top_documents}\n\nPlease provide a detailed and conversational answer to the question: {query}"
    result = pipe(prompt, max_length=200)
    return result[0]['generated_text']

# One-shot driver: read a single question from the user, run it through
# query_knowledge_base with the three nearest chunks as context, and print the
# generated answer.
if __name__ == "__main__":
    query = input("Enter your query: ")
    answer = query_knowledge_base(query, top_n=3)
    print("Answer:", answer)




tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Enter your query: what is nlp
Top relevant document chunks:
Rank 1:
Document: Agenda
●What is AI?  
●Neural Networks
●Natural L anguage P rocessing (NLP)
●Computer Vision
Proprietary content. ©Great Learning. All Rights Reserved. Unauthorized use or distribution prohibited
Score: 0.9406765699386597

Rank 2:
Document: Natural Language Processing
Natural Language Processing is:
•The application of computational 
linguistics to build real-world 
applications which work with languages comprising of varying structures 
•Used to understand and interpret human language to the machine.
Proprietary content. ©Great Learning. All Rights Reserved. Unauthorized use or distribution prohibited
Score: 1.0421764850616455

Rank 3:
Document: multiple languages.Natural Language Processing
Proprietary content. ©Great Learning. All Rights Reserved. Unauthorized use or distribution prohibited
Score: 1.1019583940505981

Answer: nlp is a software package that allows users to create and modify a network of nlp-

In [None]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load the persisted vector store and build the retrieval + generation models.
index = faiss.read_index('faiss_index.index')

# The chunk file is written from a list of str, so it is a plain string array
# and does not need pickle support. Leaving allow_pickle at its default (False)
# prevents a tampered .npy file from executing code on load.
document_texts = np.load('document_chunks.npy')

# FAISS ids are positions into document_texts; if the two files come from
# different builds, lookups would return the wrong text or raise IndexError.
if len(document_texts) != index.ntotal:
    raise ValueError(
        f"'document_chunks.npy' has {len(document_texts)} entries but "
        f"'faiss_index.index' holds {index.ntotal} vectors; rebuild the vector store."
    )

# Must be the same embedding model that was used to build the index.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def query_knowledge_base(query, top_n=3):
    """Answer ``query`` from the indexed document chunks (quiet variant).

    The query is embedded with the module-level ``embedding_model``, the
    ``top_n`` nearest chunks are fetched from the module-level FAISS ``index``
    and passed as context to the module-level text2text ``pipe``. Unlike the
    earlier definition in this notebook, the retrieved chunks are not printed.

    Args:
        query: The user's question as a string.
        top_n: Number of nearest chunks to request from the index.

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    query_embedding = embedding_model.encode(query).reshape(1, -1)
    distances, indices = index.search(query_embedding, top_n)

    # FAISS pads the result with id -1 when the index holds fewer than top_n
    # vectors. Those slots must be skipped: document_texts[-1] would otherwise
    # silently inject the *last* chunk into the prompt.
    results = [
        (document_texts[idx], distances[0][i])
        for i, idx in enumerate(indices[0])
        if idx >= 0
    ]
    top_documents = " ".join([doc for doc, _ in results])

    # Pass the same explicit output limit as the earlier definition of this
    # function; without one, the answers recorded below this notebook's chat
    # cell were cut off mid-word.
    result = pipe(
        f"Based on the following documents: {top_documents}\nAnswer the question: {query}",
        max_length=200,
    )

    return result[0]['generated_text']


In [None]:
# Interactive chat loop: keep answering questions until the user types '1'.
# The __main__ guard wraps the whole loop; with the guard *inside* the loop, a
# false guard would leave the loop spinning forever with nothing to stop it.
if __name__ == "__main__":
    while True:
        query = input("Enter your query:(type '1' to exit the chat)")
        # Tolerate stray whitespace around the exit command.
        if query.strip() == "1":
            break
        answer = query_knowledge_base(query, top_n=3)
        print("Answer:", answer)

Enter your query:(type '1' to exit the chat)what are uses of nlp
Answer: •The application of computational linguistics to build real-world applications which work with languages compris
Enter your query:(type '1' to exit the chat)what is computer vision
Answer: an overarching term
Enter your query:(type '1' to exit the chat)uses of computer vision
Answer: Digital photography, videos, displays etc.
Enter your query:(type '1' to exit the chat)what are neural networks
Answer: Artificial neuron is inspired by biological neuron
Enter your query:(type '1' to exit the chat)1
