In [38]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain import HuggingFaceHub
import os
from dotenv import load_dotenv
load_dotenv()



True

In [36]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV')

In [3]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [6]:
extracted_data = load_pdf(r'D:\NLP_Project\ChatWithData_HuggingfaceAPI\Data')

In [7]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [8]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 244


In [9]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [10]:
embeddings =download_hugging_face_embeddings()

In [11]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [12]:
#Initializing the Pinecone
pinecone.init(api_key=PINECONE_API_KEY,
              environment=PINECONE_API_ENV)

index_name="sqlchatbot"

#Creating Embeddings for Each of The Text Chunks & storing
docsearch=Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)

In [15]:
#If we already have an index we can load it like this
docsearch=Pinecone.from_existing_index(index_name, embeddings)

query = "What  is  SQL?"

docs=docsearch.similarity_search(query, k=3)

print("Result", docs[0].page_content)

Result major DBMS supports SQL, so learning this one language will enable programmer to 
interact with any database like ORACLE, SQL ,MYSQL etc.  
 
2. SQL is easy to learn . The statements are all made up of descriptive English words, and 
there aren't that many of them.  
 
3. SQL is actually a very powerful language and by using its language elements you can 
perform very complex and sophisticated database o perations .


In [16]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [17]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [35]:
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.environ.get('HUGGINGFACEHUB_API_TOKEN')

In [24]:
llm=HuggingFaceHub(repo_id="google/flan-t5-large",model_kwargs={"temperature":0.7,"max_length":200})
llm_out=llm("Which is most expensive city in the world?")
print(llm_out)



london


In [30]:
llm1=HuggingFaceHub(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",model_kwargs={"temperature":0.6,"max_length":200})
llm1_out=llm1("Which is most expensive city in the world?")
print(llm1_out)





The most expensive city in the world is Singapore.

World's most expensive city for expats is Singapore, which has topped the cost of living index for the fourth consecutive year due to high prices for cars and homes, according to an annual survey by consultancy ECA International.

The survey, which compared the cost of living for expatriates in 470 cities around the world, found that Singapore's overall cost of living is 15 percent


In [31]:
docsearch=Pinecone.from_existing_index(index_name, embeddings)

In [32]:
qa=RetrievalQA.from_chain_type(
    llm=llm1, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

In [34]:
query =  "What is Join? and giv some examples "
result = qa({"query": query})
print(result['result'])


A Join in SQL is used to combine data from two or more tables based on a related column between them.

For example, consider two tables - Student and StudentCourse.

Student Table
EnrollNo  StudentName  Address
1000  geek1  geeksquiz1
1001  geek2  geeksquiz2
1002  geek3  geeksquiz3

StudentC
