In [1]:
print('OK')

OK


In [4]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import os
# from PyPDF2 import PdfReader

In [5]:
# Recursive text splitter function
def recursive_text_split(text, chunk_size=1000, overlap=100):
    splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size, chunk_overlap = overlap)
    return splitter.split_text(text)

In [7]:
# Function to process the uploaded file based on its type
def process_file(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    chunks = []

    # handle different file types
    if ext == '.txt':
        # Handle text files
        with open(file_path,'r', encoding='utf-8') as file:
            text = file.read()
        chunks = recursive_text_split(text)

    elif ext == '.pdf':
        # Handle PDF files
        reader = PdfReader(file_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        chunks = recursive_text_split(text)
    
    elif ext == '.csv':
        # Handle CSV files
        df = pd.read_csv(file_path)
        text = df.to_string()
        chunks = recursive_text_split(text)

    elif ext in ['.py','.java','.cpp','.c','.js','.html','.css','.r','.kt',',sh']:
        # Handle programming files
        with open(file_path, 'r', encoding='utf-8') as file:
            code = file.read()
        chunks = recursive_text_split(code)

    elif ext in ['.json','.xml','.yml','.yaml']:
        # Handle data files like JSON, XML, YAML
        with open(file_path, 'r',encoding='utf-8') as file:
            data = file.read()
        chunks = recursive_text_split(data)
    else:
        return f"Unsupported file format: {ext}"
    
    return chunks

In [8]:
# Load document using PyPDFLoader document loader
loader = PyPDFLoader("/home/data04/Desktop/Llama3_RAG_Pipeline/data/data.pdf")
documents = loader.load()

In [9]:
pip install pypdf

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Splitting the data into chunks
from langchain.text_splitter import CharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=50,separators="\n")
docs = text_splitter.split_documents(documents=documents)

In [11]:
docs[0]

Document(metadata={'source': '/home/data04/Desktop/Llama3_RAG_Pipeline/data/data.pdf', 'page': 0}, page_content='The Reserve Bank of India’s\nBalance Sheet:\n Analytics and Dynamics\nof Evolution\nNarendra Jadhav, Partha Ray, Dhritidyuti Bose and\nIndranil Sen Gupta*\nThe present paper attempts to contribute to the growing literature on central bank\nbalance sheets drawing on a case-study of the Indian experience. The analytical commentary\non the evolution of the Reserve Bank of India Balance Sheet in relation to the post-\nIndependence national macroeconomic experience is partitioned into three phases on the')

In [12]:
pip install sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [13]:
# download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [14]:
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


In [15]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [16]:
query_result = embeddings.embed_query("Hello world")
len(query_result)

384

In [17]:
query_result

[-0.034477315843105316,
 0.031023172661662102,
 0.006734910886734724,
 0.02610892429947853,
 -0.03936195746064186,
 -0.1603025197982788,
 0.06692396104335785,
 -0.006441440898925066,
 -0.04745054617524147,
 0.014758836477994919,
 0.07087532430887222,
 0.055527545511722565,
 0.01919332519173622,
 -0.026251299306750298,
 -0.01010951679199934,
 -0.026940451934933662,
 0.022307397797703743,
 -0.022226639091968536,
 -0.1496926248073578,
 -0.01749303936958313,
 0.007676327601075172,
 0.054352276027202606,
 0.0032544792629778385,
 0.03172592446208,
 -0.08462144434452057,
 -0.029405953362584114,
 0.05159562826156616,
 0.048124104738235474,
 -0.003314818488433957,
 -0.05827919766306877,
 0.04196928068995476,
 0.02221069671213627,
 0.12818878889083862,
 -0.02233896404504776,
 -0.011656257323920727,
 0.06292840093374252,
 -0.03287629410624504,
 -0.09122602641582489,
 -0.031175386160612106,
 0.05269954726099968,
 0.047034841030836105,
 -0.08420310169458389,
 -0.030056146904826164,
 -0.020744822919

In [18]:
pip install faiss-gpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [19]:
# Loading the data and correspond embeddings into the FAISS
vectorstore = FAISS.from_documents(docs, embeddings)

In [20]:
# Persist the vectors locally on disk
vectorstore.save_local("faiss_index_")

In [21]:
# Load from local storage
persisted_vectorstore = FAISS.load_local("faiss_index_",embeddings,allow_dangerous_deserialization=True)

In [22]:
# creating a retriever on top of database
retriever = persisted_vectorstore.as_retriever()

In [23]:
pip install langchain_ollama

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [24]:
from langchain_community.llms import Ollama

In [25]:
# Initialize an instance of the Ollama model
llm = Ollama(model="llama3.1")

In [26]:
response = llm.invoke("Tell me a joke")
print(response)

Here's one:

What do you call a fake noodle?

An impasta.


In [44]:
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [45]:
from langchain import PromptTemplate
prompt = PromptTemplate(template=prompt_template,
                        input_variables=["context","question"])
chain_type_kwargs = {"prompt": prompt, "verbose": True}

In [52]:
# Use RetrievalQA chain for orchestration
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs # or True if you want detailed output
)



In [54]:
query = "what is Phase of Financial Sector Liberalisation"
qa.run(query)

AttributeError: 'FieldInfo' object has no attribute 'handlers'

In [40]:
while True:
    query = input("Type your querry if you want to exit type Exit: \n")
    if query == "Exit":
        break
    result = qa(query)
    print(query)
    print(result)


AttributeError: 'FieldInfo' object has no attribute 'handlers'

In [33]:
pip install --upgrade langchain

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.
