In [2]:
!pip install transformers sentence-transformers langchain langchain-community chromadb pypdf
!pip install torch --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu


In [3]:
import random
import uuid

import chromadb
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [4]:
# Loads a local language model from the specified model path and creates a pipeline for text generation.
def load_local_llm(model_path, device='cpu', max_new_tokens=200):
    """Load a causal language model from disk and wrap it in a text-generation pipeline.

    Args:
        model_path: Path passed to ``from_pretrained`` for both the tokenizer
            and the model.
        device: Device the pipeline runs on. Defaults to ``'cpu'``, which is
            what this function previously hard-coded.
        max_new_tokens: Upper bound on the number of tokens generated per call.
            Defaults to 200, the previously hard-coded value.

    Returns:
        The object returned by ``transformers.pipeline('text-generation', ...)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    # Pass the device to the pipeline itself instead of only moving the model:
    # the pipeline then knows where the model lives and no longer warns that no
    # `device` argument was given.
    return pipeline(
        'text-generation',
        model=model,
        tokenizer=tokenizer,
        device=device,
        max_new_tokens=max_new_tokens,
    )

In [5]:
# Build the text-generation pipeline from the model directory stored on this machine.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
llm_pipeline = load_local_llm(
    model_path='C:/Users/badel/Desktop/practicus/llm_model/qwen',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [6]:
# Load PDF content
def load_pdf(pdf_path):
    """Read a PDF through ``PyPDFLoader`` and return the documents it yields.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The result of ``PyPDFLoader(pdf_path).load_and_split()``.
    """
    # NOTE(review): load_and_split() presumably applies the loader's default
    # splitter on top of page extraction — confirm that is intended, given that
    # the result is chunked again by split_text() further down.
    return PyPDFLoader(pdf_path).load_and_split()

In [7]:
# Load the source PDF (path is relative to the notebook's working directory).
# Despite the name, `pdf_text` holds whatever load_pdf() returns, not a plain string.
pdf_text = load_pdf(pdf_path='HAVAYOLU FİRMASI.pdf')

In [8]:
# Splits the given documents into chunks of a given size with a configurable overlap between chunks
def split_text(text, chunk_size=500, chunk_overlap=50):
    """Break documents into smaller, overlapping chunks.

    Args:
        text: The documents to split. The value is handed to
            ``split_documents``, so despite the parameter name it is a
            collection of documents rather than a raw string.
        chunk_size: Chunk size given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap given to ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    splitter_options = {'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = splitter.split_documents(text)
    return chunks

In [9]:
# Chunk the loaded PDF documents using split_text()'s default size and overlap.
text_chunks = split_text(text=pdf_text)

In [10]:
# Display the chunks produced by split_text() for a visual sanity check.
text_chunks

[Document(metadata={'source': 'HAVAYOLU FİRMASI.pdf', 'page': 0}, page_content="HAVAYOLU FİRMASI:  SkyLink Airlines  \nKURULUŞ YILI:  2000  \nGENEL MERKEZ:  New York, ABD  \n \nHİZMET VERİLEN DESTİNASYONLAR  \nKuzey Amerika: New York, Los Angeles, Chicago, Toronto  \nAvrupa: Londra, Paris, Frankfurt, Roma  \nAsya -Pasifik: Tokyo, Singapur, Hong Kong, Sydney  \nOrta Doğu: Dubai, Doha, Riyad  \n \nFİLO YAPISI  \nSkyLink Airlines'ın modern ve çeşitli bir filosu vardır, toplamda 60 uçak bulunmaktadır. Filo şu \nmodellerden oluşmaktadır:"),
 Document(metadata={'source': 'HAVAYOLU FİRMASI.pdf', 'page': 0}, page_content='modellerden oluşmaktadır:  \nBoeing 787 Dreamliner:  250 koltuk kapasiteli, yakıt verimliliği ve yolcu konforu sa ğlayan \nmodern bir uçak modeli.  \nAirbus A350:  300 koltuk kapasiteli, geniş iç mekânı ve ileri teknoloji özellikleri ile yolculara \nyüksek konfor sunan bir model.  \nBoeing 737:  160 koltuk kapasiteli, kısa ve orta mesafe uçuşlar için ideal olan ekonomik bir \

In [11]:
# Generate embeddings and create vector store
def create_vector_store(chunks, embeddings_model_path, k=5, device='cpu', collection_name=None):
    """Embed document chunks into a Chroma collection and return a retriever over it.

    Args:
        chunks: Documents to index; passed to ``Chroma.from_documents``.
        embeddings_model_path: Model name or path given to ``HuggingFaceEmbeddings``.
        k: Number of results the retriever is configured to return per query.
            Defaults to 5, the previously hard-coded value.
        device: Device for the embedding model. Defaults to ``'cpu'``, the
            previously hard-coded value.
        collection_name: Name of the Chroma collection. When omitted, a fresh
            random name is generated so repeated calls do not share a collection.

    Returns:
        The similarity retriever obtained from the vector store's ``as_retriever``.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name=embeddings_model_path,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': False},
    )

    if collection_name is None:
        # A UUID hex string is purely alphanumeric and has a fixed length,
        # unlike the string form of a random float used previously.
        collection_name = uuid.uuid4().hex

    vectorstore_pdf = Chroma.from_documents(
        collection_name=collection_name,
        documents=chunks,
        embedding=embeddings,
    )
    return vectorstore_pdf.as_retriever(search_type="similarity", search_kwargs={"k": k})

In [12]:
# Index the chunks with the sentence-transformers model stored on this machine.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
retriever_pdf = create_vector_store(
    chunks=text_chunks,
    embeddings_model_path=r'C:\Users\badel\Desktop\practicus\llm_model\sentence-transformers\all-mpnet-base-v2',
)

  warn_deprecated(


In [13]:
def format_docs(docs):
    """Concatenate the text of several documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line, in input order.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [14]:
def query_pdf(retriever_pdf, llm_pipeline, question):
    """Answer a question using retrieved context and a local text-generation pipeline.

    Args:
        retriever_pdf: Retriever piped into ``format_docs`` to build the
            ``context`` part of the prompt.
        llm_pipeline: Text-generation pipeline, wrapped here in
            ``HuggingFacePipeline``.
        question: The question; it is used both as the retrieval query and as
            the ``question`` part of the prompt.

    Returns:
        The chain output after its last ``'Answer:'`` marker, stripped of
        surrounding whitespace. If the marker is absent, the whole stripped
        output is returned.
    """
    llm = HuggingFacePipeline(pipeline=llm_pipeline)

    # Single binding; the previous `prompt = prompt_template = ...` left an unused alias.
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. \nQuestion: {question} \nContext: {context} \nAnswer:"
    )

    # The dict step fans the question out: once through retrieval and formatting
    # for the context, once unchanged as the question itself.
    rag_chain = (
        {"context": retriever_pdf | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    raw_output = rag_chain.invoke(question)
    # Presumably the generated text can echo the prompt, which ends with
    # 'Answer:' — keep only what follows the final marker.
    return raw_output.strip().split('Answer:')[-1].strip()

In [15]:
# Ask a sample question against the indexed PDF.
answer = query_pdf(
    retriever_pdf=retriever_pdf,
    llm_pipeline=llm_pipeline,
    question="What is the year the company was founded?",
)

  warn_deprecated(


In [16]:
# Show the extracted answer text returned by query_pdf().
answer

'2000\nThe company was founded in 2000.\nThe answer is:\n2000.'