In [None]:
!pip install --user langchain python-dotenv google-generativeai langchain-google-genai langchain-community youtube-transcript-api chromadb==0.5.3

In [None]:
!rm -r chroma_db/chroma_db_pdf
!mkdir chroma_db/chroma_db_pdf

In [None]:
import os

# Prefer the key from the GOOGLE_API_KEY environment variable so the secret does not
# have to be typed into (and saved with) the notebook.
# The original placeholder is kept as the fallback value when the variable is not set.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY', 'INSERT THE GOOGLE API KEY')

# Simple Prompt Using Gemini + LangChain

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model that answers the prompt
llm = ChatGoogleGenerativeAI(google_api_key=GOOGLE_API_KEY, model="gemini-pro")

# Prompt template: a short instruction followed by the question.
# The question is substituted for the "{query}" placeholder.
prompt = PromptTemplate.from_template(
    "You are a content student of mathematics. Create a simple explanation about {query}"
)

# Chain that fills in the template and sends the result to the model
prompt_chain = LLMChain(prompt=prompt, llm=llm, verbose=True)

In [None]:
# question substituted into the "{query}" placeholder of the prompt template
query = "how does linear function works"

# run the chain with that question and show the model's answer
resp = prompt_chain.run(query=query)
print(resp)

# Retrieval Augmented Generation (RAG)

## Create RAG Prompt Using Gemini + LangChain to Answer Questions Based on YouTube Videos

In [None]:
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.vectorstores import Chroma
from langchain.schema import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain.document_loaders import YoutubeLoader
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.prompt_template import format_document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

In [None]:
def create_retriever_from_youtube_video_url(video_url, k, persist_directory="chroma_db/chroma_db_youtube"):
    """Build a retriever over the transcript of a YouTube video.

    The transcript is loaded with `YoutubeLoader`, split into overlapping chunks,
    embedded with the Gemini embedding model and stored in a Chroma vector store
    persisted under `persist_directory`.

    Args:
        video_url: URL of the YouTube video whose transcript is indexed.
        k: number of chunks the retriever returns per query.
        persist_directory: directory in which Chroma saves its data.

    Returns:
        A vector store retriever configured with `search_kwargs={"k": k}`.
    """
    # retrieve the transcript of the youtube video
    loader = YoutubeLoader.from_youtube_url(video_url)
    transcript = loader.load()

    # split the transcript into smaller documents, each holding one chunk of the transcript
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = text_splitter.split_documents(transcript)

    # embed the chunks and store them in the vector database
    # NOTE(review): the chunks are presumably added to whatever is already stored in
    # persist_directory, so repeated calls may accumulate duplicates - confirm and clear
    # the directory between runs if needed
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
    vectorstore = Chroma.from_documents(
        documents=docs,                      # Data
        embedding=gemini_embeddings,         # Embedding model
        persist_directory=persist_directory  # Directory to save data
    )

    # the store that was just written is queried directly; re-opening the same
    # directory through a second Chroma instance is not needed
    return vectorstore.as_retriever(search_kwargs={"k": k})

def format_docs(docs):
    """Concatenate the `page_content` of every document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

def get_response_from_query(retriever, query):
    """Answer a question about a YouTube video from the retrieved transcript chunks.

    Args:
        retriever: retriever over the embedded transcript.
        query: the question to answer.

    Returns:
        A tuple `(resp, docs)`: the model's answer as a string and the
        retrieved documents that were placed in the prompt.
    """
    # retrieve the documents most similar to the query - once, so that the documents
    # returned to the caller are exactly the ones the model was shown
    docs = retriever.invoke(query)

    # create an instance of the gemini model
    llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

    # prompt text for the model
    prompt = PromptTemplate(
        input_variables=["query", "docs"],
        template="""
        You are a helpful assistant that can answer questions about youtube videos based on the video's transcript.
        
        Answer the following question: {query}
        By searching the following video transcript: {docs}
        
        Only use the factual information from the transcript to answer the question.
        
        If you feel like you don't have enough information to answer the question, say "I don't know".
        
        Your answers should be verbose and detailed.
        """
    )

    # chain of steps: fill the prompt, call the model, return plain text
    rag_chain = prompt | llm | StrOutputParser()

    # the retrieved documents are formatted with "format_docs" and passed in with the query
    resp = rag_chain.invoke({"docs": format_docs(docs), "query": query})

    return resp, docs

In [None]:
# video whose transcript is indexed, and number of chunks fetched per question
video_url = 'https://www.youtube.com/watch?v=yLsygYydHas'
k = 1

# build the retriever over the embedded transcript
retriever = create_retriever_from_youtube_video_url(video_url=video_url, k=k)

In [None]:
resp, docs = get_response_from_query(retriever, 'Tell me something about mathematics')
print(resp, '\n')
# plain loop instead of a list comprehension: the comprehension was built only for its
# side effect and, as the cell's last expression, was itself displayed as a list of None
for d in docs:
    print(d, '\n')

In [None]:
resp, docs = get_response_from_query(retriever, "Why does the house of the dragon showrunner talks about daemon's visions")
print(resp, '\n')
# plain loop instead of a list comprehension: the comprehension was built only for its
# side effect and, as the cell's last expression, was itself displayed as a list of None
for d in docs:
    print(d, '\n')

## Create RAG Prompt Using Gemini + LangChain to Answer Questions Based on CSV Files

In [None]:
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.vectorstores import Chroma
from langchain.schema import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain_community.document_loaders import CSVLoader
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

In [None]:
def create_retriever_from_csv_files(csv_file_path, k, persist_directory="chroma_db/chroma_db_csv_files"):
    """Build a retriever over the rows of a CSV file.

    The file is loaded with `CSVLoader`, the resulting documents are embedded
    with the Gemini embedding model and stored in a Chroma vector store
    persisted under `persist_directory`.

    Args:
        csv_file_path: path of the CSV file to index.
        k: number of documents the retriever returns per query.
        persist_directory: directory in which Chroma saves its data.

    Returns:
        A vector store retriever configured with `search_kwargs={"k": k}`.
    """
    # load the csv file into documents
    loader = CSVLoader(file_path=csv_file_path)
    docs = loader.load()

    # embed the documents and store them in the vector database
    # NOTE(review): the documents are presumably added to whatever is already stored in
    # persist_directory, so repeated calls may accumulate duplicates - confirm and clear
    # the directory between runs if needed
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
    vectorstore = Chroma.from_documents(
        documents=docs,                      # Data
        embedding=gemini_embeddings,         # Embedding model
        persist_directory=persist_directory  # Directory to save data
    )

    # the store that was just written is queried directly; re-opening the same
    # directory through a second Chroma instance is not needed
    return vectorstore.as_retriever(search_kwargs={"k": k})

def format_docs(docs):
    """Join the answer part of every document, separated by a blank line.

    The answer is everything after the first 'Answer: ' marker in `page_content`
    (assumes the indexed CSV has an `Answer` column - TODO confirm against the data).
    A document without the marker contributes its whole text instead of raising
    IndexError, and an answer that itself contains the marker is no longer truncated.
    """
    def extract_answer(text):
        _, marker, answer = text.partition('Answer: ')
        return answer if marker else text

    return "\n\n".join(extract_answer(d.page_content) for d in docs)

def get_response_from_query(retriever, query):
    """Answer a question about the profile from the retrieved CSV rows.

    Args:
        retriever: retriever over the embedded CSV documents.
        query: the question to answer.

    Returns:
        A tuple `(resp, docs)`: the model's answer as a string and the
        retrieved documents that were placed in the prompt.
    """
    # retrieve the documents most similar to the query - once, so that the documents
    # returned to the caller are exactly the ones the model was shown
    docs = retriever.invoke(query)

    # create an instance of the gemini model
    llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

    # prompt text for the model
    prompt = PromptTemplate(
        input_variables=["query", "docs"],
        template="""
        You are a helpful assistant that can answer questions about Ariabagus's profile
        based on the informations provided by Ariabagus.
        
        Answer the following question: {query}
        By searching through these informations: {docs}
        
        Only use the factual information from the provided informations to answer the question.
        
        If you feel like you don't have enough information to answer the question, say "I don't know".
        
        Your answers should be verbose and detailed.
        """
    )

    # chain of steps: fill the prompt, call the model, return plain text
    rag_chain = prompt | llm | StrOutputParser()

    # the retrieved documents are formatted with "format_docs" and passed in with the query
    resp = rag_chain.invoke({"docs": format_docs(docs), "query": query})

    return resp, docs

In [None]:
# index the rows of the CSV file; three documents are fetched per question
retriever = create_retriever_from_csv_files(csv_file_path='train_data.csv', k=3)

In [None]:
resp, docs = get_response_from_query(retriever, 'What is your name?')
print(resp, '\n')
# plain loop instead of a list comprehension: the comprehension was built only for its
# side effect and, as the cell's last expression, was itself displayed as a list of None
for d in docs:
    print(d, '\n')

In [None]:
resp, docs = get_response_from_query(retriever, 'Which university do you go for study?')
print(resp, '\n')
# plain loop instead of a list comprehension: the comprehension was built only for its
# side effect and, as the cell's last expression, was itself displayed as a list of None
for d in docs:
    print(d, '\n')

In [None]:
resp, docs = get_response_from_query(retriever, 'What is your name and at which university do you go for study?')
print(resp, '\n')
# plain loop instead of a list comprehension: the comprehension was built only for its
# side effect and, as the cell's last expression, was itself displayed as a list of None
for d in docs:
    print(d, '\n')

## Create RAG Prompt Using Gemini + LangChain to Answer Questions Based on Data From Hugging Face

In [None]:
from tqdm.notebook import tqdm
from datasets import load_dataset
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.vectorstores import Chroma
from langchain.schema import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.docstore.document import Document as LangchainDocument
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

In [None]:
def transform_dataset_into_proper_format(row):
    """Render one dataset row as a single labelled text.

    Reads the 'instruction', 'input' and 'output' fields of `row` and returns
    `{'prompt': text}`, where `text` holds the three fields as
    "Instruction:", "Input:" and "Output:" sections separated by blank lines.
    """
    sections = (
        ("Instruction", row['instruction']),
        ("Input", row['input']),
        ("Output", row['output']),
    )
    reformatted_prompt = "\n\n".join(f"{label}:{value}" for label, value in sections)

    return {'prompt': reformatted_prompt}

def convert_huggingface_data_to_documents(dataset_name, n_samples=1000, seed=42):
    """Load a sample of a Hugging Face dataset and convert it into documents.

    The "train" split is shuffled with `seed`, its first `n_samples` rows are
    kept and each row is rendered with `transform_dataset_into_proper_format`.

    Args:
        dataset_name: name of the dataset passed to `load_dataset`.
        n_samples: maximum number of rows to keep; capped at the size of the
            split so that smaller datasets do not make `select` fail.
        seed: seed of the shuffle, so the same rows are chosen on every run.

    Returns:
        A list of documents whose `page_content` is the rendered row and whose
        metadata holds the row's position under the key "index".
    """
    # load and shuffle the dataset
    shuffled = load_dataset(dataset_name, split="train").shuffle(seed=seed)

    # never ask for more rows than the split contains
    n_rows = min(n_samples, len(shuffled))
    dataset = shuffled.select(range(n_rows)).map(transform_dataset_into_proper_format)

    # convert the rows into documents
    docs = [LangchainDocument(page_content=row["prompt"], metadata={"index": i}) for i, row in enumerate(tqdm(dataset))]

    return docs

def create_retriever_from_documents(docs, k, persist_directory="chroma_db/chroma_db_huggingface"):
    """Build a retriever over already prepared documents.

    The documents are embedded with the Gemini embedding model and stored in a
    Chroma vector store persisted under `persist_directory`.

    Args:
        docs: the documents to index.
        k: number of documents the retriever returns per query.
        persist_directory: directory in which Chroma saves its data.

    Returns:
        A vector store retriever configured with `search_kwargs={"k": k}`.
    """
    # embed the documents and store them in the vector database
    # NOTE(review): the documents are presumably added to whatever is already stored in
    # persist_directory, so repeated calls may accumulate duplicates - confirm and clear
    # the directory between runs if needed
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
    vectorstore = Chroma.from_documents(
        documents=docs,                      # Data
        embedding=gemini_embeddings,         # Embedding model
        persist_directory=persist_directory  # Directory to save data
    )

    # the store that was just written is queried directly; re-opening the same
    # directory through a second Chroma instance is not needed
    return vectorstore.as_retriever(search_kwargs={"k": k})

def format_docs(docs):
    """Join the output section of every document, separated by a blank line.

    The output section is everything after the first blank-line-separated
    "Output:" label, which is the layout written by
    `transform_dataset_into_proper_format`. Code that itself contains the text
    "Output:" is therefore kept whole, and an "Output:" inside the instruction
    text is not mistaken for the label. Documents without that layout fall back
    to the first bare "Output:" label, and documents without any label
    contribute their whole text instead of raising IndexError.
    """
    def extract_output(text):
        _, label, output = text.partition("\n\nOutput:")
        if not label:
            # not in the blank-line layout; try the bare label
            _, label, output = text.partition("Output:")
        return output if label else text

    return "\n\n".join(extract_output(d.page_content) for d in docs)

def get_response_from_query(retriever, query):
    """Generate Python code for a task from retrieved examples of similar tasks.

    Args:
        retriever: retriever over the embedded example documents.
        query: description of the task to solve.

    Returns:
        A tuple `(resp, docs)`: the model's answer as a string and the
        retrieved documents that were placed in the prompt.
    """
    # retrieve the documents most similar to the query - once, so that the documents
    # returned to the caller are exactly the ones the model was shown
    docs = retriever.invoke(query)

    # create an instance of the gemini model
    llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

    # prompt text for the model
    prompt = PromptTemplate(
        input_variables=["query", "docs"],
        template="""
        You are a helpful assistant that can generate a Python code to solve certain task
        utilizing the given Python codes that solves similar task
        
        Generate a Python code that solves this task: {query}
        Utilizing informations acquired from these Python Codes: {docs}
        
        Utilize these informations to develop the desired Python code
        
        If you feel like you don't have enough information to answer the question, say "I don't know"
        """
    )

    # chain of steps: fill the prompt, call the model, return plain text
    rag_chain = prompt | llm | StrOutputParser()

    # the retrieved documents are formatted with "format_docs" and passed in with the query
    resp = rag_chain.invoke({"docs": format_docs(docs), "query": query})

    return resp, docs

In [None]:
# dataset of Python coding instructions used as the knowledge base
dataset_name = "iamtarun/python_code_instructions_18k_alpaca"

# convert a sample of the dataset into documents, then index them;
# five documents are fetched per question
docs = convert_huggingface_data_to_documents(dataset_name)
retriever = create_retriever_from_documents(docs=docs, k=5)

In [None]:
query = 'Create a for loop in Python that prints the output of a multiplication table for numbers from 1 to 10.'
resp, docs = get_response_from_query(retriever, query)
print(resp, '\n')
# plain loop instead of a list comprehension: the comprehension was built only for its
# side effect and, as the cell's last expression, was itself displayed as a list of None
for d in docs:
    print(d, '\n')

## Create RAG Prompt Using Gemini + LangChain to Answer Questions Based on Data From a PDF File

In [None]:
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.vectorstores import Chroma
from langchain.schema import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema.prompt_template import format_document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

In [None]:
def create_retriever_from_pdf_file(pdf_filename, k, persist_directory="chroma_db/chroma_db_pdf"):
    """Build a retriever over the text of a PDF file.

    The file is loaded with `PyPDFLoader`, split into overlapping chunks,
    embedded with the Gemini embedding model and stored in a Chroma vector
    store persisted under `persist_directory`.

    Args:
        pdf_filename: path of the PDF file to index.
        k: number of chunks the retriever returns per query.
        persist_directory: directory in which Chroma saves its data.

    Returns:
        A vector store retriever configured with `search_kwargs={"k": k}`.
    """
    # load the pages unsplit: the chunking is done once, by the splitter below,
    # instead of pre-splitting with load_and_split() and then splitting again
    loader = PyPDFLoader(pdf_filename)
    pages = loader.load()

    # split the text into smaller documents, each holding one chunk of the text
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    docs = text_splitter.split_documents(pages)

    # embed the chunks and store them in the vector database
    # NOTE(review): the chunks are presumably added to whatever is already stored in
    # persist_directory, so repeated calls may accumulate duplicates - confirm and clear
    # the directory between runs if needed
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
    vectorstore = Chroma.from_documents(
        documents=docs,                      # Data
        embedding=gemini_embeddings,         # Embedding model
        persist_directory=persist_directory  # Directory to save data
    )

    # the store that was just written is queried directly; re-opening the same
    # directory through a second Chroma instance is not needed
    return vectorstore.as_retriever(search_kwargs={"k": k})

def format_docs(docs):
    """Return the text of all documents as one string, with a blank line between documents."""
    parts = []
    for document in docs:
        parts.append(document.page_content)
    return "\n\n".join(parts)

def get_response_from_query(retriever, query):
    """Answer a machine-learning question from the retrieved PDF chunks.

    Args:
        retriever: retriever over the embedded PDF chunks.
        query: the question to answer.

    Returns:
        A tuple `(resp, docs)`: the model's answer as a string and the
        retrieved documents that were placed in the prompt.
    """
    # retrieve the documents most similar to the query - once, so that the documents
    # returned to the caller are exactly the ones the model was shown
    docs = retriever.invoke(query)

    # create an instance of the gemini model
    llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)

    # prompt text for the model
    prompt = PromptTemplate(
        input_variables=["query", "docs"],
        template="""
        You are a helpful assistant that can answer questions about machine learning.
        
        Answer the following question: {query}
        By searching the following informations: {docs}
        
        Only use the factual information from the given information to answer the question.
        
        If you feel like you don't have enough information to answer the question, say "I don't know".
        
        Your answers should be verbose and detailed.
        """
    )

    # chain of steps: fill the prompt, call the model, return plain text
    rag_chain = prompt | llm | StrOutputParser()

    # the retrieved documents are formatted with "format_docs" and passed in with the query
    resp = rag_chain.invoke({"docs": format_docs(docs), "query": query})

    return resp, docs

In [None]:
# path of the PDF to index - replace the placeholder with a real file name
pdf_filename = "FILENAME"

# index the PDF; three chunks are fetched per question
retriever = create_retriever_from_pdf_file(pdf_filename=pdf_filename, k=3)

In [None]:
resp, docs = get_response_from_query(retriever, 'What is machine learning')
print(resp, '\n')
# plain loop instead of a list comprehension: the comprehension was built only for its
# side effect and, as the cell's last expression, was itself displayed as a list of None
for d in docs:
    print(d, '\n')