In [1]:
import os 
import openai
%pip install -U openai==0.28.1 

from langchain.chat_models import ChatOpenAI
#instantiate a chat
llm = ChatOpenAI()
#llm.predict('How are you?')

#check version 
openai.__version__
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

api_key=os.getenv('OPENAI_API_KEY') #set as an environmental variable
#openai.api_key=('personal key')
#Load credits

In [2]:
#pip install langchain==0.0.331 --ignore-installed PyYAML
import langchain 
langchain.__version__
from langchain.document_loaders import PyPDFLoader
from langchain.chains import ConversationChain 

#prompt a new chained conversation with the LLM
chat_model=ChatOpenAI()

chain=ConversationChain(
    llm=chat_model,
    verbose=True
)

chain.run("How are you today?")

chain.run("What was my current conversation?")

Prompt Templates

In [None]:
from langchain.prompts import PromptTemplate

template="""
Return all the subcategories of the following category

{category}
"""

prompt= PromptTemplate(
    input_variables=["category"],
    template=template
)

prompt

In [None]:
from langchain.chains import LLMChain 
from langchain.prompts import (
    SystemMessagePromptTemplate, 
    HumanMessagePromptTemplate, 
    ChatPromptTemplate
)

system_template = """
You are a helpful assistant who generates comma separated lists.
The user will only pass a category and you should generate subcategories
ONLY return comma separated and nothing more!
"""

human_template='{category}'

system_message=SystemMessagePromptTemplate.from_template(system_template)

human_message=HumanMessagePromptTemplate.from_template(
    human_template
)

prompt=ChatPromptTemplate.from_messages([
    system_message, human_message
])

chain=LLMChain(
    llm=chat_model,
    prompt=prompt,
    verbose=True
)

chain.run("Machine Learning")

Output parser

In [None]:
from langchain.schema import BaseOutputParser

class CommaSeparatedParser(BaseOutputParser):
    def parse(self,text):
        #remove whitespaces, and split strings with commas inbetween
        output=text.strip().split(',')
        output=[o.strip() for o in output]
        return output

#the output should be a more clean list
chain=LLMChain(
    llm=chat_model,
    prompt=prompt,
    output_parser=CommaSeparatedParser(),
    verbose=True
)

input_list=[
    {'category':'food'},
    {'category':'country'},
    {'category':'colors'}
]

#chained response of categories option
response=chain.apply(input_list)

Simple Sequence

In [None]:
title_template="""
You are a writer. Given a subject, your job is to return a fun title for a play

Subject: {subject}
Title:"""

title_chain=LLMChain.from_string(
    llm=chat_model,
    template=title_template
)

title_chain.run("Machine Learning")

In [None]:
synopsis_template="""
You are a writer.
Given a title, write a synopsis for a play.

Title: {title}
Synopsis:
"""

synopsis_chain=LLMChain.from_string(
    llm=chat_model,
    template=synopsis_template
)

#input the outputs of the original title into the synposis chain
title="Generated Title"

synopsis_chain.run(title)

In [None]:
#Sequential chain version
from langchain.chains import SimpleSequentialChain

chain=SimpleSequentialChain(
    chains=[title_chain, synopsis_chain],
    verbose=True
)

chain.run("Machine Learning")


Summarizing

In [None]:
print(chain.refine_llm_chain.prompt.template)

initial_template="""
Extract the most relevant themes from the following:

"{text}"

THEMES:"""

refine_template="""
Your job is to extract the most relevant themes
We have provided an existing list of themes up to a certain point: {existing_answer}
We have the opportunity to refine the existing list (only if needed) with some context below
------------
{text}
------------
Given the new context, refine the original list
If the context isn't useful, return the original list and ONLY the original list.
Return that list as a comma separated list.

LIST:"""

initial_prompt=PromptTemplate.from_template(initial_template)
refine_prompt=PromptTemplate.from_template(refine_template)

In [None]:
#summarizing data sources
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI 

llm=ChatOpenAI()
chain=load_summarize_chain(
    llm=llm,
    chain_type="refine", #use "map_reduce" for more complex documents
    question_prompt=initial_prompt,
    refine_prompt=refine_prompt,
    verbose=True
)

%pip install langchain openai tqdm jq unstructured pypdf tiktoken 

from langchain.document_loaders import (
    UnstructuredCSVLoader,
    UnstructuredHTMLLoader,
    UnstructuredImageLoader,
    PythonLoader,
    PyPDFLoader,
    JSONLoader
)

from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import DirectoryLoader 

#file_path='csv file path'
csv_loader=CSVLoader(file_path=file_path)
compiled_data=csv_loader.load()
compiled_data[0].page_content 

In [12]:
#load documents
file_path="/Users/anthonychen/Desktop/Big data and social science a practical guide to methods and tools by Ian Foster, Rayid Ghani, Ron S. Jarmin, Frauke Kreuter, Julia Lane.pdf"
sl_loader=PyPDFLoader(file_path)
#split into chunks for LLM processing
data_chunks=loader.load_and_split() #uses recursive character text splitter

In [None]:
#map reduce strategy for large amounts of data for LLM to process
#or refine chunks with separate summaries & combine at the end

from langchain.text_splitter import(
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter
)

splitter1=CharacterTextSplitter(
    chunk_size=1000, #1k characters
    chunk_overlap=0, 
)

#more chunks than previous:
splitter2=RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)

s1_data1=sl_loader.load_and_split(text_splitter=splitter1)
s1_data2=sl_loader.load_and_split(text_splitter=splitter2)


In [None]:
#summary works better on csv file (when unrefined)
#chain.run(compiled_data[:5])
chain.run(sl_data1[:20])

In [None]:
#folder_path=""
mixed_loader=DirectoryLoader(
    path=folder_path,
    use_multithreading=True,
    show_progress=True
)

mixed_data=mixed_loader.load_and_split()

In [13]:
pages=loader.load()

In [14]:
len(pages)

377

In [17]:
page=pages[200]
print(page.page_content[:500])

180 6. M achine Learning
the expected value of the predictions of a classiﬁer and select t he
model that optimizes this cost-sensitive metric.
6.7 Practical tips
Here we highlight some practical tips that will be helpful when w ork-
ing with machine learning methods.
6.7.1 Features
So far in this chapter, we have focused a lot on methods and pro-
ces
s, and we have not discussed features in detail. In social science,
they are not called features but instead are known as variables or
predictors. 


In [19]:
page.metadata

{'source': '/Users/anthonychen/Desktop/Big data and social science a practical guide to methods and tools by Ian Foster, Rayid Ghani, Ron S. Jarmin, Frauke Kreuter, Julia Lane.pdf',
 'page': 200}

In [3]:
#Youtube
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [4]:
#brew install ffprobe and ffmpeg; youtube set rate limits
url="https://www.youtube.com/watch?v=Ffl8b_GfJ-M&ab_channel=TobiasFischer"
save_dir="/Users/anthonychen/Documents/Youtube/"
loader=GenericLoader(YoutubeAudioLoader([url],save_dir), OpenAIWhisperParser())
docs=loader.load()

[youtube] Extracting URL: https://www.youtube.com/watch?v=Ffl8b_GfJ-M&ab_channel=TobiasFischer
[youtube] Ffl8b_GfJ-M: Downloading webpage
[youtube] Ffl8b_GfJ-M: Downloading ios player API JSON
[youtube] Ffl8b_GfJ-M: Downloading android player API JSON
[youtube] Ffl8b_GfJ-M: Downloading m3u8 information
[info] Ffl8b_GfJ-M: Downloading 1 format(s): 140
[download] /Users/anthonychen/Documents/Youtube//Stable Diffusion Consistent Character Animation Technique - Tutorial.m4a has already been downloaded
[download] 100% of   31.65MiB
[ExtractAudio] Not converting audio /Users/anthonychen/Documents/Youtube//Stable Diffusion Consistent Character Animation Technique - Tutorial.m4a; file is already in target format m4a
Transcribing part 1!


RateLimitError: You exceeded your current quota, please check your plan and billing details.

In [6]:
#website chatting
from langchain.document_loaders import WebBaseLoader
loader=WebBaseLoader("https://adriangcoder.medium.com/pandas-tricks-and-tips-a7b87c3748ea")
docs=loader.load()

In [7]:
print(docs[0].page_content[:500]) #need to do postprocessing on info to get workable format

Pandas for time series data — tricks and tips | by Adrian G | MediumPandas for time series data — tricks and tipsAdrian G·Follow7 min read·Oct 24, 2018--2ListenShareThere are some Pandas DataFrame manipulations that I keep looking up how to do. I am recording these here to save myself time. These may help you too.Time series dataConvert column to datetime with given formatdf[‘day_time’] = pd.to_datetime(df[‘day_time’], format=’%Y-%m-%d %H:%M:%S’)0 2012–10–12 00:00:001 2012–10–12 00:30:002 2012–1


In [8]:
#notion databases
from langchain.document_loaders import NotionDirectoryLoader
loader=NotionDirectoryLoader("docs/Notion_DB")
docs=loader.load()

In [9]:
print(docs[0].page_content[0:200])

IndexError: list index out of range

**Few-Shot Learning**

As you feed the model 10 or more examples, the accuracy improves drastically

In [None]:
from langchain.prompts import (
    FewShotChatMessagePromptTemplate,
    ChatPromptTemplate,
)

#insert a few examples
examples=[
    {"input":"" , "output":""},
    {"input":"" , "output":""},
    {"input":"" , "output":""}
]

example_prompt=ChatPromptTemplate.from_messages(
    [
        ("human","{input}"),
        ("ai","{output}")
    ]
)

few_shot_prompt=FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

print(few_shot_prompt.format())


In [None]:
final_prompt=ChatPromptTemplate.from_messages(
    [
        ("system", "You are wonderous wizard of math."),
        few_shot_prompt,
        ("human", "{input}"),
    ]
)

#Input Question
print(final_prompt.format(input=''))

In [None]:
chain=LLMChain(
    llm=chat_model,
    prompt=final_prompt,
    verbose=True
)

chain.run()

**Memetic Proxy**

In [None]:
from langchain.prompts import PromptTemplate

template="""
System: (reference)
Provide a helpful answer to the following question:

Human: {question}

AI:
"""

prompt=PromptTemplate.from_template(template)

chain=LLMChain(
    llm=chat_model,
    prompt=prompt,
    verbose=True
)

high_level="Imagine you are a Professor teaching at the PhD level"
lower_level="Imagine you are a kindergarten teacher"

question="Explain Quantum Mechanics"

chain.run(
    {
        'question':question,
        'reference':high_level
    }
)


Indexing Data

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings=OpenAIEmbeddings(show_progress_bar=True)

vector1=embeddings.embed_query('How are you?')

#embeddings.__dict__

In [None]:
import numpy as np
from numpy.linalg import norm 

def get_cosine(vec1, vec2):
    return np.dot(vec1,vec2)/(norm(vec1)*norm(vec2))

vector1=embeddings.embed_query('machine learning')
vector2=embeddings.embed_query('artificial intelligence')
cosine_sim=get_cosine(vector1, vector2)


In [None]:
from langchain.vectorstores import FAISS 

#choose document data to load
index=FAISS.from_documents(data, embeddings)

index.similarity_search_with_relevance_scores("What is machine learning?")

Vector Database retrieval

Question converted to embedding, then search within index (created with Pinecone) of the vector database to get nearest neighbors, pass them into a prompt, provide prompt to LLM and get answer to user

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

retriever=index.as_retriever()
#number of vectors to retrieve
retriever.search_kwargs['fetch_k']=20
#diversify the information provided to LLM
retriever.search_kwargs['maximal_marginal_relevance']=True
#final number of data context vectors provided
retriever.search_kwargs['k']=10

llm=ChatOpenAI()

chain=RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True
)

chain.run("What is machine learning?",
          callbacks=[handler])

Load data into vector database

In [None]:
%pip install pinecone-client

In [None]:
import pinecone
from langchain.vectorstores import Pinecone

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

#name originally input in pinecone
index_name=''
db=Pinecone.from_documents(
    data, #context provided from document loader
    embeddings,
    index_name=index_name
)

chain=RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True
)

chain.run("What is machine learning?",
          callbacks=[handler])

Display sources for LLM output

In [None]:
%pip install newsapi-python

In [None]:
from datetime import date, timedelta
from newsapi import NewsApiClient

newsapi=NewsApiClient(api_key=NEWS_API_KEY)
today=date.today()
last_week=today-timedelta(days=7)

#dictionary of responses 
latest_news=newsapi.get_everything(
    q='artifical intelligence',
    from_param=last_week.strftime('%Y-%m-%d'),
    to=today.strftime('%Y-%m-%d'),
    sort_by='relevancy',
    language='en'
)

In [None]:
from langchain.docstore.document import Document 
docs=[
    Document( 
    page_content=article['title']+'\n\n'+article['description'],
    metadata={
        'source':article['url'],
    }
        , ) for article in latest_news['articles']
]

In [None]:
from langchain.chains import create_qa_with_sources_chain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

qa_chain=create_qa_with_sources_chain(llm)

doc_prompt=PromptTemplate(
    template='Content: {page_content}\nSource:{source}',
    input_variables=['page_content','source'],
)

final_qa_chain=StuffDocumentsChain(
    llm_chain=qa_chain,
    document_variable_name="context",
    document_prompt=doc_prompt,
)

index=FAISS.from_documents(docs, )

chain=RetrievalQA(
    retriever=index.as_retriever(),
    combine_documents_chain=final_qa_chain
)



In [None]:
question="""What is the most important news about artificial intelligence in the last week?"""

answer=chain.run(question)

print(answer)

Indexing from a website

In [None]:
%pip install apify-client chromadb #webcrawler & local vector db 

In [None]:
from langchain.utilities import ApifyWrapper
from langchain.document_loeaders.base import Document 

apify=ApifyWrapper()

loader=apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input={
        "startUrls":[{"url":""}], #insert url 
        "aggressivePrune":True,
    },
    dataset_mapping_function=lambda item: Document(
        page_content=item['text'] or "", metadata={"source":item['url']}
    ),
)

In [None]:
from langchain.indexes import VectorstoreIndexCreator

text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0
)

index=VectorstoreIndexCreator(
    text_splitter=text_splitter
).from_loaders([loader])

index

In [None]:
query="What is the main subject of this ..."

index.query_with_sources(query)

In [None]:
#retriever=index.vectorstore.as_retriever()

Indexing GitHub Repo

In [None]:
from langchain.document_loaders import GitLoader 

loader=GitLoader(
    clone_url="https://github.com/langchain-ai/langchain",
    repo_path="./data/repo/",
    file_filter=lambda file_path: file_path.endswith(".py"),
    branch='master',
)

documents=loader.load()

In [None]:
from langchain.text_splitter import Language

python_splitter=RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=1000,
    chunk_overlap=200,
)

documents=python_splitter.split_documents(documents)

In [None]:
index=FAISS.from_documents(documents, embeddings)
retriever=index.as_retriever()

#distance metric
retriever.search_kwargs['distance_metric']='cos'
#number of vectors to retrieve
retriever.search_kwargs['fetch_k']=200
#diversify the information provided to LLM
retriever.search_kwargs['maximal_marginal_relevance']=True
#final number of data context vectors provided
retriever.search_kwargs['k']=10

qa=RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever 
)

Stuff Chain

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

index=Chroma.from_documents(
    docs,
    embeddings=OpenAIEmbeddings()
)

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import StdOutCallbackHandler

llm=ChatOpenAI()

#map_rerank returns answer with highest score
chain=RetrievalQA.from_chain_type(
    llm=llm,
    retriever=index.as_retriever(),
    chain_type='stuff', #map-reduce, refine,map_rerank
    verbose=True
)

chain.run(
    '?', #insert Question
    callbacks=[StdOutcallbackHandler()]
)

RAG Optimization and Multimodal RAG

In [None]:
%pip install -U unstructured-inference onnx pyyesseract python-poppler chromadb

In [None]:
#multivector retrieval
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

file_path=''
loader=PyPDFLoader(file_path=file_path)

text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=0
)

sl_data=loader.load_and_split(text_splitter=text_splitter)
sl_data

In [None]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever

#store vectors
vectorstore=Chroma(
    collection_name="statistical_learning",
    embedding_function=OpenAIEmbeddings()
)

#store data
store=InMemoryStore()
id_key='doc_id'

retriever=MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

In [None]:
import uuid
#create unique ids for each document in the data
doc_ids=[str(uuid.uuid4()) for _ in sl_data]

In [None]:
#10k character semantic information is very diluted
#solution: break into smaller 
child_text_splitter=RecursiveCharacterTextSplitter(chunk_size=400)

all_sub_docs=[]
for i, doc in enumerate(sl_data):
    doc_id=doc_ids[i]
    sub_docs=child_text_splitter.split_documents([doc])
    for sub_doc in sub_docs:
        #sub documents get parent id key
        sub_doc.metadata[id_key]=doc_id
    all_sub_docs.extend(sub_docs)


In [None]:
#pass smaller documents
retriever.vectorstore.add_documents(all_sub_docs)
#pass parent documents
retriever.docstore.mset(list(zip(doc_ids, sl_data)))


In [None]:
#enter text or topic 
retriever.vectorstore.similarity_search("")

#get more relevant documents that are more informative
retriever.get_relevant_documents("")

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

llm=ChatOpenAI(temperature=0)

chain=RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True
)

#insert question
chain.run("")

Hypothetical questions for RAG

In [None]:
from langchain.chains import LLMChain
from langchain.output_parsers import NumberedListOutputParser

prompt="""
Generate a numbered list of 3 hypothetical questions that the below document could be used to answer:

{doc}
"""

llm=ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-16k')

chain=LLMChain.from_string(
    llm=llm,
    template=prompt
)

chain.verbose=True
chain.output_parser=NumberedListOutputParser()

#pick and example
chain.run(sl_data[20].page_content)

Parsing a multimodal document

In [None]:
vectorstore=Chroma(
    collection_name="hypo-questions",
    embedding_function=OpenAIEmbeddings()
)

store=InMemoryStore()
id_key='doc_id'

retriever=MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

doc_ids=[str(uuid.uuid4()) for _ in sl_data]

In [None]:
question_docs=[]
for i, doc in enumerate(sl_data):
    result=chain.run(doc.page_content)
    question_docs.extend([
        Document(
            page_contents=s,
            metadata={id_key: doc_ids[i]}
        ) for s in result
    ])

In [None]:
retriever.vectorstore.add_documents(question_docs)
retriever.docstore.mset(list(zip(doc_ids, sl_data)))

In [None]:
retriever.vectorstore.similarity_search("")

In [None]:
llm=ChatOpenAI(temperature=0)

chain=RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True
)

#insert question
chain.run("")

Summarizing Data

In [None]:
prompt="""
You are an assistant tasked with summarizing tables and text
Give a concise summary of the table or text.

Table or text chunk: {element}
"""

model=ChatOpenAI(temperature=0, model_name='gpt-4')
summarize_chain=LLMChain.from_string(
    llm=model,
    template=prompt
)

In [None]:
table_summaries=summarize_chain.batch(table_elements)
text_summaries=summarize_chain.batch(text_elements)

**Describing images with Llava**

LLAMA model merged with llava models 

- cd llama.cpp 

- mkdir build && cd build && cmake

- cmake --build 

In [None]:
#git clone https://github.com/ggerganov/llama.cpp.git
#pip install git-lfs or brew install
#git lfs install 
#git clone https://huggingface.co/mys/ggml_llava-v1.5-7b
#brew install cmake

%%bash
#Define directory containing images
IMG_DIR=''
TEXT_DIR=''

#loop through each image in directory
for img in "${IMG_DIR}"*.jpg: do
    base_name=$(basename "img" .jpg)

    output_file="${TEXT_DIR}${base_name}.txt"

    #model binaries
    ~m 'output file path' \
    #file structure
    --mmproj 'path'\
    --temp 0.1 'path'\
    -p "Describe image in detail. Be specific about graphs, such as bar plots." \
    --image "$img" > "output_file" \
done

In [None]:
import glob
from PIL import Image

text_path=""
images_path=""

text_list=sorted(glob.glob(text_path + "*.txt"))
img_list=sorted(glob.glob(images_path+'*.jpg'))

logging_header="clip_model_load: total allocated memory: 201.27 MB\n\n"
appendix="main: image encoded in"

img_summaries=[]
for i, text_path in enumerate(text_list):
    with open(text_path, 'r') as file:
        summary=file.read()

    summary=summary.split(logging_header, 1)[1].strip()
    summary=summary.split(appendix, 1)[0].strip()

    img_path=img_list[i]
    img=Image.open(img_path)

    img_summaries.append({
        'summary':summary,
        "image":img
    })

In [None]:
from IPython.display import display

for img_dict in img_summaries:
    display(img_dict['image'])
    print(img_dict['summary'])

**Multimodal RAG Pipeline**

option 1:

Multimodal embedding generation (text or images). Ex: If GPT-4 allowed images. 

option 2:

Convert different images into text descriptions then pass into multimodal LLM


option 3 (available): 

Image converted to text to then answer questions using LLM

In [None]:
#index data into database

def get_docs(text_list, ids):
    return [
        Document(
            page_content=s,
            metadata={id_key: ids[i]}
        ) for i, s in enumerate(text_list)
    ]

doc_ids=[str(uuid.uuid4()) for _ in text_summaries]
text_docs=get_docs(
    [t['elements'] for t in text_summaries],
    doc_ids
)

summary_text_docs=get_docs(
    [t['text'] for t in text_summaries],
    doc_ids
)

table_ids=[str(uuid.uuid4()) for _ in table_summaries]
table_docs=get_docs(
    [t['element'] for t in table_summaries],
    table_ids
)

summary_table_docs=get_docs(
    [t['text'] for t in table_summaries], 
    table_ids
)

img_ids=[str(uuid.uuid4()) for _ in img_summaries]
img_summary_docs=get_docs(
    [i['summary'] for i in img_summaries], 
    img_ids
)

In [None]:
vectorstore=Chroma(
    collection_name='llava_pdf',
    embedding_function=OpenAIEmbeddings()
)

store=InMemoryStore()

retriever=MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

retriever.vectorstore.add_documents(summary_text_docs)
retriever.docstore.mset(list(zip(doc_ids, text_docs)))

retriever.vectorstore.add_documents(summary_table_docs)
retriever.docstore.mset(list(zip(table_ids, table_docs))) 

retriever.vectorstore.add_documents(img_summary_docs)
retriever.docstore.mset(list(zip(img_ids, img_summary_docs)))

In [None]:
#insert question
retriever.vectorstore.similarity_search("")

In [None]:
#insert question to compare 
retriever.get_relevant_documents("")

In [None]:
#finalizing the Multimodal RAG
llm=ChatOpenAI(temperature=0)

chain=RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True
)

#insert question 
chain.run('')

Augmenting LLM with Graph Database (Benefits: Faster retrieval than raw text)

In [None]:
#load the data 
from langchain.document_loaders import PyPDFLoader
file_path=""
book_loader=PyPDFLoader(file_path=file_path)
#list of documents
book_data=book_loader.load_and_split()
book_data[0].page_content


In [None]:
#creating a graph representation
from langchain.indexes import GraphIndexCreator 
from langchain.llms import OpenAI 

llm=OpenAI(temperature=0)
index_creator=GraphIndexCreator(llm=llm)
graph=index_creator.from_text(book_data[20].page_content)

In [None]:
#knowledge triples (subject, predicate, and object)
graph.get_triples()

In [None]:
from IPython.display import SVG
graph.draw_graphviz(path="book.svg")
SVG('book.svg')

In [None]:
#convert entire book into knowledge graph
graphs=[
    index_creator.from_text(doc.page_content)
    for doc in book_data
]
#this creates many different network entity graphs for each element in the list

In [None]:
#merge different graphs
import networkx as nx
graph_nx=graphs[0]._graph
for g in graphs[1:]:
    graph_nx=nx.compose(graph_nx, g._graph)

In [None]:
from langchain.graphs.networkx_graph import NetworkxEntityGraph

graph=NetworkxEntityGraph(graph_nx)
graph

In [None]:
graph.draw_graphviz(path="graph.pdf", prog='fdp')

In [None]:
#curate knowledge base using langchain

from langchain.chains import GraphQAChain
from langchain.chat_models import ChatOpenAI

llm=ChatOpenAI(temperature=0)

chain=GraphQAChain.from_llm(
    llm=llm,
    graph=graph,
    verbose=True
)

question='Question specific to the document source'

chain.run(question)

In [None]:
from langchain.chains import GraphCypherQAChain
from langchain.chat_models import ChatOpenAI 

#query language for graph db by neo4j
cypher_llm=ChatOpenAI(temperature=0, model_name='gpt-4')
qa_llm=ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo')

chain=GraphCypherQAChain.from_llm(
    cypher_llm=cypher_llm,
    qa_llm=qa_llm,
    graph=graph_db,
    verbose=True,
)


Augmenting LLMs with Agent Tools

Question->Thought (iterative)->Action (uses tool)->Action Input->Observation (Final Answer)

In [1]:
%pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py): started
  Building wheel for wikipedia (setup.py): finished with status 'done'
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=7cc63fba144dd5dd5aa01399c9d76ded2136ff42616abd2e4903677e3cc4b89b
  Stored in directory: c:\users\achen\appdata\local\pip\cache\wheels\a8\ca\f6\a3c8e5e97ce0a0beb22201fb53c8455979ea2ee676c95c9b8b
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
from langchain.agents import initialize_agent, AgentType, load_tools
from langchain.chat_models import ChatOpenAI

llm=ChatOpenAI()
tools=load_tools(['wikipedia', 'llm-math'], llm=llm)

agent=initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    return_intermediate_steps=True,
    verbose=True,
)

In [None]:
from langchain.callbacks import StdOutCallbackHandler
question=""
handler=StdOutCallbackHandler()
response=agent(
    {"input":question},
    callbacks=[handler]
)

#can filter with indices: ['input', 'output', 'intermediate_steps']
response

Custom Tools for agents

In [None]:
from langchain.document_loaders import PyPDFLoader

file_path=''

loader=PyPDFLoader(file_path=file_path)
data=loader.load_and_split()

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings=OpenAIEmbeddings()
docsearch=Chroma.from_documents(
    data,
    embeddings,
    collection_name="statistical_learning"
)

In [None]:
from langchain.chains import RetrievalQA 
llm=ChatOpenAI()
chain=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever()
)

chain.run('question')

In [None]:
from langchain.agents import Tool

description="""
Answer questions about ML. 
Input should be a fully formed question
"""

retrieval_tool=Tool(
    name="ML Knowledge",
    func=chain.run,
    description=description,
)

In [None]:
tools=load_tools(['wikipedia', 'llm-math'], llm=llm)

tools.append(retrieval_tool)

agent=initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    return_intermediate_steps=True,
)

In [None]:
question=''
response=agent(
    {"input":question},
    callbacks=[handler]
)

# **LLM OPs**

In [15]:
import utils
import inspect
#dir(utils)
#print(inspect.getsource(utils.authenticate))

In [18]:
import os
from dotenv import load_dotenv
import random
import json
import base64
import pandas as pd
from pprint import pprint
from sklearn.metrics import train_test_split
from datetime import datetime
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials

#needs google api subscription
def authenticate():
    return "DLAI_CREDENTIALS", "DLAI_PROJECT_ID"
    #Load .env
    load_dotenv()
    
    #Decode key and store in .JSON
    SERVICE_ACCOUNT_KEY_STRING_B64 = os.getenv('SERVICE_ACCOUNT_KEY')
    SERVICE_ACCOUNT_KEY_BYTES_B64 = SERVICE_ACCOUNT_KEY_STRING_B64.encode("ascii")
    SERVICE_ACCOUNT_KEY_STRING_BYTES = base64.b64decode(SERVICE_ACCOUNT_KEY_BYTES_B64)
    SERVICE_ACCOUNT_KEY_STRING = SERVICE_ACCOUNT_KEY_STRING_BYTES.decode("ascii")

    SERVICE_ACCOUNT_KEY = json.loads(SERVICE_ACCOUNT_KEY_STRING)


    # Create credentials based on key from service account
    # Make sure your account has the roles listed in the Google Cloud Setup section
    credentials = Credentials.from_service_account_info(
        SERVICE_ACCOUNT_KEY,
        scopes=['https://www.googleapis.com/auth/cloud-platform'])

    if credentials.expired:
        credentials.refresh(Request())
    
    #Set project ID according to environment variable    
    PROJECT_ID = os.getenv('PROJECT_ID')
        
    return credentials, PROJECT_ID

In [19]:
credentials, PROJECT_ID=authenticate()

REGION="us-central1"

In [20]:
import vertexai
from vertexai.language_models import TextGenerationModel 
vertexai.init(project=PROJECT_ID,
              location=REGION,
              credentials=credentials)

In [None]:
from google.cloud import bigquery

bq_client=bigquery.Client(project=PROJECT_ID, credentials=credentials)

In [None]:
#stackoverflow dataset

QUERY_TABLES="""
SELECT table_name
FROM bigquery-public-data.stackoverflow.INFORMATION_SCHEMA.TABLES
"""

query_job=bq_client.query(QUERY_TABLES)

for row in query_job:
    for value in row.values():
        print(value)

In [None]:
#Data Retrieval

INSPECT_QUERY="""
SELECT *
FROM 'bigquery-public-data.stackoverflow.posts_questions' 
LIMIT 3
"""

query_job=bq_client.query(INSPECT_QUERY)

In [None]:
#transform results of query itno arrow table to then put into pandas df
stack_overflow_df = query_job\
    .result()\
    .to_arrow()\
    .to_pandas()
stack_overflow_df.head()

In [None]:
#dealing with large datasets for LLMs

QUERY_ALL=""" 
SELECT *
FROM 'bigquery-public-data.stackoverflow.posts_questions' q
"""

query_job=bq_client.query(QUERY_ALL)

try:
    stack_overflow_df = query_job\
    .result()\
    .to_arrow()\
    .to_pandas()
except Exception as e:
     print('The DataFrame is too large to load into memory.', e)

- When working with (large) data, query optimizing is needed in order to save time and resources.
- Select questions as `input_text` (column 1), answers as `output_text` (column 2).
- Take the questions from `posts_questions` and answers from `posts_answers`.
- Join the questions and their corresponding accepted answers based on their same `unique ID`.
- Making sure the question is about `Python`, and that it `has an answer`. And the date the question was posted is on or after `2020-01-01`
- Limit as 10,000

In [None]:
#query optimization

QUERY=""" 
SELECT CONCAT(q.title, q.body) as input_text, a.body AS output_text
FROM 'bigquery-public-data.stackoverflow.posts_questions' q
JOIN 'bigquery-public-data.stackoverflow.posts_answers' a
ON q.accepted_answer_id=a.id
WHERE q.accepted_answer_id IS NOT NULL AND
    REGEXP_CONTAINS(q.tags, "python") AND 
    a.creation_date >= "2020-01-01"
LIMIT 10000
"""

query_job=bq_client.query(QUERY)

stack_overflow_df = query_job.result()\
                        .to_arrow()\
                        .to_pandas()

Adding instructions improves model performance and generalization to unseen tasks https://arxiv.org/pdf/2210.11416

In [None]:
INSTRUCTION_TEMPLATE = f"""\
Please answer the following Stackoverflow question on Python. \
Answer it like you are a developer answering Stackoverflow questions.

Stackoverflow question:
"""

In [None]:
#adding instruction template to the original input text (questions)
stack_overflow_df['input_text_instruct'] = INSTRUCTION_TEMPLATE + ' '\
    + stack_overflow_df['input_text']

In [None]:
#dataset preparation
train, test=train_test_split(stack_overflow_df, test_size=0.2, random_state=42)

#version controlling
date=datetime.now().strftime("%H:%d:%m:%Y")
cols=['input_text_instruct','output_text']
tune_jsonl=train[cols].to_json(orient="records", lines=True)
training_data_filename=f"train_data_stack_overflow_\
                        python_qa-{date},jsonl"
with open(training_data_filename, "w") as f:
    f.write(tune_jsonl)

tune_jsonl=test[cols].to_json(orient="records", lines=True)
testing_data_filename=f"test_data_stack_overflow_\
                        python_qa-{date},jsonl"
with open(testing_data_filename, "w") as f:
    f.write(tune_jsonl)

Automation with Pipelines

In [24]:
#kubeflow pipelines (best practice to use .output format as inputs from each returned object, including after the return statement)
from kfp import dsl, compiler 
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore",
                        category=FutureWarning,
                        module='kfp.*')

Components & Pipeline

In [None]:
#must specify keyword arguments

@dsl.component 
def preprocessing(datapath: str):
    #preprocessing simplified example
    input_df=pd.read_csv(datapath)
    new_df=input_df.fillna(input_df.mean(), axis=1)
    labels=new_df.iloc[:,-1]
    X_train, X_test, y_train, y_test= train_test_split(new_df, labels, test_size=0.2, random_state=42)
    X_train.to_csv('/path/to/X_train.csv', index=False)
    X_test.to_csv('/path/to/X_test.csv', index=False)
    y_train.to_csv('/path/to/y_train.csv', index=False)
    y_test.to_csv('/path/to/y_test.csv', index=False)
    return dsl.ContainerOp(
        name='pp_task',
        image='...',
        command=['...'],
        file_outputs={
            'X_train': '/path/to/X_train',
            'X_test': '/path/to/X_test',
            'y_train': '/path/to/y_train',
            'y_test': '/path/to/y_test',
        }
    )

@dsl.component
def training_step(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: np.array, y_test: np.array):
    #training
    trained_model=model.fit(X_train, y_train)
    #base_model.save('/path/to/pretrained_model.h5')
    val_pred=model.predict(X_test)
    AUC_score=roc_auc_score(y_test,val_pred)

    with open('/path/to/AUC_result.txt', 'w') as f:
        f.write(str(AUC_score))

    return dsl.ContainerOp(
        name='trained_res',
        image='...',
        command=['...'],
        arguments=['--X_train', X_train, '--X_test', X_test, '--Y_train', y_train, '--Y_test', y_test],
        file_outputs={
            'AUC_score':'/path/to/AUC_result.txt',
            #'pretrained_model':'/path/to/pretrained_model.h5'
        }
    )

@dsl.pipeline
def llm_datapipeline(raw_data_path: str):
    pp_task1=preprocessing(raw_data_path)
    model_score_task2=training_step(X_train: pp_task1.outputs['X_train'], X_test: pp_task1.outputs['X_test'], y_train: pp_task1.outputs['y_train'], y_test: pp_task1.outputs['y_test'])
    return model_score_task2.output['AUC'] #change to outputs if model included


In [None]:
#compile the pipeline

compiler.Compiler().compile(llm_datapipeline, 'llm_datapipeline.yaml')
pipeline_arguments = {
    "raw_data": "/path/to/raw_data",
}

In [None]:
#more in depth
template_path = 'https://us-kfp.pkg.dev/ml-pipeline/\
large-language-model-pipelines/tune-large-model/v2.0.0'

pipeline_arguments = {
    "model_display_name": MODEL_NAME,
    "location": REGION,
    "large_model_reference": "text-bison@001",
    "project": PROJECT_ID,
    "train_steps": TRAINING_STEPS,
    "dataset_uri": TRAINING_DATA_URI,
    "evaluation_interval": EVALUATION_INTERVAL,
    "evaluation_data_uri": EVAUATION_DATA_URI,
}

to view pipeline.yaml file 

In [None]:
#!cat llm_datapipeline.yaml

In [None]:
### import `PipelineJob` 
from google.cloud.aiplatform import PipelineJob

job = PipelineJob(
        ### path of the yaml file to execute
        template_path="pipeline.yaml",
        ### name of the pipeline
        display_name=f"deep_learning_ai_pipeline",
        ### pipeline arguments (inputs)
        ### {"recipient": "World!"} for this example
        parameter_values=pipeline_arguments,
        ### region of execution
        location="us-central1",
        ### root is where temporary files are being 
        ### stored by the execution engine
        pipeline_root="./",
)

### submit for execution
job.submit()

### check to see the status of the job
job.state

```Python
pipeline_root "./"

job = PipelineJob(
        ### path of the yaml file to execute
        template_path=template_path,
        ### name of the pipeline
        display_name=f"deep_learning_ai_pipeline-{date}",
        ### pipeline arguments (inputs)
        parameter_values=pipeline_arguments,
        ### region of execution
        location=REGION,
        ### root is where temporary files are being 
        ### stored by the execution engine
        pipeline_root=pipeline_root,
        ### enable_caching=True will save the outputs 
        ### of components for re-use, and will only re-run those
        ### components for which the code or data has changed.
        enable_caching=True,
)

### submit for execution
job.submit()

### check to see the status of the job
job.state
```

**Deployment & Load Balancing**

In [None]:
model=TextGenerationModel.from_pretrained("text-bison@001")

#route traffic to different endpoints
list_tuned_models=model.list_tuned_model_names()

#randomly select from one of the endpoints to divide prediction load
tuned_model_select=random.choice(list_tuned_models) 

In [None]:
#getting a response. The prompt needs to be similar to the content model was trained on

deployed_model=TextGenerationModel.get_tuned_model(tuned_model_select)
PROMPT='How can I get the max value in a dictionary?'

response=deployed_model.predict(PROMPT)

#the response is stored in a dictionary format
final_output=response._prediction_response[0][0]["content"]

pprint(final_output)

In [None]:
#prompt management templates 

instruct=""" 
Please answer the following StackOverflow question on Python. \
Answer it like \
you are a principal developer answering StackOverflow questions. \
Question:
"""

QUESTION = "How can I store my TensorFlow checkpoint on\
Google Cloud Storage? Python example?"

Prompt=f"""
{instruct}{QUESTION}
""" 

In [None]:
final_response=deployed_model.predict(PROMPT)
output=final_response._prediction_response[0][0]["content"]

pprint(output)

In [None]:
#obtain safety attributes of response

blocked=final_response._prediction_response[0][0]\
['safetyAttributes']['blocked']

print(blocked)

In [None]:
#citations

citation=response._prediction_response[0][0]\
['citationMetadata']['citations']

pprint(citation)

Tuning & Evaluating (BLEU or ROUGE score)

https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/tuning/tuning_text_bison.ipynb