In [1]:
import os 
import openai
%pip install -U openai==0.28.1 

from langchain.chat_models import ChatOpenAI
#instantiate a chat
llm = ChatOpenAI()
#llm.predict('How are you?')

#check version 
openai.__version__
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

api_key=os.getenv('OPENAI_API_KEY') #set as an environmental variable
#openai.api_key=('personal key')
#Load credits

In [2]:
#pip install langchain==0.0.331 --ignore-installed PyYAML
import langchain 
langchain.__version__
from langchain.document_loaders import PyPDFLoader
from langchain.chains import ConversationChain 

#prompt a new chained conversation with the LLM
chat_model=ChatOpenAI()

chain=ConversationChain(
    llm=chat_model,
    verbose=True
)

chain.run("How are you today?")

chain.run("What was my current conversation?")

Prompt Templates

In [None]:
from langchain.prompts import PromptTemplate

template="""
Return all the subcategories of the following category

{category}
"""

prompt= PromptTemplate(
    input_variables=["category"],
    template=template
)

prompt

In [None]:
from langchain.chains import LLMChain 
from langchain.prompts import (
    SystemMessagePromptTemplate, 
    HumanMessagePromptTemplate, 
    ChatPromptTemplate
)

system_template = """
You are a helpful assistant who generates comma separated lists.
The user will only pass a category and you should generate subcategories
ONLY return comma separated and nothing more!
"""

human_template='{category}'

system_message=SystemMessagePromptTemplate.from_template(system_template)

human_message=HumanMessagePromptTemplate.from_template(
    human_template
)

prompt=ChatPromptTemplate.from_messages([
    system_message, human_message
])

chain=LLMChain(
    llm=chat_model,
    prompt=prompt,
    verbose=True
)

chain.run("Machine Learning")

Output parser

In [None]:
from langchain.schema import BaseOutputParser

class CommaSeparatedParser(BaseOutputParser):
    def parse(self,text):
        #remove whitespaces, and split strings with commas inbetween
        output=text.strip().split(',')
        output=[o.strip() for o in output]
        return output

#the output should be a more clean list
chain=LLMChain(
    llm=chat_model,
    prompt=prompt,
    output_parser=CommaSeparatedParser(),
    verbose=True
)

input_list=[
    {'category':'food'},
    {'category':'country'},
    {'category':'colors'}
]

#chained response of categories option
response=chain.apply(input_list)

Simple Sequence

In [None]:
title_template="""
You are a writer. Given a subject, your job is to return a fun title for a play

Subject: {subject}
Title:"""

title_chain=LLMChain.from_string(
    llm=chat_model,
    template=title_template
)

title_chain.run("Machine Learning")

In [None]:
synopsis_template="""
You are a writer.
Given a title, write a synopsis for a play.

Title: {title}
Synopsis:
"""

synopsis_chain=LLMChain.from_string(
    llm=chat_model,
    template=synopsis_template
)

#input the outputs of the original title into the synposis chain
title="Generated Title"

synopsis_chain.run(title)

In [None]:
#Sequential chain version
from langchain.chains import SimpleSequentialChain

chain=SimpleSequentialChain(
    chains=[title_chain, synopsis_chain],
    verbose=True
)

chain.run("Machine Learning")


Summarizing

In [None]:
print(chain.refine_llm_chain.prompt.template)

initial_template="""
Extract the most relevant themes from the following:

"{text}"

THEMES:"""

refine_template="""
Your job is to extract the most relevant themes
We have provided an existing list of themes up to a certain point: {existing_answer}
We have the opportunity to refine the existing list (only if needed) with some context below
------------
{text}
------------
Given the new context, refine the original list
If the context isn't useful, return the original list and ONLY the original list.
Return that list as a comma separated list.

LIST:"""

initial_prompt=PromptTemplate.from_template(initial_template)
refine_prompt=PromptTemplate.from_template(refine_template)

In [None]:
#summarizing data sources
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI 

llm=ChatOpenAI()
chain=load_summarize_chain(
    llm=llm,
    chain_type="refine", #use "map_reduce" for more complex documents
    question_prompt=initial_prompt,
    refine_prompt=refine_prompt,
    verbose=True
)

%pip install langchain openai tqdm jq unstructured pypdf tiktoken 

from langchain.document_loaders import (
    UnstructuredCSVLoader,
    UnstructuredHTMLLoader,
    UnstructuredImageLoader,
    PythonLoader,
    PyPDFLoader,
    JSONLoader
)

from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import DirectoryLoader 

#file_path='csv file path'
csv_loader=CSVLoader(file_path=file_path)
compiled_data=csv_loader.load()
compiled_data[0].page_content 

In [12]:
#load documents
file_path="/Users/anthonychen/Desktop/Big data and social science a practical guide to methods and tools by Ian Foster, Rayid Ghani, Ron S. Jarmin, Frauke Kreuter, Julia Lane.pdf"
sl_loader=PyPDFLoader(file_path)
#split into chunks for LLM processing
data_chunks=loader.load_and_split() #uses recursive character text splitter

In [None]:
#map reduce strategy for large amounts of data for LLM to process
#or refine chunks with separate summaries & combine at the end

from langchain.text_splitter import(
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter
)

splitter1=CharacterTextSplitter(
    chunk_size=1000, #1k characters
    chunk_overlap=0, 
)

#more chunks than previous:
splitter2=RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)

s1_data1=sl_loader.load_and_split(text_splitter=splitter1)
s1_data2=sl_loader.load_and_split(text_splitter=splitter2)


In [None]:
#summary works better on csv file (when unrefined)
#chain.run(compiled_data[:5])
chain.run(sl_data1[:20])

In [None]:
#folder_path=""
mixed_loader=DirectoryLoader(
    path=folder_path,
    use_multithreading=True,
    show_progress=True
)

mixed_data=mixed_loader.load_and_split()

In [13]:
pages=loader.load()

In [14]:
len(pages)

377

In [17]:
page=pages[200]
print(page.page_content[:500])

180 6. M achine Learning
the expected value of the predictions of a classiﬁer and select t he
model that optimizes this cost-sensitive metric.
6.7 Practical tips
Here we highlight some practical tips that will be helpful when w ork-
ing with machine learning methods.
6.7.1 Features
So far in this chapter, we have focused a lot on methods and pro-
ces
s, and we have not discussed features in detail. In social science,
they are not called features but instead are known as variables or
predictors. 


In [19]:
page.metadata

{'source': '/Users/anthonychen/Desktop/Big data and social science a practical guide to methods and tools by Ian Foster, Rayid Ghani, Ron S. Jarmin, Frauke Kreuter, Julia Lane.pdf',
 'page': 200}

In [3]:
#Youtube
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [4]:
#brew install ffprobe and ffmpeg; youtube set rate limits
url="https://www.youtube.com/watch?v=Ffl8b_GfJ-M&ab_channel=TobiasFischer"
save_dir="/Users/anthonychen/Documents/Youtube/"
loader=GenericLoader(YoutubeAudioLoader([url],save_dir), OpenAIWhisperParser())
docs=loader.load()

[youtube] Extracting URL: https://www.youtube.com/watch?v=Ffl8b_GfJ-M&ab_channel=TobiasFischer
[youtube] Ffl8b_GfJ-M: Downloading webpage
[youtube] Ffl8b_GfJ-M: Downloading ios player API JSON
[youtube] Ffl8b_GfJ-M: Downloading android player API JSON
[youtube] Ffl8b_GfJ-M: Downloading m3u8 information
[info] Ffl8b_GfJ-M: Downloading 1 format(s): 140
[download] /Users/anthonychen/Documents/Youtube//Stable Diffusion Consistent Character Animation Technique - Tutorial.m4a has already been downloaded
[download] 100% of   31.65MiB
[ExtractAudio] Not converting audio /Users/anthonychen/Documents/Youtube//Stable Diffusion Consistent Character Animation Technique - Tutorial.m4a; file is already in target format m4a
Transcribing part 1!


RateLimitError: You exceeded your current quota, please check your plan and billing details.

In [6]:
#website chatting
from langchain.document_loaders import WebBaseLoader
loader=WebBaseLoader("https://adriangcoder.medium.com/pandas-tricks-and-tips-a7b87c3748ea")
docs=loader.load()

In [7]:
print(docs[0].page_content[:500]) #need to do postprocessing on info to get workable format

Pandas for time series data — tricks and tips | by Adrian G | MediumPandas for time series data — tricks and tipsAdrian G·Follow7 min read·Oct 24, 2018--2ListenShareThere are some Pandas DataFrame manipulations that I keep looking up how to do. I am recording these here to save myself time. These may help you too.Time series dataConvert column to datetime with given formatdf[‘day_time’] = pd.to_datetime(df[‘day_time’], format=’%Y-%m-%d %H:%M:%S’)0 2012–10–12 00:00:001 2012–10–12 00:30:002 2012–1


In [8]:
#notion databases
from langchain.document_loaders import NotionDirectoryLoader
loader=NotionDirectoryLoader("docs/Notion_DB")
docs=loader.load()

In [9]:
print(docs[0].page_content[0:200])

IndexError: list index out of range

**Few-Shot Learning**

As you feed the model 10 or more examples, the accuracy improves drastically

In [None]:
from langchain.prompts import (
    FewShotChatMessagePromptTemplate,
    ChatPromptTemplate,
)

#insert a few examples
examples=[
    {"input":"" , "output":""},
    {"input":"" , "output":""},
    {"input":"" , "output":""}
]

example_prompt=ChatPromptTemplate.from_messages(
    [
        ("human","{input}"),
        ("ai","{output}")
    ]
)

few_shot_prompt=FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

print(few_shot_prompt.format())


In [None]:
final_prompt=ChatPromptTemplate.from_messages(
    [
        ("system", "You are wonderous wizard of math."),
        few_shot_prompt,
        ("human", "{input}"),
    ]
)

#Input Question
print(final_prompt.format(input=''))

In [None]:
chain=LLMChain(
    llm=chat_model,
    prompt=final_prompt,
    verbose=True
)

chain.run()

**Memetic Proxy**

In [None]:
from langchain.prompts import PromptTemplate

template="""
System: (reference)
Provide a helpful answer to the following question:

Human: {question}

AI:
"""

prompt=PromptTemplate.from_template(template)

chain=LLMChain(
    llm=chat_model,
    prompt=prompt,
    verbose=True
)

high_level="Imagine you are a Professor teaching at the PhD level"
lower_level="Imagine you are a kindergarten teacher"

question="Explain Quantum Mechanics"

chain.run(
    {
        'question':question,
        'reference':high_level
    }
)


Indexing Data

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings=OpenAIEmbeddings(show_progress_bar=True)

vector1=embeddings.embed_query('How are you?')

#embeddings.__dict__

In [None]:
import numpy as np
from numpy.linalg import norm 

def get_cosine(vec1, vec2):
    return np.dot(vec1,vec2)/(norm(vec1)*norm(vec2))

vector1=embeddings.embed_query('machine learning')
vector2=embeddings.embed_query('artificial intelligence')
cosine_sim=get_cosine(vector1, vector2)


In [None]:
from langchain.vectorstores import FAISS 

#choose document data to load
index=FAISS.from_documents(data, embeddings)

index.similarity_search_with_relevance_scores("What is machine learning?")

Vector Database retrieval

Question converted to embedding, then search within index (created with Pinecone) of the vector database to get nearest neighbors, pass them into a prompt, provide prompt to LLM and get answer to user

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

retriever=index.as_retriever()
#number of vectors to retrieve
retriever.search_kwargs['fetch_k']=20
#diversify the information provided to LLM
retriever.search_kwargs['maximal_marginal_relevance']=True
#final number of data context vectors provided
retriever.search_kwargs['k']=10

llm=ChatOpenAI()

chain=RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True
)

chain.run("What is machine learning?",
          callbacks=[handler])

Load data into vector database

In [None]:
%pip install pinecone-client

In [None]:
import pinecone
from langchain.vectorstores import Pinecone

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

#name originally input in pinecone
index_name=''
db=Pinecone.from_documents(
    data, #context provided from document loader
    embeddings,
    index_name=index_name
)

chain=RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True
)

chain.run("What is machine learning?",
          callbacks=[handler])

Display sources for LLM output

In [None]:
%pip install newsapi-python

In [None]:
from datetime import date, timedelta
from newsapi import NewsApiClient

newsapi=NewsApiClient(api_key=NEWS_API_KEY)
today=date.today()
last_week=today-timedelta(days=7)

#dictionary of responses 
latest_news=newsapi.get_everything(
    q='artifical intelligence',
    from_param=last_week.strftime('%Y-%m-%d'),
    to=today.strftime('%Y-%m-%d'),
    sort_by='relevancy',
    language='en'
)

In [None]:
from langchain.docstore.document import Document 
docs=[
    Document( 
    page_content=article['title']+'\n\n'+article['description'],
    metadata={
        'source':article['url'],
    }
        , ) for article in latest_news['articles']
]

In [None]:
from langchain.chains import create_qa_with_sources_chain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

qa_chain=create_qa_with_sources_chain(llm)

doc_prompt=PromptTemplate(
    template='Content: {page_content}\nSource:{source}',
    input_variables=['page_content','source'],
)

final_qa_chain=StuffDocumentsChain(
    llm_chain=qa_chain,
    document_variable_name="context",
    document_prompt=doc_prompt,
)

index=FAISS.from_documents(docs, )

chain=RetrievalQA(
    retriever=index.as_retriever(),
    combine_documents_chain=final_qa_chain
)



In [None]:
question="""What is the most important news about artificial intelligence in the last week?"""

answer=chain.run(question)

print(answer)

Indexing from a website

In [None]:
%pip install apify-client chromadb #webcrawler & local vector db 

In [None]:
from langchain.utilities import ApifyWrapper
from langchain.document_loeaders.base import Document 

apify=ApifyWrapper()

loader=apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input={
        "startUrls":[{"url":""}], #insert url 
        "aggressivePrune":True,
    },
    dataset_mapping_function=lambda item: Document(
        page_content=item['text'] or "", metadata={"source":item['url']}
    ),
)

In [None]:
from langchain.indexes import VectorstoreIndexCreator

text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0
)

index=VectorstoreIndexCreator(
    text_splitter=text_splitter
).from_loaders([loader])

index

In [None]:
query="What is the main subject of this ..."

index.query_with_sources(query)

In [None]:
#retriever=index.vectorstore.as_retriever()

Indexing GitHub Repo

In [None]:
from langchain.document_loaders import GitLoader 

loader=GitLoader(
    clone_url="https://github.com/langchain-ai/langchain",
    repo_path="./data/repo/",
    file_filter=lambda file_path: file_path.endswith(".py"),
    branch='master',
)

documents=loader.load()

In [None]:
from langchain.text_splitter import Language

python_splitter=RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=1000,
    chunk_overlap=200,
)

documents=python_splitter.split_documents(documents)

In [None]:
index=FAISS.from_documents(documents, embeddings)
retriever=index.as_retriever()

#distance metric
retriever.search_kwargs['distance_metric']='cos'
#number of vectors to retrieve
retriever.search_kwargs['fetch_k']=200
#diversify the information provided to LLM
retriever.search_kwargs['maximal_marginal_relevance']=True
#final number of data context vectors provided
retriever.search_kwargs['k']=10

qa=RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever 
)

Stuff Chain

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

index=Chroma.from_documents(
    docs,
    embeddings=OpenAIEmbeddings()
)

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import StdOutCallbackHandler

llm=ChatOpenAI()

#map_rerank returns answer with highest score
chain=RetrievalQA.from_chain_type(
    llm=llm,
    retriever=index.as_retriever(),
    chain_type='stuff', #map-reduce, refine,map_rerank
    verbose=True
)

chain.run(
    '?', #insert Question
    callbacks=[StdOutcallbackHandler()]
)