In [11]:
import os
import openai
from dotenv import load_dotenv
load_dotenv('key.env')
openai.api_key = os.getenv('OPENAI_API_KEY')

## Testing that Open AI request work

In [18]:
#completion = openai.ChatCompletion.create(
#  model="gpt-3.5-turbo",
#  messages=[
#    {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
#    {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
#  ]
#)

#print(completion.choices[0].message)

## Langchain: Getting Started

https://python.langchain.com/docs/get_started/quickstart

In [21]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

llm = OpenAI(openai_api_key=openai.api_key)
chat_model = ChatOpenAI(openai_api_key=openai.api_key)

In [23]:
from langchain.schema import HumanMessage

text = "What would be a good company name for a company that makes colorful socks?"
messages = [HumanMessage(content=text)]

llm.invoke(text)
# >> Feetful of Fun

chat_model.invoke(messages)
# >> AIMessage(content="Socks O'Color")

AIMessage(content='VividSock Co.')

In [24]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate.from_template("What is a good name for a company that makes {product}?")
prompt.format(product="colorful socks")

'What is a good name for a company that makes colorful socks?'

In [27]:
from langchain.schema import BaseOutputParser

class CommaSeparatedListOutputParser(BaseOutputParser):
    """Parse the output of an LLM call to a comma-separated list."""


    def parse(self, text: str):
        """Parse the output of an LLM call."""
        return text.strip().split(", ")

CommaSeparatedListOutputParser().parse("hi, bye")

['hi', 'bye']

We can now combine all these into one chain. This chain will take input variables, pass those to a prompt template to create a prompt, pass the prompt to a language model, and then pass the output through an (optional) output parser. This is a convenient way to bundle up a modular piece of logic. Let's see it in action!

In [28]:
from typing import List

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import BaseOutputParser

class CommaSeparatedListOutputParser(BaseOutputParser[List[str]]):
    """Parse the output of an LLM call to a comma-separated list."""


    def parse(self, text: str) -> List[str]:
        """Parse the output of an LLM call."""
        return text.strip().split(", ")

template = """You are a helpful assistant who generates comma separated lists.
A user will pass in a category, and you should generate 5 objects in that category in a comma separated list.
ONLY return a comma separated list, and nothing more."""
human_template = "{text}"

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", human_template),
])
chain = chat_prompt | ChatOpenAI() | CommaSeparatedListOutputParser()
chain.invoke({"text": "colors"})
# >> ['red', 'blue', 'green', 'yellow', 'orange']

['red', 'blue', 'green', 'yellow', 'orange']

## Scraping data using Langchain 

In [17]:
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import BeautifulSoupTransformer

# Load HTML
loader = AsyncChromiumLoader(["https://www.wsj.com"])
html = loader.load()

RuntimeError: asyncio.run() cannot be called from a running event loop

In [6]:
from langchain.document_loaders import AsyncHtmlLoader

urls = ["https://www.espn.com", "https://lilianweng.github.io/posts/2023-06-23-agent/"]
loader = AsyncHtmlLoader(urls)
docs = loader.load()

Fetching pages: 100%|#############################| 2/2 [00:00<00:00, 14.97it/s]


In [8]:
from langchain.document_loaders import AsyncHtmlLoader

urls = ["https://www.espn.com", "https://lilianweng.github.io/posts/2023-06-23-agent/"]
loader = AsyncHtmlLoader(urls)
docs = loader.load()

Fetching pages: 100%|#############################| 2/2 [00:00<00:00, 28.50it/s]


In [10]:
from langchain.document_transformers import Html2TextTransformer

html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)
docs_transformed[0].page_content[0:500]

'Skip to main content  Skip to navigation\n\n<\n\n>\n\nMenu\n\n## ESPN\n\n  *   *   * scores\n\n  * NFL\n  * MLB\n  * NCAAF\n  * NBA\n  * NHL\n  * Soccer\n  * …\n\n    * NCAAM\n    * NCAAW\n    * Sports Betting\n    * Boxing\n    * CFL\n    * NCAA\n    * Cricket\n    * F1\n    * Golf\n    * Horse\n    * LLWS\n    * MMA\n    * NASCAR\n    * NBA G League\n    * Olympic Sports\n    * PLL\n    * Racing\n    * RN BB\n    * RN FB\n    * Rugby\n    * Tennis\n    * WNBA\n    * WWE\n    * X Games\n    * XFL\n\n  * More ESPN\n  * Fantasy\n  * Listen\n  *'

## Let's build a tool that scrapes data from Ssense and gives us information about the current products available

In [162]:
url = 'https://www.ssense.com/en-us/men/shoes'
isabel_marant_shoes_url = 'https://www.ssense.com/en-us/men/product/isabel-marant/white-and-navy-alseeh-high-sneakers/14760201'

In [184]:
from langchain.document_loaders import WebBaseLoader

## let's get all the URLs from Ssense: https://www.ssense.com/sitemap.xml
file = open("./Ssense/ssense_urls_dump1.txt", "r")
content = file.read()
len(content.split('https'))

lst_urls_dump1 = []

for i in content.split('https')[1:]:
    lst_urls_dump1.append('https'+i)

data_all = []

for url in lst_urls_dump1:
    loader = WebBaseLoader(url, header_template={
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
      })
    data = loader.load()

In [166]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader(isabel_marant_shoes_url, header_template={
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
  })
data = loader.load()

In [136]:
#from langchain.document_transformers import Html2TextTransformer

#html2text = Html2TextTransformer()
#docs_transformed = html2text.transform_documents(data)
#docs_transformed[0].page_content[0:500]

In [126]:
#bs_transformer = BeautifulSoupTransformer()
#docs_transformed = bs_transformer.transform_documents(
#data)

In [168]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(data)

In [169]:
# Embed and store splits

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

## Call LLM model

In [170]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema import BaseOutputParser

In [171]:
## Prompt
# https://smith.langchain.com/hub/rlm/rag-prompt

from langchain import hub

rag_prompt = hub.pull("rlm/rag-prompt")

In [172]:
rag_prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [173]:
# LLM
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [174]:
# RAG chain
from langchain.schema.runnable import RunnablePassthrough
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | rag_prompt | llm

In [179]:
#rag_chain.invoke("Give me a list of Isabel Marant shoes.")

In [180]:
#rag_chain.invoke("Are the Isabel Marant: Wh#ite & Navy Alseeh High Sneakers available in size 45? If yes, please give me the price.")

In [113]:
#template = """You are a helpful assistant who generates comma separated lists.
#A user will pass in a category, and you should generate 5 objects in that category in a comma separated list.
#ONLY return a comma separated list, and nothing more."""
#human_template = "{text}"

#chat_prompt = ChatPromptTemplate.from_messages([
#    ("system", template),
#    ("human", human_template),
#])

#chain = chat_prompt | ChatOpenAI() | CommaSeparatedListOutputParser()
#chain.invoke({"text": "colors"})