In [3]:
import os
import openai
from dotenv import load_dotenv
# Load the local 'key.env' file first, then read OPENAI_API_KEY back from the
# environment; the value is None when the variable is not set.
load_dotenv('key.env')
openai.api_key = os.environ.get('OPENAI_API_KEY')

## Testing that OpenAI requests work

In [2]:
#completion = openai.ChatCompletion.create(
#  model="gpt-3.5-turbo",
#  messages=[
#    {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
#    {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
#  ]
#)

#print(completion.choices[0].message)

## Langchain: Getting Started

https://python.langchain.com/docs/get_started/quickstart

In [3]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

# Build the two model wrappers used in the quickstart cells below; both are
# given the key that was stored on `openai.api_key` in the setup cell.
llm = OpenAI(openai_api_key=openai.api_key)
chat_model = ChatOpenAI(openai_api_key=openai.api_key)

In [5]:
from langchain.schema import HumanMessage

# The same question is sent to both model wrappers.
text = "What would be a good company name for a company that makes colorful socks?"
# The chat model is invoked with a list of messages, here a single HumanMessage.
messages = [HumanMessage(content=text)]

# Not the last expression of the cell, so this return value is not displayed.
llm.invoke(text)
# >> Feetful of Fun

# Last expression of the cell: this return value is what the notebook shows.
chat_model.invoke(messages)
# >> AIMessage(content="Socks O'Color")

AIMessage(content='VividSock Co.')

In [6]:
from langchain.prompts import PromptTemplate

# Template with a single {product} placeholder.
prompt = PromptTemplate.from_template("What is a good name for a company that makes {product}?")
# Fill the placeholder; the formatted string is displayed as the cell output.
prompt.format(product="colorful socks")

'What is a good name for a company that makes colorful socks?'

In [7]:
from langchain.schema import BaseOutputParser

class CommaSeparatedListOutputParser(BaseOutputParser):
    """Parse the output of an LLM call to a comma-separated list."""

    def parse(self, text: str):
        """Split raw model output on commas into a list of trimmed items.

        Args:
            text: Raw string returned by the model, e.g. ``"hi, bye"``.

        Returns:
            The non-empty items in their original order, each stripped of
            surrounding whitespace. Empty or whitespace-only input gives ``[]``.
        """
        # Split on the bare comma and trim each piece so that "a,b", "a, b"
        # and "a ,  b" all produce the same items; pieces left empty by stray
        # or trailing commas are dropped.
        pieces = (piece.strip() for piece in text.split(","))
        return [piece for piece in pieces if piece]

# Quick sanity check of the parser on a literal string.
CommaSeparatedListOutputParser().parse("hi, bye")

['hi', 'bye']

We can now combine all these into one chain. This chain will take input variables, pass those to a prompt template to create a prompt, pass the prompt to a language model, and then pass the output through an (optional) output parser. This is a convenient way to bundle up a modular piece of logic. Let's see it in action!

In [8]:
from typing import List

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import BaseOutputParser

class CommaSeparatedListOutputParser(BaseOutputParser[List[str]]):
    """Parse the output of an LLM call to a comma-separated list."""

    def parse(self, text: str) -> List[str]:
        """Split raw model output on commas into a list of trimmed items.

        Args:
            text: Raw string returned by the model, e.g. ``"red, blue,green"``.

        Returns:
            The non-empty items in their original order, each stripped of
            surrounding whitespace. Empty or whitespace-only input gives ``[]``.
        """
        # Split on the bare comma and trim each piece so that inconsistent
        # spacing after commas does not glue two items together; pieces left
        # empty by stray or trailing commas are dropped.
        pieces = (piece.strip() for piece in text.split(","))
        return [piece for piece in pieces if piece]

# System message: instructs the model to answer with a comma separated list only.
template = """You are a helpful assistant who generates comma separated lists.
A user will pass in a category, and you should generate 5 objects in that category in a comma separated list.
ONLY return a comma separated list, and nothing more."""
# Human message is just the caller-supplied {text} placeholder.
human_template = "{text}"

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", human_template),
])
# Pipe prompt -> chat model -> parser, then run it with "colors" as the category.
chain = chat_prompt | ChatOpenAI() | CommaSeparatedListOutputParser()
chain.invoke({"text": "colors"})
# >> ['red', 'blue', 'green', 'yellow', 'orange']

['red', 'blue', 'green', 'yellow', 'orange']

## Scraping data using Langchain 

In [4]:
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import BeautifulSoupTransformer

# Load HTML
# Only the loader object is created here; the actual load call below is
# commented out, so no page is fetched by this cell.
loader = AsyncChromiumLoader(["https://www.wsj.com"])
#html = loader.load()

In [11]:
from langchain.document_loaders import AsyncHtmlLoader

# Fetch two sample pages; `loader` is rebound here, replacing the
# AsyncChromiumLoader created in the previous cell.
urls = ["https://www.espn.com", "https://lilianweng.github.io/posts/2023-06-23-agent/"]
loader = AsyncHtmlLoader(urls)
docs = loader.load()

Fetching pages: 100%|#############################| 2/2 [00:00<00:00, 12.27it/s]


In [12]:
from langchain.document_loaders import AsyncHtmlLoader

# NOTE(review): this cell repeats the previous cell verbatim (same URLs, same
# loader); it re-fetches both pages and rebinds `urls`, `loader` and `docs`.
urls = ["https://www.espn.com", "https://lilianweng.github.io/posts/2023-06-23-agent/"]
loader = AsyncHtmlLoader(urls)
docs = loader.load()

Fetching pages: 100%|#############################| 2/2 [00:00<00:00, 18.75it/s]


In [13]:
from langchain.document_transformers import Html2TextTransformer

# Run the fetched documents through the HTML-to-text transformer.
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)
# Preview only the first 500 characters of the first document.
docs_transformed[0].page_content[0:500]

'Skip to main content  Skip to navigation\n\n<\n\n>\n\nMenu\n\n## ESPN\n\n  *   *   * scores\n\n  * NFL\n  * NCAAF\n  * NBA\n  * NHL\n  * MLB\n  * Soccer\n  * …\n\n    * NCAAM\n    * NCAAW\n    * Sports Betting\n    * Boxing\n    * CFL\n    * NCAA\n    * Cricket\n    * F1\n    * Golf\n    * Horse\n    * LLWS\n    * MMA\n    * NASCAR\n    * NBA G League\n    * Olympic Sports\n    * PLL\n    * Racing\n    * RN BB\n    * RN FB\n    * Rugby\n    * Tennis\n    * WNBA\n    * WWE\n    * X Games\n    * XFL\n\n  * More ESPN\n  * Fantasy\n  * Listen\n  *'

## Let's build a tool that scrapes data from Ssense and gives us information about the current products available

In [4]:
# Category listing page and one product page on Ssense.
# NOTE(review): the loader cell further down retypes these same two literals
# in `lst_urls` instead of reusing these names.
url = 'https://www.ssense.com/en-us/men/shoes'
isabel_marant_shoes_url = 'https://www.ssense.com/en-us/men/product/isabel-marant/white-and-navy-alseeh-high-sneakers/14760201'

#### Create the `fashion_website` class

In [119]:
from langchain.document_loaders import WebBaseLoader

## let's get all the URLs from Ssense: https://www.ssense.com/sitemap.xml
# Read the whole dump in one go; the context manager closes the file handle
# even if the read raises.
with open("./Ssense/ssense_urls_dump1.txt", "r") as file:
    content = file.read()

# Cut the dump at every occurrence of 'daily'; str.split already returns the
# list of fragments, so no manual append loop is needed.
# NOTE(review): the fragments are whatever text sits between 'daily' markers,
# not cleaned URLs -- confirm they are usable as-is before passing to a loader.
lst_urls_dump1 = content.split('daily')

# Collector for scraped documents; nothing is appended to it in this cell
# because the scraping loop below is commented out.
data_all = []

#for url in lst_urls_dump1:
    
#    try: 
#        loader = WebBaseLoader(url, header_template={
#              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
#          })
        
#        data = loader.load() 

#        if data[0].metadata['description'] != '404 not found':
            
            
#    except:
     
#   data = loader.load()

In [4]:
from langchain.document_loaders import WebBaseLoader

# Pages to index: the men's shoes listing and one product page. The two
# Village Vanguard URLs are kept commented out.
lst_urls = ['https://www.ssense.com/en-us/men/shoes', 
            'https://www.ssense.com/en-us/men/product/isabel-marant/white-and-navy-alseeh-high-sneakers/14760201',
            #'https://villagevanguard.com',
            #'https://villagevanguard.squadup.com/2022.html'
           ]

# A browser-like User-Agent is supplied through `header_template`.
# presumably the site rejects the default client header -- TODO confirm
loader = WebBaseLoader(lst_urls, header_template={
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
  })
data = loader.load()

In [6]:
from langchain.document_transformers import Html2TextTransformer

# Transform the Ssense documents loaded into `data`. This rebinds `html2text`
# and `docs_transformed`, replacing the ESPN results from the earlier cell.
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(data)
# Preview only the first 500 characters of the first document.
docs_transformed[0].page_content[0:500]
#docs_transformed[0]

'Designer shoes for Men | SSENSE Timing is everything—order early to get it by\nthe holidays. Menswear Womenswear Everything else sale search English\nFrançais日本語中文한국어 login wishlist shopping bag (0) (0) SALE ONLY All categories\nACCESSORIES BAGS CLOTHING SHOES Boat Shoes & Moccasins Boots Espadrilles Lace\nUps & Oxfords Monkstraps Sandals Slippers & Loafers Sneakers All designers\n1017 ALYX 9SM11 by Boris Bidjan Saberi42444 Label GroupA-COLD-WALL*A.P.C.AARON\nESHABRAAcne StudiosADER erroradidas Origin'

In [7]:
# Transform
#bs_transformer = BeautifulSoupTransformer()
#docs_transformed = bs_transformer.transform_documents(
#    data, tags_to_extract=["p", "li", "div", "a"]
#)
#docs_transformed[0].page_content[0:500]

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the transformed documents with chunk_size=500 and no overlap between
# consecutive chunks (chunk_overlap=0).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(docs_transformed)

In [9]:
# Embed and store splits

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Build a Chroma vector store from the splits using OpenAIEmbeddings, then
# expose it as a retriever for the RAG chain.
# NOTE(review): OpenAIEmbeddings() gets no explicit key here; presumably it
# reads OPENAI_API_KEY from the environment -- confirm.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

In [10]:
#vectorstore.get()['documents']

## Call LLM model

In [11]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema import BaseOutputParser

In [12]:
## Prompt
# https://smith.langchain.com/hub/rlm/rag-prompt

from langchain import hub

# Pull the shared "rlm/rag-prompt" prompt from the LangChain hub (link above).
rag_prompt = hub.pull("rlm/rag-prompt")

In [13]:
# LLM
from langchain.chat_models import ChatOpenAI
# Rebinds `llm`: from here on it is a ChatOpenAI chat model (gpt-3.5-turbo,
# temperature=0), no longer the OpenAI completion wrapper created earlier.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [14]:
# RAG chain
from langchain.schema.runnable import RunnablePassthrough
# The input is fanned out into a dict: "context" comes from the retriever and
# "question" from RunnablePassthrough() (presumably the input unchanged --
# confirm); the dict is then piped into the RAG prompt and the chat model.
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | rag_prompt | llm

In [15]:
#rag_chain.invoke('''Is Jason Moran playing at the Village Vanguard on November 26th?''')

In [19]:
# Ask about the indexed Isabel Marant product page; the multi-line string is
# passed to the chain as a single question.
rag_chain.invoke('''Does Isabel Marant sell white and navy sneakers? If yes, provide me the following information: 
- Name of the shoes
- Price of the shoes in USD
- Availability in size 45
- Country where the shoes are made''')

AIMessage(content='Yes, Isabel Marant sells white and navy sneakers called "Alseeh High Sneakers". The price of the shoes is $660 USD. The availability in size 45 is limited to only 1 remaining. The shoes are made in an unspecified country.')

In [20]:
# Ask about a product that is not on either of the two pages in `lst_urls`.
rag_chain.invoke('''Does Ssense sell Nike specicial edition Nike shoes in blue and white?''')

AIMessage(content="I don't know if Ssense sells Nike special edition Nike shoes in blue and white.")

In [18]:
#rag_chain.invoke('''Does Isabel Marant sell white and navy sneakers? If yes, provide me the following information: 
#- Name of the shoes
#- Price of the shoes in USD
#- Availability in size 45
#- Country where the shoes are made''')

In [113]:
#template = """You are a helpful assistant who generates comma separated lists.
#A user will pass in a category, and you should generate 5 objects in that category in a comma separated list.
#ONLY return a comma separated list, and nothing more."""
#human_template = "{text}"

#chat_prompt = ChatPromptTemplate.from_messages([
#    ("system", template),
#    ("human", human_template),
#])

#chain = chat_prompt | ChatOpenAI() | CommaSeparatedListOutputParser()
#chain.invoke({"text": "colors"})

## Optimize model

Search for the right knowledge page and then train GPT with the new knowledge.

## Create fashion class: to start with, we will have a separate GPT trained on each website's data

In [27]:
#requests.get('https://www.ssense.com/en-us/men/shoes').text

In [18]:
import xmltodict
import requests

headers = {'User-Agent': 'Mozilla/5.0'}

# Bound the request so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of feeding an error page to the XML parser.
r = requests.get("https://www.ssense.com/sitemap.xml", headers=headers, timeout=30)
r.raise_for_status()
xml = r.text
# Parse the sitemap XML into nested dict form.
raw = xmltodict.parse(xml)

#pages = []
#for info in raw['urlset']['url']:
    # info example: {'loc': 'https://www.paepper.com/...', 'lastmod': '2021-12-28'}
#    url = info['loc']
#    if 'https://www.paepper.com/blog/posts' in url:
#        pages.append({'text': extract_text_from(url), 'source': url})

In [20]:
#import requests
#from bs4 import BeautifulSoup

#headers = {'User-Agent': 'Mozilla/5.0'}

#url = "https://linkedin.com/company/1005"

#r = requests.get(url, headers=headers)
#print(r.text)

#soup = BeautifulSoup(r.text, 'html.parser')
#print(soup.prettify())

In [12]:
import xmltodict
import requests

#site_map_url = 'https://www.ssense.com/sitemap.xml'

#loader_sitemap  = WebBaseLoader(site_map_url, header_template={
#      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
#  })

#sitemap_data = loader_sitemap.load()

In [21]:
from bs4 import BeautifulSoup

def extract_text_from(url, headers=None, timeout=30):
    """Download a page and return its visible text, one non-empty line per row.

    Args:
        url: Address of the page to fetch.
        headers: Optional dict of HTTP headers for the request (for example a
            browser-like ``User-Agent``). ``None`` sends the client defaults.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The page text with every line stripped of surrounding whitespace and
        blank lines removed, joined with ``"\\n"``.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never returned as if it were content.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, features="html.parser")
    text = soup.get_text()

    # Trim each line lazily, then keep only the lines that still have content.
    stripped_lines = (line.strip() for line in text.splitlines())
    return '\n'.join(line for line in stripped_lines if line)

In [29]:
#import requests
#extract_text_from('https://www.ssense.com/en-us/men/shoes')

In [43]:
class FashionWebsite:
    """A fashion retailer site scoped to one gender and one product category."""

    def __init__(self, name, base_url, gender, category):
        """Store the site description.

        Args:
            name: Display name of the retailer, e.g. ``'Ssense'``.
            base_url: Root URL that the gender/category path is appended to;
                a trailing slash is optional.
            gender: Gender path segment, e.g. ``'men'``.
            category: Category path segment, e.g. ``'shoes'``.
        """
        self.name = name
        self.base_url = base_url
        self.gender = gender
        self.category = category

    def __str__(self):
        """Return a short description with the name and the base URL as given."""
        return f"{self.name}'s url is {self.base_url}"

    def _url_to_search(self):
        """Build the listing URL ``<base_url>/<gender>/<category>``.

        Slashes at the joins are normalised so the result has exactly one
        separator between segments whether or not ``base_url`` ends with '/'.
        """
        base = self.base_url.rstrip("/")
        gender = self.gender.strip("/")
        category = self.category.strip("/")
        return f"{base}/{gender}/{category}"

    #def scrape_website_data(self):
        #self._url_to_search()

In [44]:
# Ssense, men's shoes: the same listing page used in the scraping cells above.
ssense_shoes_men = FashionWebsite('Ssense', 'https://www.ssense.com/en-us/', 'men', 'shoes')

In [45]:
# Exercises FashionWebsite.__str__ (site name plus base URL).
print(ssense_shoes_men)

Ssense's url is https://www.ssense.com/en-us/


In [46]:
# Show the listing URL assembled from base URL, gender and category.
ssense_shoes_men._url_to_search()

'https://www.ssense.com/en-us/men/shoes'