# Introduction.
- This simple code shows how pretrained LLM can be better for QA for wine recommendation, with RAG.
- One of posts is used for vector DB and hence retrieved document.
  - Website: https://www.marketviewliquor.com/blog/
  - Post: https://www.marketviewliquor.com/blog/how-to-choose-a-good-wine/#How_to_Pick_out_Wine_for_Dinner
- Base model is `google/flan-t5-large`, which is balanced for both performance and resources for real-time QA, in various running environments.

# 0. Setup.

In [1]:
# Imports.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import os

import nbimporter

# Random seeds.
from transformers import set_seed
import tensorflow as tf

set_seed(42)                  # For HF.
tf.random.set_seed(42)    # For tf, np, and python.

# Suppress warnings
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

import warnings
warnings.filterwarnings("ignore")

In [2]:
%%html
<style>
    table {
        float: left;
        margin-right: 20px; /* Optional: Adds space between table and other content */
    }
</style>

# 1. Vector DB.

## 1.1. Fetch data from url.

In [49]:
import aiohttp
import asyncio
from goose3 import Goose

async def fetch_url_async(session, url, retries=20, timeout=30):
    """Fetch content from a URL asynchronously with retry logic."""
    for attempt in range(retries):
        try:
            async with session.get(url, timeout=timeout) as response:
                scrapper = Goose()  # Goose3 scrapper
                article = scrapper.extract(url)  # Extract article
                txt = article.cleaned_text
                
                print(f"Fetch url: {url} (Attempt {attempt + 1})")
                print(f"Length of fetched: {len(txt)}")
                
                return article
        except asyncio.TimeoutError:
            print(f"Timeout occurred: {url} (Attempt {attempt + 1})")
        except Exception as e:
            print(f"Failed to load URL: {url} (Attempt {attempt + 1}). {str(e)}")
        await asyncio.sleep(2)  # Wait before retrying
    
    # If all attempts fail, log and return failure
    print(f"Failed to fetch URL after {retries} attempts: {url}")
    return f"Failed after {retries} attempts"

async def fetch_all_urls_async(doc_urls):
    """Fetch all URLs concurrently using aiohttp with retry logic."""
    print(f"Num of URLs: {len(doc_urls)}")
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_url_async(session, url) for url in doc_urls]
        return await asyncio.gather(*tasks)

In [331]:
import os
import aiohttp
import asyncio
from goose3 import Goose

# Asynchronous function to fetch content from a URL
async def fetch_url_async(session, url, progress):
    """Fetch content from a URL asynchronously."""
    try:
        async with session.get(url, timeout=10) as response:
            scrapper = Goose()  # Goose3 scrapper
            article = scrapper.extract(url)  # Extract article
            txt = article.cleaned_text
            return txt
    except asyncio.TimeoutError:
        return "Timeout occurred"
    except Exception as e:
        print(f"Failed to load URL: {url}. {str(e)}")
        return f"Failed to load URL: {url}. {str(e)}"
    finally:
        # Update progress after each URL is processed
        progress['completed'] += 1
        total = progress['total']
        completed = progress['completed']
        print(f"Progress: {completed}/{total} URLs fetched ({(completed/total)*100:.2f}%)", end='\r')

# Asynchronous function to fetch all URLs concurrently
async def fetch_all_urls_async(doc_urls):
    """Fetch all URLs concurrently using aiohttp with a simple progress indicator."""
    total_urls = len(doc_urls)
    print(f"Num of URLs: {total_urls}")
    progress = {'total': total_urls, 'completed': 0}  # Initialize progress tracker

    async with aiohttp.ClientSession() as session:
        tasks = [fetch_url_async(session, url, progress) for url in doc_urls]
        results = await asyncio.gather(*tasks)
    print()  # Move to the next line after progress completion
    return results



In [60]:
# Prepare URLs.
doc_urls = [
    'https://www.marketviewliquor.com/blog/how-to-choose-a-good-wine/?'
]

# Load Contents from URLs.
doc_contents = await fetch_all_urls_async(doc_urls)

Num of URLs: 1
Fetch url: https://www.marketviewliquor.com/blog/how-to-choose-a-good-wine/? (Attempt 1)
Length of fetched: 19876


In [70]:
doc_contents[0].doc[0].xpath('//h4')

[<Element h4 at 0x1fb14fbb590>,
 <Element h4 at 0x1fb14fbb180>,
 <Element h4 at 0x1fb14fbba90>,
 <Element h4 at 0x1fb14fbbe00>,
 <Element h4 at 0x1fb14fbbe50>,
 <Element h4 at 0x1fb14fbb310>,
 <Element h4 at 0x1fb14fbbdb0>,
 <Element h4 at 0x1fb14fbb090>]

## 1.2. Construct FAISS.

In [282]:
# Suppress chunk size warning during split.
import logging
logging.getLogger("langchain_text_splitters.base").setLevel(logging.ERROR)

# Text splitter.
from langchain.text_splitter import RecursiveCharacterTextSplitter
chunk_size = 200
chunk_overlap = 20

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, 
    chunk_overlap=chunk_overlap, 
    separators=["\n"]
)


chunks = text_splitter.split_text(doc_contents[0])

# Embedding model.
from langchain_huggingface import HuggingFaceEmbeddings

device       = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_kwargs = {'device': device}

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
                                        model_kwargs   = model_kwargs)

# Save on FAISS.
from langchain.vectorstores import FAISS

vector_store = FAISS.from_texts(chunks, embedding_model)


In [284]:
chunks[:10]

['When dining in a restaurant, a sommelier, or certified wine specialist, can assist you in selecting a perfect wine, even if you do not know much about wine yourself. By asking about your taste preferences, they can recommend a wine that pairs well with your meal, while complementing your likes and dislikes. But what happens when you’re browsing the shelves or web pages of seemingly endless choices of bottles, wondering how to select a good wine? For those who are not familiar with wine, the label on the bottle or product description — complete with descriptions of flavor notes, characteristics of the wine and origin of the grapes — may only make selecting a bottle even more difficult.',
 '\nThe good news is, understanding some basic information about wine can help you learn more about which wines are likely to be a good fit for you — and which ones you’ll probably want to avoid. By learning about your tastes and the general characteristics of wine, you can become a wine expert in no 

# 2. Query and Prompt.

## 2.1. Query.

In [285]:
query_list = [
    "What should I consider to choose a good wine?",
    "Which food is good with sweet wine?",
    "Is older wine better?",
    "Is price important for wine?",
    "How long can I consume a wine after purchase?",
    "Are wines with screw caps bad?",
    "How can I log wine consumption?",
    "Which categories should I check on the label of wine?"
]

## 2.2. Prompt.

In [14]:
prompt_norag = """\
Answer the following "Question:" in detail, based on your knowledge.
Question: {query}
"""

In [124]:
prompt = """\
Based on your knowledge and your understanding of the retrieved information, provide a detailed answer and explanation to the following question.
Retrieved Information:
{retrieved_docs}
Question:
{query}
"""

In [131]:
prompt = """\
Based on your knowledge and the retrieved information, provide a detailed answer and additional explanation to the following question.
Retrieved Information:
{retrieved_docs}
Question:
{query}
"""

# 3. Answer.

- Model: `google/flan-t5-large`
- DB: One post (https://www.marketviewliquor.com/blog/how-to-choose-a-good-wine/?)

In [128]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from datetime import datetime

# Load model and tokenizer
checkpoint = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

print(f"Model: {checkpoint}", end='\n\n')

# Retrieval settings (for RAG)
top_k_retrieval = 3

import datetime

# Initialize a list to store results
results = []

for query in query_list:
    # --- No-RAG ---
    input_text_no_rag = prompt_norag.format(query=query)
    inputs_no_rag = tokenizer(input_text_no_rag, return_tensors="pt").to(device)

    outputs_no_rag = model.generate(
        inputs_no_rag['input_ids'], 
        attention_mask=inputs_no_rag['attention_mask'],
        temperature=0.9,
        top_k=50,
        top_p=0.85,
        max_new_tokens=150,
        repetition_penalty=2.0,
        pad_token_id=tokenizer.eos_token_id
    )

    reply_no_rag = tokenizer.decode(outputs_no_rag[0], skip_special_tokens=True).strip()

    # --- RAG ---
    # Perform retrieval
    doc = vector_store.similarity_search(query, k=top_k_retrieval)
    retrieved_docs = [d.page_content for d in doc]
    retrieved_docs = ''.join(retrieved_docs)

    # Prepare input
    input_text_rag = prompt.format(query=query, retrieved_docs=retrieved_docs)
    inputs_rag = tokenizer(input_text_rag, return_tensors="pt", truncation=True).to(device)

    outputs_rag = model.generate(
        inputs_rag['input_ids'],
        attention_mask=inputs_rag['attention_mask'],
        temperature=0.9,
        top_k=50,
        top_p=0.85,
        max_new_tokens=150,
        repetition_penalty=2.0,
        pad_token_id=tokenizer.eos_token_id
    )

    reply_rag = tokenizer.decode(outputs_rag[0], skip_special_tokens=True).strip()

    # Append the result to the list
    result = f"# Question: {query}\n"
#    result += f"-- Input with prompt: {input_text_rag}\n"
    result += f"-- No-RAG: {reply_no_rag}\n"
    result += f"-- RAG: {reply_rag}\n"
    
    retrieved_docs = retrieved_docs.split('\n')
#    result += "-- Retrieved docs:\n"
#    result += "\n".join(retrieved_docs).replace("\n\n", "\n") + "\n\n"
    
    print(result)
    
    results.append(result)

# Write all results to the file after the loop
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
file_name = f"reply_{current_time}.txt"

with open(file_name, 'w', encoding='utf-8') as file:
    file.writelines(results)

print(f"Output written to {file_name}")

Model: google/flan-t5-large

# Question: What should I consider to choose a good wine?
-- No-RAG: The following are some basic factors to consider when choosing a good wine : The climate of the region where the wine is produced : The soil type : The climate of the region where the wine is produced : The grape variety : The variety of the grape
-- RAG: Because: Since “good wine” is so subjective, knowing how to choose the right wine means considering several factors — including occasion, flavor preferences, labels and price points.

# Question: Which food is good with a sweet wine?
-- No-RAG: A sweet wine is a type of wine that has a high alcohol content. Sweet wines are often served with desserts.
-- RAG: Because: A sweet dessert wine wonderfully caps off a great meal.

# Question: Is older wine better?
-- No-RAG: Older wine has a higher alcohol content. The alcohol content of an older wine is lower than that of a newer wine. Therefore, older wine is generally considered to be better. 

# 4. Expand DB.

## 4.1. Fetch URLs from base. 
- "https://www.marketviewliquor.com/blog/category/wine"

In [414]:
from goose3 import Goose

def get_targeted_urls_with_goose(base_url, max_pages):
    targeted_urls = []
    scrapper = Goose()

    for page in range(1, max_pages + 1):
        # Construct the URL for each page
        url = f"{base_url}/page/{page}" if page > 1 else base_url
        print(f"Fetching: {url}")

        # Use Goose3 to extract the raw HTML
        article = scrapper.extract(url=url)

        # Check if content is valid
        if not article.raw_html:
            print(f"No content found for {url}")
            continue

        # Extract targeted URLs from the raw HTML
        html = article.raw_html
        links = extract_targeted_links_from_html(html)

        # Add unique links to the list
        for link in links:
            if link not in targeted_urls:
                targeted_urls.append(link)

    return targeted_urls

def extract_targeted_links_from_html(html):
    # Extract links that match the desired structure
    links = []
    start = 0
    while True:
        # Look for the specific structure: <a href="URL" rel="bookmark">
        start = html.find('<a href="', start)
        if start == -1:
            break
        start += 9  # Move past '<a href="'
        end = html.find('"', start)
        link = html[start:end]

        # Validate the link to match the desired structure
        if "rel=\"bookmark\"" in html[end:end + 20]:  # Check if 'rel="bookmark"' follows the link
            links.append(link)

    return links

# Usage
base_url = "https://www.marketviewliquor.com/blog/category/wine"
max_pages = 21
doc_urls = get_targeted_urls_with_goose(base_url, max_pages)

# Save urls.txt.
with open('./urls_tmp.txt', 'w') as f:
    for url in doc_urls:
        f.write(url + '\n')

print(f"Found {len(targeted_blog_urls)} targeted blog URLs:")

Fetching: https://www.marketviewliquor.com/blog/category/wine
Fetching: https://www.marketviewliquor.com/blog/category/wine/page/2
Fetching: https://www.marketviewliquor.com/blog/category/wine/page/3
Fetching: https://www.marketviewliquor.com/blog/category/wine/page/4
Fetching: https://www.marketviewliquor.com/blog/category/wine/page/5



KeyboardInterrupt



In [350]:
# Save urls.txt.
with open('./urls.txt', 'w') as f:
    for url in doc_urls:
        f.write(url + '\n')

## 4.2. Fetch doc_contents from URLs.

In [5]:
# Prepare URLs.

"""
it works!!
doc_urls = [
    'https://www.marketviewliquor.com/blog/how-to-choose-a-good-wine/',
    'https://www.marketviewliquor.com/blog/what-really-happens-as-wine-ages/',
    'https://www.marketviewliquor.com/blog/how-to-choose-red-wine/',
    'https://www.marketviewliquor.com/blog/how-to-prevent-and-stop-a-hangover/',
    'https://www.marketviewliquor.com/blog/what-is-bourbon-barrel-wine/',
    'https://www.marketviewliquor.com/blog/best-chilean-wines/'
]"""

# Load urls.txt.
doc_urls = []
with open('./urls.txt', 'r') as f:
    for url in f:
        doc_urls.append(url[:-1])

doc_contents = await fetch_all_urls_async(doc_urls)

# Save doc_contents.txt.
with open('./doc_contents_tmp.txt', 'w', encoding='utf-8') as f:
    for doc_content in doc_contents:
        doc_content = doc_content.strip()
        if doc_content:
            f.write(doc_content)

print("Fetch done!")

Num of URLs: 102
Fetch url: https://www.marketviewliquor.com/blog/guide-sweet-wine-types/ (Attempt 1)
Length of fetched: 20514
Fetch url: https://www.marketviewliquor.com/blog/wine-storing-and-serving-temperatures/ (Attempt 1)
Length of fetched: 3362
Fetch url: https://www.marketviewliquor.com/blog/what-really-happens-as-wine-ages/ (Attempt 1)
Length of fetched: 6182
Fetch url: https://www.marketviewliquor.com/blog/unique-wedding-favor-ideas/ (Attempt 1)
Length of fetched: 18671
Fetch url: https://www.marketviewliquor.com/blog/how-to-prevent-and-stop-a-hangover/ (Attempt 1)
Length of fetched: 12358
Fetch url: https://www.marketviewliquor.com/blog/how-to-choose-a-good-wine/ (Attempt 1)
Length of fetched: 19876
Fetch url: https://www.marketviewliquor.com/blog/back-to-school-wines/ (Attempt 1)
Length of fetched: 2861
Fetch url: https://www.marketviewliquor.com/blog/what-is-moscato/ (Attempt 1)
Length of fetched: 15333
Fetch url: https://www.marketviewliquor.com/blog/thank-you-wine-gifts/ 

## 4.3. Load doc_contents.

In [82]:
# Load doc_contents.txt.
doc_contents = []
with open('./doc_contents.txt', 'r', encoding='utf-8') as f:
    for doc_content in f:
        doc_content = doc_content.strip()
        if doc_content:
            doc_contents.append(doc_content[:-1])

## 4.4. Save to Vector DB.

In [87]:
# Suppress chunk size warning during split.
import logging
logging.getLogger("langchain_text_splitters.base").setLevel(logging.ERROR)

# Text splitter.
from langchain.text_splitter import RecursiveCharacterTextSplitter
chunk_size = 200
chunk_overlap = 20

# Delete docs shorter than 'chunk_size'.
min_doc_len  = chunk_size   # minimum len of doc. 
doc_contents = [doc for doc in doc_contents if len(doc) >= min_doc_len]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, 
    chunk_overlap=chunk_overlap, 
    separators=["\n"]
)

# Embedding model.
from langchain_huggingface import HuggingFaceEmbeddings

device       = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_kwargs = {'device': device}

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
                                        model_kwargs   = model_kwargs)

# Save on FAISS.
from langchain.vectorstores import FAISS

initial_chunks = text_splitter.split_text(doc_contents[0])  # Assuming the first doc_content is used for initialization
vector_store = FAISS.from_texts(initial_chunks, embedding_model)

for doc_content in doc_contents[1:]:
    chunks = text_splitter.split_text(doc_content)
    vector_store.add_texts(chunks)

print(f"Vector store constructed.")
print(f"- Num of docs: {len(doc_contents)}")

Vector store constructed.
- Num of docs: 3059
- Num of chunks: 3059


## 4.5. Query List.

In [177]:
query_list = [
    "What is the most popular wine in Chile?",
    "What should I consider to choose a good wine?",
    "Which food is good with a sweet wine?",
    "Is older wine better?",
    "Is price important for wine?",
    "How long can I consume a wine after purchase?",
    "Are wines with screw caps bad?",
    "How can I log wine consumption?",
    "Which information should I check on the wine label?",
]

#query_list = [
#    "What is the most popular wine in Chile?"
#]

## 4.6. Prompt.

In [156]:
prompt_rag = """\
Based on your knowledge and the retrieved information, answer the following question, and provide additional explanation from the retrieved information.
Retrieved Information:
{retrieved_docs}
Question:
{query}
"""

## 4.7. Answer.

In [183]:
async def process_query(query, top_k_retrieval, prompt_norag, prompt_rag, tokenizer, model, temperature_rag, vector_store,
                        log_retrieved_docs):
    global device
    
    # --- No-RAG ---
    input_text_no_rag = prompt_norag.format(query=query)
    inputs_no_rag = tokenizer(input_text_no_rag, return_tensors="pt").to(device)

    outputs_no_rag = model.generate(
        inputs_no_rag['input_ids'], 
        attention_mask=inputs_no_rag['attention_mask'],
        temperature=0.9,
        top_k=50,
        top_p=0.85,
        max_new_tokens=150,
        repetition_penalty=2.0,
        pad_token_id=tokenizer.eos_token_id
    )

    reply_no_rag = tokenizer.decode(outputs_no_rag[0], skip_special_tokens=True).strip()

    # --- RAG ---
    # Perform retrieval

    # similarity_search
    doc = vector_store.similarity_search(query, k=top_k_retrieval)

    # get embedding
#    query_embedding = embedding_model.embed_query(query)

    # similarity_search_by_vector
#    doc = vector_store.similarity_search_by_vector(query_embedding, k=top_k_retrieval)

    # max_marginal_relevance_search_by_vector
#    doc = vector_store.max_marginal_relevance_search_by_vector(query_embedding, k=top_k_retrieval)

    retrieved_docs_list = [d.page_content for d in doc]
    retrieved_docs = '\n'.join(retrieved_docs_list)

    # Prepare input
    input_text_rag = prompt_rag.format(query=query, retrieved_docs=retrieved_docs)
    inputs_rag = tokenizer(input_text_rag, return_tensors="pt", truncation=True).to(device)

    outputs_rag = model.generate(
        inputs_rag['input_ids'],
        attention_mask=inputs_rag['attention_mask'],
        temperature=temperature_rag,
        top_k=50,
        top_p=0.9,
        max_new_tokens=300,
        repetition_penalty=1.5,
        pad_token_id=tokenizer.eos_token_id
    )

    reply_rag = tokenizer.decode(outputs_rag[0], skip_special_tokens=True).strip()

    # Append the result to the list
    result = f"# Question: {query}\n"
    result += f"-- No-RAG: {reply_no_rag}\n\n"
    result += f"-- RAG: {reply_rag}\n\n"

#    result += f"-- Input with prompt:\n {input_text_rag}\n"

    if log_retrieved_docs:
        result += "\n-- Retrieved docs:\n"
        result += retrieved_docs + "\n\n\n"

    """for doc in retrieved_docs_list:
        result += "---- ---- (doc start) ---- \n"
        for sentence in doc.split('. '):
            sentence = sentence.replace('\n', '')
            result += "---- " + sentence + '. \n'"""

#    print(result)

    print(f"# Question done: {query}")
    print(f"-- No-RAG: {reply_no_rag}\n")
    print(f"-- RAG: {reply_rag}\n")
        
    return result

async def process_query_list(query_list, top_k_retrieval, prompt_norag, prompt_rag, tokenizer, model, temperature_rag, vector_store,
                             log_retrieved_docs=False):
    tasks = []
    for query in query_list:
        tasks.append(process_query(query = query,
                                 top_k_retrieval = top_k_retrieval,
                                 prompt_norag = prompt_norag,
                                 prompt_rag = prompt_rag,
                                 tokenizer = tokenizer,
                                 model = model,
                                   temperature_rag = temperature_rag,
                                   log_retrieved_docs = log_retrieved_docs,
                                 vector_store = vector_store))
    results = await asyncio.gather(*tasks)
    
    return results

In [184]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from datetime import datetime
import asyncio

# Load model and tokenizer
checkpoint = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

print(f"Model: {checkpoint}", end='\n\n')

# Retrieval settings (for RAG)
top_k_retrieval = 5
temperature_rag = 0.9


results = await process_query_list(query_list = query_list,
                                   top_k_retrieval = top_k_retrieval,
                                   prompt_norag = prompt_norag,
                                   prompt_rag = prompt_rag,
                                   tokenizer = tokenizer,
                                   model = model,
                                   temperature_rag = temperature_rag,
                                   vector_store = vector_store)

# Write results to file.
import datetime
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
file_name = f"reply_{current_time}.txt"

with open(file_name, 'w', encoding='utf-8') as file:
    file.writelines(results)

print(f"Output written to {file_name}")

Model: google/flan-t5-large

# Question done: What is the most popular wine in Chile?
-- No-RAG: Chilean wine is a type of red wine that is produced in the Andes region of South America. The most popular wine in Chile is Chardonnay .

-- RAG: Chilean Cabernet Sauvignon

# Question done: What should I consider to choose a good wine?
-- No-RAG: The following are some basic factors to consider when choosing a good wine : The climate of the region where the wine is produced : The soil type : The climate of the region where the wine is produced : The grape variety : The variety of the grape

-- RAG: Since “good wine” is so subjective, knowing how to choose the right wine means considering several factors — including occasion, flavor preferences, labels and price points.

# Question done: Which food is good with a sweet wine?
-- No-RAG: A sweet wine is a type of wine that has a high alcohol content. Sweet wines are often served with desserts.

-- RAG: Sweet wines like Adesso Cagnina di Romag