# Introduction.
- This simple code shows how pretrained LLM can be better for QA for wine recommendation, with RAG.
- One of posts is used for vector DB and hence retrieved document.
  - Website: https://www.marketviewliquor.com/blog/
  - Post: https://www.marketviewliquor.com/blog/how-to-choose-a-good-wine/#How_to_Pick_out_Wine_for_Dinner
- Base model is `google/flan-t5-large`, which is balanced for both performance and resources for real-time QA, in various running environments.

# 0. Setup.

In [1]:
# Imports.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import os

import nbimporter

# Random seeds.
from transformers import set_seed
import tensorflow as tf

set_seed(42)                  # For HF.
tf.random.set_seed(42)    # For tf, np, and python.

# Suppress warnings
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

import warnings
warnings.filterwarnings("ignore")

In [2]:
%%html
<style>
    table {
        float: left;
        margin-right: 20px; /* Optional: Adds space between table and other content */
    }
</style>

# 1. Vector DB.

## 1.1. Prompt.

In [3]:
prompt = """\
You are chatbot that recommends a wine. Recommend a wine, based on Query and Retrieved Chunks.
Query: {query}
Retrieved Chunks:
{retrieved_chunks}
"""

## 1.2. Fetch data from url.

In [4]:
import os
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from datasets import Dataset
from datetime import datetime
import time

async def fetch_url_async(session, url):
    """Fetch content from a URL asynchronously."""
    try:
        async with session.get(url, timeout=10) as response:
            html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')
            
            # Custom rule for 'https://www.marketviewliquor.com/blog/how-to-choose-a-good-wine/?'.
            paragraphs = soup.find('div', class_="entry-content").find_all('p')
            items      = soup.find('div', class_="lwptoc_item")
            spans      = soup.find('div', class_="entry-content").find_all('span')
            
            txts = []
            txts.extend(paragraphs)
            txts.extend(items)
            txts.extend(spans)
            txts = "\n\n".join(txt.get_text(separator='. ', strip=True) for txt in txts)
            
            # Clean txts.
            txts = txts.replace(u'\xa0', u' ')
            
            return txts
            
    except asyncio.TimeoutError:
        return "Timeout occurred"
    except Exception as e:
        return f"Failed to load URL: {str(e)}"

async def fetch_all_urls_async(doc_urls):
    """Fetch all URLs concurrently using aiohttp."""
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_url_async(session, url) for url in doc_urls]
        return await asyncio.gather(*tasks)



In [5]:
# Prepare URLs.
doc_urls = [
    'https://www.marketviewliquor.com/blog/how-to-choose-a-good-wine/?'
]

# Load Contents from URLs.
doc_contents = await fetch_all_urls_async(doc_urls)

## 1.3. Construct FAISS.

In [63]:
# Suppress chunk size warning during split.
import logging
logging.getLogger("langchain_text_splitters.base").setLevel(logging.ERROR)

# Text splitter.
from langchain.text_splitter import CharacterTextSplitter
chunk_size    = 200
chunk_overlap = 0

text_splitter = CharacterTextSplitter(separator=". ", 
                                      chunk_size=chunk_size, 
                                      chunk_overlap=chunk_overlap)

chunks = text_splitter.split_text(doc_contents[0])

# Embedding model.
from langchain_huggingface import HuggingFaceEmbeddings

device       = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_kwargs = {'device': device}

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
                                        model_kwargs   = model_kwargs)

# Save on FAISS.
from langchain.vectorstores import FAISS

vector_store = FAISS.from_texts(chunks, embedding_model)


# 2. Query and Retrieve.

## 2.1. Query.

In [95]:
query_list = [
    "I'm new to wine. Which wine should I start?",
    "What should I consider to choose a good wine?",
    "Which food is good with sweet wine?",
    "Does wine always become better as getting older?",
    "Is price important for wine?",
    "How long can I consume a wine after purchase?",
    "Are wines with screw caps bad?",
    "How can I log the history of wine consumption?",
    "Which categories should I check on the label of wine?"
]

# Answer.

## `google/flan-t5-base`

1. I'm new to wine. Which wine should I start with?  
   - **No-RAG:** Sauvignon Blanc – Acceptable, as it's crisp and approachable.  
   - **RAG:** White or Rosé – More accurate since both are beginner-friendly wines.

2. What wine pairs well with spicy food?  
   - **No-RAG:** Chardonnay – Incorrect; its flavors don't complement spice well.  
   - **RAG:** Riesling – Correct; sweetness balances spice.

3. Best dessert for wine  
   - **No-RAG:** A chocolate cake – Reasonable but not ideal.  
   - **RAG:** Cheesecake – More versatile and pairs better with most dessert wines.

4. What is the best wine to serve at a wedding?  
   - **No-RAG:** Rosé – Acceptable but not ideal for diverse preferences.  
   - **RAG:** A wine with high acidity – Vague but applicable (e.g., sparkling wines).

5. What is a good wine for a romantic dinner?  
   - **No-RAG:** Sauvignon Blanc – Acceptable but lacks elegance for romance.  
   - **RAG:** A higher-acidity wine – Vague but could include good options like Pinot Noir.

6. Which wine is ideal for a vegetarian meal?  
   - **No-RAG:** Sauvignon Blanc – Acceptable for many vegetarian dishes.  
   - **RAG:** Riesling – More versatile for a variety of vegetarian cuisines.


In [41]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load model and tokenizer
checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

print(f"Model: {checkpoint}", end='\n\n')

# Retrieval settings (for RAG)
top_k_retrieval = 5

for query in query_list:
    # --- No-RAG ---
    input_text_no_rag = f"Answer the following question based on your knowledge: {query}"
    inputs_no_rag = tokenizer(input_text_no_rag, return_tensors="pt").to(device)

    outputs_no_rag = model.generate(
        inputs_no_rag['input_ids'], 
        attention_mask=inputs_no_rag['attention_mask'],
        temperature=0.9,
        top_k=50,
        top_p=0.85,
        max_new_tokens=100,
        repetition_penalty=2.0,
        pad_token_id=tokenizer.eos_token_id
    )

    reply_no_rag = tokenizer.decode(outputs_no_rag[0], skip_special_tokens=True).strip()

    # --- RAG ---
    # Perform retrieval
    doc = vector_store.similarity_search(query, k=top_k_retrieval)
    retrieved_docs = [d.page_content for d in doc]
    retrieved_docs = ''.join(retrieved_docs)

    # Prepare input
    input_text_rag = f"Answer the following question based on your knowledge and the context:\n\nQuestion: {query}\nContext: {retrieved_docs}\nAnswer:"
    inputs_rag = tokenizer(input_text_rag, return_tensors="pt", truncation=True).to(device)

    outputs_rag = model.generate(
        inputs_rag['input_ids'],
        attention_mask=inputs_rag['attention_mask'],
        temperature=0.8,
        top_k=50,
        top_p=0.95,
        max_new_tokens=50,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id
    )

    reply_rag = tokenizer.decode(outputs_rag[0], skip_special_tokens=True).strip()

    # Print results
    print(f"Question: {query}")
    print(f"No-RAG: {reply_no_rag}")
    print(f"RAG: {reply_rag}", end='\n\n')


Model: google/flan-t5-base

Question: I'm new to wine. Which wine should I start with?
No-RAG: Sauvignon Blanc
RAG: white or rose

Question: What wine pairs well with spicy food?
No-RAG: Chardonnay
RAG: Riesling

Question: Best dessert for wine
No-RAG: a chocolate cake
RAG: cheesecake

Question: What is the best wine to serve at a wedding?
No-RAG: rosé
RAG: a wine with high acidity

Question: What is a good wine for a romantic dinner?
No-RAG: Sauvignon Blanc
RAG: a higher-acidity wine

Question: Which wine is ideal for a vegetarian meal?
No-RAG: Sauvignon Blanc
RAG: Riesling



## `google/flan-t5-large`

In [97]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from datetime import datetime

# Load model and tokenizer
checkpoint = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

print(f"Model: {checkpoint}", end='\n\n')

# Retrieval settings (for RAG)
top_k_retrieval = 1

import datetime

# Initialize a list to store results
results = []

for query in query_list:
    # --- No-RAG ---
    input_text_no_rag = f"Answer the following question based on your knowledge: {query}"
    inputs_no_rag = tokenizer(input_text_no_rag, return_tensors="pt").to(device)

    outputs_no_rag = model.generate(
        inputs_no_rag['input_ids'], 
        attention_mask=inputs_no_rag['attention_mask'],
        temperature=0.9,
        top_k=50,
        top_p=0.85,
        max_new_tokens=150,
        repetition_penalty=2.0,
        pad_token_id=tokenizer.eos_token_id
    )

    reply_no_rag = tokenizer.decode(outputs_no_rag[0], skip_special_tokens=True).strip()

    # --- RAG ---
    # Perform retrieval
    doc = vector_store.similarity_search(query, k=top_k_retrieval)
    retrieved_docs = [d.page_content for d in doc]
    retrieved_docs = ''.join(retrieved_docs)

    # Prepare input
    input_text_rag = f"Answer the following question in detail based on your knowledge and the context:\n\nQuestion: {query}\nContext: {retrieved_docs}\nAnswer:"
    inputs_rag = tokenizer(input_text_rag, return_tensors="pt", truncation=True).to(device)

    outputs_rag = model.generate(
        inputs_rag['input_ids'],
        attention_mask=inputs_rag['attention_mask'],
        temperature=0.9,
        top_k=50,
        top_p=0.85,
        max_new_tokens=150,
        repetition_penalty=0.5,
        pad_token_id=tokenizer.eos_token_id
    )

    reply_rag = tokenizer.decode(outputs_rag[0], skip_special_tokens=True).strip()

    # Append the result to the list
    result = f"# Question: {query}\n"
    result += f"-- No-RAG: {reply_no_rag}\n"
    result += f"-- RAG: {reply_rag}\n"
    
    retrieved_docs = retrieved_docs.split('\n')
    result += "-- Contexts:\n"
    result += "\n".join(retrieved_docs).replace("\n\n", "\n") + "\n\n"
    
    results.append(result)

# Write all results to the file after the loop
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
file_name = f"reply_{current_time}.txt"

with open(file_name, 'w', encoding='utf-8') as file:
    file.writelines(results)

print(f"Output written to {file_name}")



Model: google/flan-t5-large

Output written to reply_2025-01-07_14-58-03.txt
