### Load API

In [1]:
import os
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment, then
# read each one. A variable that is not set yields None.
load_dotenv()

# Model identifiers: text generation and embeddings.
GENERATE_MODEL_NAME = os.environ.get("GENERATE_MODEL_NAME")
EMBEDDINGS_MODEL_NAME = os.environ.get("EMBEDDINGS_MODEL_NAME")

# Qdrant vector database connection settings.
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_COLLECTION_NAME = os.environ.get("QDRANT_COLLECTION_NAME")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")

# Hugging Face Inference API credential (used by the embeddings client).
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")

# ngrok settings; not referenced elsewhere in the visible cells.
NGROK_STATIC_DOMAIN = os.environ.get("NGROK_STATIC_DOMAIN")
NGROK_TOKEN = os.environ.get("NGROK_TOKEN")

### Load model and tokenizer directly (tokyotech-llm)

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face checkpoint shared by the model and its tokenizer.
_SWALLOW_CHECKPOINT = "tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.2"

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `model`, `tokenizer` and `device` are module-level names that the rerank
# retriever classes further down read directly.
model = AutoModelForCausalLM.from_pretrained(_SWALLOW_CHECKPOINT).to(device)
tokenizer = AutoTokenizer.from_pretrained(_SWALLOW_CHECKPOINT)

`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Load embedding model

In [3]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_ollama import OllamaEmbeddings

def embedding_model(model_choice: str) -> object:
    """Build the embeddings client selected by ``model_choice``.

    Args:
        model_choice: ``"llama3"`` for an Ollama-served embedding model, or
            ``"Sentence Transformers"`` for the Hugging Face Inference API
            model named by ``EMBEDDINGS_MODEL_NAME``.

    Returns:
        The constructed embeddings object.

    Raises:
        ValueError: If ``model_choice`` is not one of the supported values.
            (Previously this path failed with an UnboundLocalError because no
            branch assigned the result.)
    """
    if model_choice == "llama3":
        return OllamaEmbeddings(model=model_choice)
    if model_choice == "Sentence Transformers":
        return HuggingFaceInferenceAPIEmbeddings(
            model_name=EMBEDDINGS_MODEL_NAME,
            api_key=HUGGINGFACE_API_KEY)
    raise ValueError(
        f"Unsupported embedding model choice: {model_choice!r}; "
        "expected 'llama3' or 'Sentence Transformers'"
    )

### Crawl Web URL

In [4]:
import os
import re
import json
from langchain_community.document_loaders import RecursiveUrlLoader, WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from bs4 import BeautifulSoup
#from fake_useragent import UserAgent

#os.environ['USER_AGENT'] = UserAgent().chrome

USER_AGENT environment variable not set, consider setting it to identify your requests.


#### Extract Content from HTML

In [5]:
def bs4_extractor(html: str) -> str:
    """Return the visible text of an HTML string.

    The markup is parsed with BeautifulSoup's "html.parser", runs of two or
    more newlines are collapsed to exactly two, and the result is stripped of
    leading/trailing whitespace.
    """
    page_text = BeautifulSoup(html, "html.parser").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()

#### Crawl web data using recursion

In [6]:
def craw_web(url_data, max_depth=4, chunk_size=10000, chunk_overlap=500):
    """Recursively crawl a site and split the fetched pages into chunks.

    Args:
        url_data: Root URL handed to ``RecursiveUrlLoader``.
        max_depth: Maximum link depth followed from the root (default 4,
            the value that used to be hard-coded).
        chunk_size: Maximum characters per chunk (default 10000).
        chunk_overlap: Characters shared by consecutive chunks (default 500).

    Returns:
        The list of split documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    loader = RecursiveUrlLoader(url=url_data, extractor=bs4_extractor, max_depth=max_depth)
    docs = loader.load()
    print('length: ', len(docs))

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    all_splits = text_splitter.split_documents(docs)
    print('length_all_splits: ', len(all_splits))
    return all_splits

#### Load data from single URL (no recursion)

In [7]:
def web_base_loader(url_data, chunk_size=1000, chunk_overlap=500):
    """Load a single URL (no link following) and split it into chunks.

    Args:
        url_data: URL (or whatever ``WebBaseLoader`` accepts) to fetch.
        chunk_size: Maximum characters per chunk (default 1000, the value
            that used to be hard-coded).
        chunk_overlap: Characters shared by consecutive chunks (default 500).

    Returns:
        The list of split documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    loader = WebBaseLoader(url_data)
    docs = loader.load()
    print('length: ', len(docs))

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(docs)

#### Save data to local

In [8]:
def save_data_locally(documments, filename, directory):
    """Serialise documents to a JSON file.

    Args:
        documments: Iterable of objects exposing ``page_content`` and
            ``metadata`` attributes.
        filename: Name of the JSON file to write.
        directory: Target directory; created (including parents) when it does
            not exist. An empty string means the current working directory.

    The file holds a list of ``{'page_content': ..., 'metadata': ...}``
    objects, indented by four spaces.
    """
    # os.makedirs('') raises, so only create the directory when one is named.
    # exist_ok avoids the check-then-create race of a separate exists() test.
    if directory:
        os.makedirs(directory, exist_ok=True)

    file_path = os.path.join(directory, filename)

    data_to_save = [{'page_content': doc.page_content, 'metadata': doc.metadata} for doc in documments]

    # Explicit encoding keeps the output independent of the platform default.
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data_to_save, file, indent=4)
    print(f'Data saved to {file_path}')

### Connect to database

In [9]:
from uuid import uuid4
from langchain.schema import Document
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient

#### Load Data from local

In [10]:
def load_data_from_local(filename: str, directory: str) -> tuple:
    """Read a JSON file and derive a document name from its file name.

    Args:
        filename: Name of the JSON file inside ``directory``.
        directory: Directory that contains the file.

    Returns:
        A ``(data, doc_name)`` tuple: the parsed JSON content, and
        ``filename`` without its last extension and with underscores replaced
        by spaces.
    """
    # The path must be built from `filename`; the previous code joined the
    # not-yet-assigned `file_path` and always raised UnboundLocalError.
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    print(f'Data loaded from {file_path}')

    return data, filename.rsplit('.', 1)[0].replace('_', ' ')

#### Connect to the Qdrant Vector DataBase

In [11]:
# def connect_to_qdrant(URI_link: str, collection_name: str) -> Qdrant:
def connect_to_qdrant(model_name: str, url: str, api: str, collection_name: str) -> Qdrant:
    """Open a LangChain Qdrant vector store over an existing collection.

    Args:
        model_name: Choice string forwarded to ``embedding_model``.
        url: Qdrant server URL.
        api: Qdrant API key.
        collection_name: Collection the returned store operates on.

    Returns:
        A ``Qdrant`` store wired to the client and the selected embeddings.
    """
    # The embeddings are built before the client, as in the original flow.
    embedder = embedding_model(model_name)
    qdrant_client = QdrantClient(url=url, api_key=api, prefer_grpc=False)
    return Qdrant(
        client=qdrant_client,
        embeddings=embedder,
        collection_name=collection_name,
    )

#### Qdrant

In [12]:
# def vector_em(URI_link: str, collection_name: str, filename: str, directory: str, use_ollama: bool = False) -> Qdrant:

def vector_em(collection_name: str, filename: str, directory: str, use_ollama: bool = False) -> Qdrant:
    """Embed documents saved on disk and add them to a Qdrant collection.

    Args:
        collection_name: Target collection. It used to be ignored in favour
            of ``QDRANT_COLLECTION_NAME``; it is now honoured, and the global
            is only the fallback when a falsy value is passed.
        filename: JSON file previously written by ``save_data_locally``.
        directory: Directory that contains ``filename``.
        use_ollama: Use the "llama3" Ollama embeddings instead of the
            "Sentence Transformers" Hugging Face embeddings.

    Returns:
        The ``Qdrant`` vector store the documents were added to.
    """
    model_name = "llama3" if use_ollama else "Sentence Transformers"

    local_data, doc_name = load_data_from_local(filename, directory)
    documents = []
    for doc in local_data:
        # Tolerate records saved without (or with a null) metadata object.
        meta = doc.get('metadata') or {}
        documents.append(
            Document(
                page_content=doc.get('page_content') or '',
                metadata={
                    'source': meta.get('source') or '',
                    'content_type': meta.get('content_type') or 'text/plain',
                    'title': meta.get('title') or '',
                    'description': meta.get('description') or '',
                    'language': meta.get('language') or 'en',
                    'doc_name': doc_name,
                    'start_index': meta.get('start_index') or 0
                }
            )
        )
    print('documents: ', documents)
    # One fresh random id per document.
    uuids = [str(uuid4()) for _ in range(len(documents))]
    vectorstore = connect_to_qdrant(model_name=model_name,
                                    url=QDRANT_URL,
                                    api=QDRANT_API_KEY,
                                    collection_name=collection_name or QDRANT_COLLECTION_NAME)
    vectorstore.add_documents(documents=documents, ids=uuids)
    print('vector: ', vectorstore)
    return vectorstore

#### Qdrant Live

In [13]:
# def qdrant_live(URL: str, URI_link: str, collection_name: str, doc_name: str, use_ollama: bool = False) -> Qdrant:

def qdrant_live(URL: str, collection_name: str, doc_name: str, use_ollama: bool = False) -> Qdrant:
    """Crawl a site and add the resulting chunks to a Qdrant collection.

    Args:
        URL: Root URL passed to ``craw_web``.
        collection_name: Target collection. It used to be ignored in favour
            of ``QDRANT_COLLECTION_NAME``; it is now honoured, and the global
            is only the fallback when a falsy value is passed.
        doc_name: Label stored in every chunk's ``doc_name`` metadata field.
        use_ollama: Use the "llama3" Ollama embeddings instead of the
            "Sentence Transformers" Hugging Face embeddings.

    Returns:
        The ``Qdrant`` vector store the chunks were added to.
    """
    model_name = "llama3" if use_ollama else "Sentence Transformers"

    documents = craw_web(URL)
    for doc in documents:
        # The crawler yields Document objects, so metadata is an attribute;
        # the previous `doc['metadata']` subscript raised TypeError.
        crawled = doc.metadata or {}
        doc.metadata = {
            'source': crawled.get('source') or '',
            'content_type': crawled.get('content_type') or 'text/plain',
            'title': crawled.get('title') or '',
            'description': crawled.get('description') or '',
            'language': crawled.get('language') or 'en',
            'doc_name': doc_name,
            'start_index': crawled.get('start_index') or 0
        }
    # One fresh random id per chunk.
    uuids = [str(uuid4()) for _ in range(len(documents))]
    vectorstore = connect_to_qdrant(model_name=model_name,
                                    url=QDRANT_URL,
                                    api=QDRANT_API_KEY,
                                    collection_name=collection_name or QDRANT_COLLECTION_NAME)
    vectorstore.add_documents(documents=documents, ids=uuids)
    print('vector: ', vectorstore)
    return vectorstore

### Create Vector Store Retriever

In [14]:
from langchain.schema.document import Document
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.retrievers import WikipediaRetriever
from typing import List
from langchain_community.retrievers import BM25Retriever # Retriever base on BM25
from collections import defaultdict
#from langchain.retrievers import EnsembleRetriever # Combine lots of retrievers

#### RerankRetriever(AutoTokenizer)
Can then re-rank the top candidates from the first stage for higher accuracy in critical cases.

In [15]:
class RerankRetriever(VectorStoreRetriever):
    """Second-stage retriever that re-ranks a first-stage retriever's hits.

    The wrapped retriever supplies the candidates; each (query, candidate)
    pair is scored with the notebook-level ``tokenizer`` and ``model``, and
    the best two candidates are returned.
    """
    # First-stage retriever that supplies the candidates to re-rank.
    vectorstore: VectorStoreRetriever

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return up to two candidates with the highest model score.

        Args:
            query: The user query.

        Returns:
            The top-scoring documents, best first. Fewer than two are
            returned when the first stage yields fewer; an empty list when it
            yields none (previously both cases raised).
        """
        docs = self.vectorstore.get_relevant_documents(query=query)
        if not docs:
            # Nothing to score; avoid tokenizing an empty batch.
            return []
        candidates = [doc.page_content for doc in docs]
        queries = [query] * len(candidates)
        features = tokenizer(queries, candidates, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            scores = model(**features).logits
            # NOTE(review): ranking needs one score per candidate after the
            # sum, i.e. logits shaped (batch, n) - confirm that the loaded
            # model produces that shape.
            _, indices = torch.sum(scores, dim=1).sort(descending=True)
        # Slice instead of indexing [0] and [1] so a single hit does not fail.
        return [docs[i] for i in indices[:2].tolist()]

  class RerankRetriever(VectorStoreRetriever):


#### RerankWikiRetriever(AutoTokenizer)

In [16]:
class RerankWikiRetriever(VectorStoreRetriever):
    """Second-stage retriever that re-ranks Wikipedia search hits.

    A ``WikipediaRetriever`` supplies the candidates; each (query, candidate)
    pair is scored with the notebook-level ``tokenizer`` and ``model``, and
    the best two candidates are returned.
    """
    # Wikipedia retriever that supplies the candidates to re-rank.
    vectorstore: WikipediaRetriever

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return up to two candidates with the highest model score.

        Args:
            query: The user query.

        Returns:
            The top-scoring documents, best first. Fewer than two are
            returned when Wikipedia yields fewer; an empty list when it
            yields none (previously both cases raised).
        """
        docs = self.vectorstore.get_relevant_documents(query=query)
        if not docs:
            # Nothing to score; avoid tokenizing an empty batch.
            return []
        candidates = [doc.page_content for doc in docs]
        queries = [query] * len(candidates)
        features = tokenizer(queries, candidates, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            scores = model(**features).logits
            # NOTE(review): ranking needs one score per candidate after the
            # sum, i.e. logits shaped (batch, n) - confirm that the loaded
            # model produces that shape.
            _, indices = torch.sum(scores, dim=1).sort(descending=True)
        # Slice instead of indexing [0] and [1] so a single hit does not fail.
        return [docs[i] for i in indices[:2].tolist()]

  class RerankWikiRetriever(VectorStoreRetriever):


#### Retriever(Embedding)
Can be used as a first-stage retrieval system to fetch a larger set of documents quickly.

In [17]:
class get_retriever(VectorStoreRetriever):
    """Hybrid first-stage retriever: vector similarity fused with BM25.

    Both rankings are merged with weighted reciprocal-rank scores
    (``weight * 1 / rank``), so a document found by both retrievers
    accumulates both contributions.

    The ``vectorstore`` field is the one langchain's ``VectorStoreRetriever``
    base class declares (a vector store). It is no longer re-annotated as a
    retriever, because the code calls ``as_retriever`` and
    ``similarity_search`` on it.
    """
    # Relative importance of each ranking in the fused score.
    qdrant_weight: float = 0.7
    bm25_weight: float = 0.3

    def __init__(self, qdrant_weight=0.7, bm25_weight=0.3, **kwargs):
        """Create the retriever.

        Args:
            qdrant_weight: Weight of the vector-similarity ranking.
            bm25_weight: Weight of the BM25 ranking.
            **kwargs: Remaining model fields, e.g. ``vectorstore=db``.
        """
        # Going through the base initialiser lets the model validate and set
        # its fields; assigning attributes on an uninitialised instance fails.
        super().__init__(qdrant_weight=qdrant_weight, bm25_weight=bm25_weight, **kwargs)

    @property
    def weights(self) -> List[float]:
        """``[qdrant_weight, bm25_weight]``, kept for existing readers."""
        return [self.qdrant_weight, self.bm25_weight]

    def get_relevant_documents(self, query: str, k: int = 4) -> List[Document]:
        """Return the ``k`` best documents after fusing both rankings.

        Args:
            query: The user query.
            k: Number of hits requested from each retriever and returned.

        Raises:
            ValueError: If the vector store returns no documents to build the
                BM25 corpus from.
        """
        qdrant_retriever = self.vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": k}
        )
        # Query through the retriever built above (it was previously unused).
        qdrant_docs = qdrant_retriever.get_relevant_documents(query)
        # BM25 needs an in-memory corpus: take up to 100 documents returned
        # by an empty-string similarity search.
        documents = [
            Document(page_content=doc.page_content, metadata=doc.metadata)
            for doc in self.vectorstore.similarity_search("", k=100)
        ]
        if not documents:
            raise ValueError(f"Not found documents in the collection '{QDRANT_COLLECTION_NAME}'")
        bm25_retriever = BM25Retriever.from_documents(documents)
        bm25_retriever.k = k
        bm25_docs = bm25_retriever.get_relevant_documents(query)
        scored_docs = self.combine_and_score_documents(qdrant_docs, bm25_docs)
        return scored_docs[:k]

    def combine_and_score_documents(self, qdrant_docs: List[Document], bm25_docs: List[Document]) -> List[Document]:
        """Merge two ranked lists into one list ordered by fused score.

        Documents are identified by their text plus ``source`` metadata,
        because Document objects cannot be used as dictionary keys. The first
        occurrence of each document is the one returned.
        """
        doc_scores = defaultdict(float)
        first_seen = {}
        for weight, ranked_docs in zip(self.weights, (qdrant_docs, bm25_docs)):
            for i, doc in enumerate(ranked_docs):
                key = (doc.page_content, str((doc.metadata or {}).get("source", "")))
                first_seen.setdefault(key, doc)
                doc_scores[key] += (weight * (1 / (i + 1)))
        # sorted() is stable, so ties keep their first-seen order.
        sorted_keys = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
        return [first_seen[key] for key, _ in sorted_keys]

  class get_retriever(VectorStoreRetriever):


### Create LLM Server

In [18]:
from langchain.retrievers import WikipediaRetriever
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, MultiRetrievalQAChain
from transformers import pipeline
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
import torch
#from langchain.llms import VLLM
#from vllm import LLM, SamplingParams
#from langchain.llms import HuggingFaceHub

class LLMServe:
    """Retrieval-augmented QA service.

    Holds the embeddings client, the active retriever, the text-generation
    pipeline, the prompt and the assembled RetrievalQA chain. ``rag(source)``
    returns the chain for a knowledge source, rebuilding it only when the
    source changes.
    """

    def __init__(self) -> None:
        """Build every component, starting with the "wiki" source."""
        self.embeddings = self.load_embeddings()
        self.current_source = "wiki"
        self.retriever = self.load_retriever(retriever_name=self.current_source, embeddings=self.embeddings)
        self.pipe = self.load_model_pipeline(max_new_tokens=300)
        self.prompt = self.load_prompt_template()
        # The parameter is named `pipe`; passing `llm=` raised TypeError.
        self.rag_pipeline = self.load_rag_pipeline(pipe=self.pipe, retriever=self.retriever, prompt=self.prompt)

    def load_embeddings(self):
        """Return the Hugging Face Inference API embeddings client."""
        embeddings = HuggingFaceInferenceAPIEmbeddings(
            model_name=EMBEDDINGS_MODEL_NAME,
            api_key=HUGGINGFACE_API_KEY,
        )
        return embeddings

    def load_retriever(self, retriever_name, embeddings):
        """Build the re-ranking retriever for a knowledge source.

        Args:
            retriever_name: ``"wiki"`` for Wikipedia; any other value selects
                the Qdrant collection named by ``QDRANT_COLLECTION_NAME``.
            embeddings: Embeddings client used by the Qdrant store.

        Returns:
            A retriever that fetches 15 candidates and re-ranks them.
        """
        if retriever_name == "wiki":
            # "ja" is Wikipedia's language code for Japanese ("jp" is the
            # country code, not a language edition).
            return RerankWikiRetriever(
                vectorstore=WikipediaRetriever(lang="ja", doc_content_chars_max=800, top_k_results=15))
        #elif retriever_name == "combine":
            #retriever = get_retriever(vectorstore=db)
        client = QdrantClient(url=QDRANT_URL,
                              api_key=QDRANT_API_KEY,
                              prefer_grpc=False)
        db = Qdrant(client=client,
                    embeddings=embeddings,
                    collection_name=QDRANT_COLLECTION_NAME)
        return RerankRetriever(vectorstore=db.as_retriever(search_kwargs={"k": 15}))

    def load_model_pipeline(self, max_new_tokens=300):
        """Create the text-generation pipeline for ``GENERATE_MODEL_NAME``.

        Args:
            max_new_tokens: Upper bound on generated tokens per answer.
        """
        # transformers pipelines take a device index: 0 = first GPU, -1 = CPU.
        device = 0 if torch.cuda.is_available() else -1
        pipe = pipeline(
            "text-generation",
            model=GENERATE_MODEL_NAME,
            device=device,
            max_new_tokens=max_new_tokens)
        return pipe

    def load_prompt_template(self):
        """Return the Japanese context/question prompt template.

        Two earlier variants were replaced by the template below: a Tokyo
        Tech assistant persona with an explicit "I do not know" fallback, and
        a minimal context / question / answer layout.
        """
        query_template = (
            "あなたは文脈(context)に基づいて質問に答える賢いチャットボットです。\n\n"
            "### 文脈: {context} \n\n### 人間: {question}\n\n### アシスタント:")
        prompt = PromptTemplate(
            template=query_template,
            input_variables=["context", "question"])
        return prompt

    def load_rag_pipeline(self, pipe, retriever, prompt):
        """Assemble a "stuff" RetrievalQA chain that returns its sources.

        Args:
            pipe: transformers text-generation pipeline.
            retriever: Retriever that supplies the context documents.
            prompt: Prompt template with ``context`` and ``question`` slots.
        """
        rag_pipeline = RetrievalQA.from_chain_type(
            llm=HuggingFacePipeline(pipeline=pipe),
            chain_type='stuff',
            retriever=retriever,
            chain_type_kwargs={"prompt": prompt},
            return_source_documents=True)
        return rag_pipeline

    def rag(self, source):
        """Return the RAG chain for ``source``, rebuilding it on a change.

        Args:
            source: Knowledge source name, as accepted by ``load_retriever``.
        """
        if source != self.current_source:
            self.retriever = self.load_retriever(retriever_name=source, embeddings=self.embeddings)
            # Same keyword fix as in __init__: `pipe=`, not `llm=`.
            self.rag_pipeline = self.load_rag_pipeline(pipe=self.pipe, retriever=self.retriever, prompt=self.prompt)
            self.current_source = source
        return self.rag_pipeline

In [19]:
# Instantiate the service once. The constructor loads the embeddings client,
# the default "wiki" retriever, the text-generation pipeline, the prompt and
# the RAG chain.
app = LLMServe()

`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192
`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got 8192 and max_position_embeddings=8192


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

: 

: 

: 