In [1]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from huggingface_hub import login
import torch
from sentence_transformers import SentenceTransformer
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline 
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.chat_models import ChatOpenAI
import os, sys, argparse, time
import pandas as pd
from tqdm import tqdm
from itertools import islice

In [2]:
from dotenv import dotenv_values
from huggingface_hub import login
venv = dotenv_values('.env')
login(token=venv["HF_TOKEN"], add_to_git_credential=True)

MODEL_EMBED = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

Token has not been saved to git credential helper.


[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


In [None]:
# def create_vector_store(train_file, collection_name, vectordb):
#     resource = train_file

#     chunksize = 800000
#     max_batch_size = 40000
#     cnt = 0
#     vectorstore = None
#     init = 0
#     for chunk in pd.read_csv(resource, chunksize=chunksize):
#         cnt += 1
#         documents = []
#         # ids = []
#         metadatas = []
#         for _, row in tqdm(chunk.iterrows(), total=chunk.shape[0], desc=f"Processing CSV {cnt}"):
#             ct = row['text']
#             if '"' in ct:
#                 ct = ct.replace('"', '')
#             if r'\n' in ct:
#                 ct = ct.replace(r'\n', '')
#             if r'/' in ct:
#                 ct = ct.replace(r'/', '')
#             dic = f""""Context": {ct}"""
#             documents.append(dic)
#             # ids.append(str(row['cid'][1:-1]))
#             metadatas.append({"cid": row['cid']})
#         # print(documents[0], ids[0], metadatas[0])
        
#         device = 'cuda' if torch.cuda.is_available() else 'cpu'
#         model_kwargs = {'device': device, 'trust_remote_code': True}
#         cache_dir = r"../.cache"

#         embeddings = HuggingFaceEmbeddings(
#             model_name=MODEL_EMBED,
#             cache_folder=cache_dir,
#             model_kwargs=model_kwargs,
#             show_progress=True,
#         )
#         # if (len(documents) == len(ids)) and (len(ids) == len(metadatas)):
#         #     print(f"Number of documents: {len(documents)}")
#         #     time.sleep(1)

#         if init == 0:
#             print("Initializing ChromaDB")
#             vectorstore = Chroma.from_texts(
#                 texts=documents,
#                 # ids=ids,
#                 metadatas=metadatas,
#                 embedding=embeddings,
#                 persist_directory=vectordb,
#                 collection_name=collection_name,
#             )
#             init += 1
#         else:
#             vectorstore.add_texts(
#                 texts=documents,
#                 # ids=ids,
#                 metadatas=metadatas,
#                 embedding=embeddings,
#             )
#     print(f"Collection {collection_name} created successfully in {vectordb}")

In [None]:
def create_vector_store_from_dataset(dataset_name, collection_name, vectordb, MODEL_EMBED):
    """
    Tạo vector store từ Hugging Face Dataset.

    Args:
        dataset_name: Tên của dataset trên Hugging Face Hub hoặc đường dẫn đến dataset cục bộ.
        collection_name: Tên của collection trong vectordb.
        vectordb: Đường dẫn đến thư mục lưu trữ vectordb.
        MODEL_EMBED: Tên của mô hình embedding.

    """
    from datasets import load_dataset
    dataset = load_dataset(dataset_name)
    dataset = dataset['train']
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_kwargs = {'device': device} #'trust_remote_code': True} Bỏ trust_remote_code nếu không cần
    cache_dir = r"../.cache"

    embeddings = HuggingFaceEmbeddings(
        model_name=MODEL_EMBED,
        cache_folder=cache_dir,
        model_kwargs=model_kwargs,
        show_progress=True,
    )

    vectorstore = None
    chunk_size = 20000
    
    for i in range(0, len(dataset), chunk_size):
        # Lặp qua tất cả các split (train, test, validation)
        documents = []
        metadatas = []
        chunk = islice(dataset, i, i + chunk_size)
        for row in tqdm(chunk, desc=f"Processing split {chunk}"):
            text = row['context']
            if isinstance(text, list):
                 text = " ".join(text)  # Nối list thành string nếu cần
            # Tiền xử lý text nếu cần
            if '"' in text:
                text = text.replace('"', '')
            if r'\n' in text:
                text = text.replace(r'\n', '')
            if r'/' in text:
                text = text.replace(r'/', '')


            documents.append(text)
            metadatas.append({"cid": row['cid']})

        if vectorstore is None:
            print("Initializing ChromaDB")
            vectorstore = Chroma.from_texts(
                texts=documents,
                metadatas=metadatas,
                embedding=embeddings,
                persist_directory=vectordb,
                collection_name=collection_name,
            )
        else:
            vectorstore.add_texts(
                texts=documents,
                metadatas=metadatas,
                embedding=embeddings,
            )

    print(f"Collection {collection_name} created successfully in {vectordb}")

In [None]:
# def chatbot_response(content, collection_name, vectordb, index):
#     # model_name = 'meta-llama/Llama-3.2-1B-Instruct' if args.model_name == 'llama' else args.model_name
#     cache_dir = r"../.cache"
# #     tokenizer = AutoTokenizer.from_pretrained(model_name)
# #     model = AutoModelForCausalLM.from_pretrained(
# #         model_name,
# #         device_map="cuda",
# #         cache_dir=cache_dir,
# #         torch_dtype=torch.bfloat16,)

# #     pipe = pipeline(
# #         'text-generation',
# #         model=model,
# #         tokenizer=tokenizer,
# #         device_map="auto",
# #         max_new_tokens=600
# #         )
# #     llm = HuggingFacePipeline(
# #         pipeline=pipe,
# #         model_kwargs={'temperature': 0.6, 'top_p': 0.4},
# #     )

#     device = 'cuda' if torch.cuda.is_available() else 'cpu'
#     model_kwargs = {'device': device, 'trust_remote_code': True}
#     embeddings = HuggingFaceEmbeddings(
#         model_name=MODEL_EMBED,
#         cache_folder=cache_dir,
#         model_kwargs=model_kwargs,
#         show_progress=True,
#         )

#     vectorstore = Chroma(
#         embedding_function=embeddings,
#         persist_directory=vectordb,
#         collection_name=collection_name
#     )
#     # memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
#     # qa = ConversationalRetrievalChain.from_llm(llm=llm, 
#     #                                            retriever=vectorstore.as_retriever(search_kwargs={'k':10}),
#     #                                            verbose=False, memory=memory)
#     # chat = qa({'question': f'{content}'})
#     # print(chat['answer'])
    
#     # Retrieve the top 10 contexts and their corresponding cids
#     results = vectorstore.as_retriever(search_kwargs={'k': 10}).get_relevant_documents(content)
#     cids = [doc.metadata['cid'] for doc in results]  # Assuming 'qid' is what you meant by cid
#     # cids = [doc.id for doc in results]
#     print(f"Top 10 cids related to the question: {index} ", *cids)
#     more_cid = ""
#     for i in cids:
#         more_cid += f" {i}"
#     string_result = f"{index} {more_cid}"
#     print(string_result)
#     return string_result

In [3]:
corpus_file = r"NaverLegal/corpus.csv"
dataset_name = "Zappu/Legal-vn"
vectordb_path = r"Docs_Legal_T1"
collection_name = "DocsLegalT1"

In [None]:
# create_vector_store_from_dataset(dataset_name, collection_name, vectordb_path, MODEL_EMBED)

In [None]:
# create_vector_store(corpus_file, collection_name, vectordb_path)

In [None]:
# test_file = r"NaverLegal/public_test.csv"
# test_df = pd.read_csv(test_file)


In [6]:
def chatbot_response_dataset(content, collection_name, vectordb, index, ):
    # model_name = 'meta-llama/Llama-3.2-1B-Instruct' if args.model_name == 'llama' else args.model_name
    cache_dir = r"../.cache"
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForCausalLM.from_pretrained(
#         model_name,
#         device_map="cuda",
#         cache_dir=cache_dir,
#         torch_dtype=torch.bfloat16,)

#     pipe = pipeline(
#         'text-generation',
#         model=model,
#         tokenizer=tokenizer,
#         device_map="auto",
#         max_new_tokens=600
#         )
#     llm = HuggingFacePipeline(
#         pipeline=pipe,
#         model_kwargs={'temperature': 0.6, 'top_p': 0.4},
#     )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_kwargs = {'device': device, 'trust_remote_code': True}
    embeddings = HuggingFaceEmbeddings(
        model_name=MODEL_EMBED,
        cache_folder=cache_dir,
        model_kwargs=model_kwargs,
        # show_progress=True,
        # resume_download,
        )

    vectorstore = Chroma(
        embedding_function=embeddings,
        persist_directory=vectordb,
        collection_name=collection_name
    )
    # memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    # qa = ConversationalRetrievalChain.from_llm(llm=llm, 
    #                                            retriever=vectorstore.as_retriever(search_kwargs={'k':10}),
    #                                            verbose=False, memory=memory)
    # chat = qa({'question': f'{content}'})
    # print(chat['answer'])
    
    # Retrieve the top 10 contexts and their corresponding cids
    results = vectorstore.as_retriever(search_kwargs={'k': 10}).get_relevant_documents(content)
    cids = [doc.metadata['cid'] for doc in results]  # Assuming 'qid' is what you meant by cid
    # cids = [doc.id for doc in results]
    # print(f"Top 10 cids related to the question: {index} ", *cids)
    more_cid = ""
    for i in cids:
        more_cid += f" {i[1:-1]}"
    string_result = f"{index} {more_cid}"
    # print(string_result)
    return string_result

In [None]:
# with open('predict.txt', 'w') as f:
#     for _, row in tqdm(test_df.iterrows(), total=10000, desc="Search rows"):
#         content = row['question']
#         qid = row['qid']
#         response = chatbot_response(content, collection_name, vectordb_path, qid)
#         f.write(response + '\n')

In [None]:
from datasets import load_dataset
with open('predict2.txt', 'w') as f:
    dataset = load_dataset(dataset_name)
    dataset = dataset['train']
    dataset.select(range(10))
    for row in tqdm(dataset, desc=f"Processing split {dataset.split}"):
        content = row['question']
        qid = row['qid']
        response = chatbot_response_dataset(content, collection_name, vectordb_path, qid)
        f.write(response + '\n')

  vectorstore = Chroma(
  results = vectorstore.as_retriever(search_kwargs={'k': 10}).get_relevant_documents(content)
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f6110bf8520>>
Traceback (most recent call last):
  File "/home/studio-lab-user/.conda/envs/rag/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [None]:
!pip install ipython==8.24.0 ipykernel==6.29.4 ipywidgets==7.8.1 \
jupyter-client==7.4.9 jupyter_core==5.7.2 jupyter_server==2.14.0 \
jupyterlab==3.6.7 nbclient==0.10.0 nbconvert==7.16.3 nbformat==5.10.4 \
notebook==6.5.7 qtconsole==5.5.1 traitlets==5.14.3