In [2]:
import logging
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed

# import click
import torch
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import JSONLoader, MathpixPDFLoader
import sys 

from src.constants import (
    CHROMA_SETTINGS,
    DOCUMENT_MAP,
    EMBEDDING_MODEL_NAME,
    INGEST_THREADS,
    PERSIST_DIRECTORY,
    SOURCE_DIRECTORY,
    cfg
)



def file_log(logentry):
   file1 = open(cfg.STORAGE.INGEST_LOG,"a")
   file1.write(logentry + "\n")
   file1.close()
   print(logentry + "\n")

def load_single_document(file_path: str) -> Document:
    # Loads a single document from a file path
    try:
       file_extension = os.path.splitext(file_path)[1]
       loader_class = DOCUMENT_MAP.get(file_extension)
       if loader_class:
           file_log(file_path + ' loaded.')
           if loader_class == JSONLoader:
              loader = loader_class(file_path, jq_schema=".data[].instruction", text_content=False)
              return loader.load()
           else:
               loader = loader_class(file_path)
       else:
           file_log(file_path + ' document type is undefined.')
           raise ValueError("Document type is undefined")
       raw_text = loader.load()[0]
       print(raw_text.page_content)
       return raw_text
    except Exception as ex:
       file_log('%s loading error: \n%s' % (file_path, ex))
       return None 

def load_document_batch(filepaths):
    logging.info("Loading document batch")
    # create a thread pool
    with ThreadPoolExecutor(len(filepaths)) as exe:
        # load files
        futures = [exe.submit(load_single_document, name) for name in filepaths]
        # collect data
        if futures is None:
           file_log(name + ' failed to submit')
           return None
        else:
           data_list = [future.result() for future in futures]
           # return data and file paths
           return (data_list, filepaths)


def load_documents(source_dir: str) -> list[Document]:
    # Loads all documents from the source documents directory, including nested folders
    paths = []
    for root, _, files in os.walk(source_dir):
        for file_name in files:
            print('Importing: ' + file_name)
            file_extension = os.path.splitext(file_name)[1]
            source_file_path = os.path.join(root, file_name)
            if file_extension in DOCUMENT_MAP.keys():
                paths.append(source_file_path)

    # Have at least one worker and at most INGEST_THREADS workers
    n_workers = min(INGEST_THREADS, max(len(paths), 1))
    chunksize = round(len(paths) / n_workers)
    docs = []
    with ProcessPoolExecutor(n_workers) as executor:
        futures = []
        # split the load operations into chunks
        for i in range(0, len(paths), chunksize):
            # select a chunk of filenames
            filepaths = paths[i : (i + chunksize)]
            # submit the task
            try:
               future = executor.submit(load_document_batch, filepaths)
            except Exception as ex:
               file_log('executor task failed: %s' % (ex))
               future = None
            if future is not None:
               futures.append(future)
        # process all results
        for future in as_completed(futures):
            # open the file and load the data
            try:
                contents, _ = future.result()
                docs.extend(contents)
            except Exception as ex:
                file_log('Exception: %s' % (ex))
                
    return docs


def split_documents(documents: list[Document]) -> tuple[list[Document], list[Document]]:
    # Splits documents for correct Text Splitter
    text_docs, python_docs = [], []
    for doc in documents:
        if doc is not None:
            if isinstance(doc, list):
                for doc_item in doc:
                    file_extension = os.path.splitext(doc_item.metadata["source"])[1]
                    if file_extension == ".py":
                        python_docs.append(doc_item)
                    else:
                        text_docs.append(doc_item)
            else:
                file_extension = os.path.splitext(doc.metadata["source"])[1]
                if file_extension == ".py":
                    python_docs.append(doc)
                else:
                    text_docs.append(doc)
    return text_docs, python_docs



ModuleNotFoundError: No module named 'chromadb'

In [31]:
device_type = "cuda" if torch.cuda.is_available() else "cpu"

# Load documents and split in chunks
logging.info(f"Loading documents from {SOURCE_DIRECTORY}")
documents = load_documents(SOURCE_DIRECTORY)
text_documents, python_documents = split_documents(documents)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=880, chunk_overlap=200
)
texts = text_splitter.split_documents(text_documents)
texts.extend(python_splitter.split_documents(python_documents))
logging.info(f"Loaded {len(documents)} documents from {SOURCE_DIRECTORY}")
logging.info(f"Split into {len(texts)} chunks of text")

# Create embeddings
embeddings = HuggingFaceInstructEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={"device": device_type},
)
# change the embedding type here if you are running into issues.
# These are much smaller embeddings and will work for most appications
# If you use HuggingFaceEmbeddings, make sure to also use the same in the
# run_localGPT.py file.

# embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
print(texts)
db = Chroma.from_documents(
    texts,
    embeddings,
    persist_directory=PERSIST_DIRECTORY,
    client_settings=CHROMA_SETTINGS,
)




Importing: baitap_oop.pdf
Importing: Exercise.pdf
Importing: math_train.json
Importing: baitap_chatgpt.pdf
Importing: Logistic_Regression.pdf
Importing: M01W02_Exercise.pdf
Importing: baitap_vqa.pdf
Importing: M02W01_Excercise_update.pdf
Importing: baitap_vqa.mmd
Importing: Project 3 - ChatGPT.pdf


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/home/server1-ailab/Desktop/Bach/Vietnamese_local_LLM/src/../SOURCE_DOCUMENTS/baitap_oop.pdf loaded.



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

/home/server1-ailab/Desktop/Bach/Vietnamese_local_LLM/src/../SOURCE_DOCUMENTS/Exercise.pdf loaded.
/home/server1-ailab/Desktop/Bach/Vietnamese_local_LLM/src/../SOURCE_DOCUMENTS/M01W02_Exercise.pdf loaded.
/home/server1-ailab/Desktop/Bach/Vietnamese_local_LLM/src/../SOURCE_DOCUMENTS/Logistic_Regression.pdf loaded.
/home/server1-ailab/Desktop/Bach/Vietnamese_local_LLM/src/../SOURCE_DOCUMENTS/math_train.json loaded.
/home/server1-ailab/Desktop/Bach/Vietnamese_local_LLM/src/../SOURCE_DOCUMENTS/M02W01_Excercise_update.pdf loaded.
/home/server1-ailab/Desktop/Bach/Vietnamese_local_LLM/src/../SOURCE_DOCUMENTS/Project 3 - ChatGPT.pdf loaded.
/home/server1-ailab/Desktop/Bach/Vietnamese_local_LLM/src/../SOURCE_DOCUMENTS/baitap_vqa.pdf loaded.
/home/server1-ailab/Desktop/Bach/Vietnamese_local_LLM/src/../SOURCE_DOCUMENTS/baitap_chatgpt.pdf loaded.








page_content='FTEL AI – COURSE 2023\n\nLogistic Regression\n\nNgày 30 tháng 5 năm 2023\n\n1 Giới thiệu\n\n1.1 Logistic Regression\n\nLogistic Reg

In [26]:
print(texts[10].page_content)

N
(cid:89)

i=1

p(yi|Xi; W ) =

N
(cid:89)

i=1

zyi
i (1 − zi)1−yi

(17)

Áp dụng Log-Likelihood cho 17 ta có

N
(cid:89)

i=1

zyi
i (1 − zi)1−yi =

N
(cid:89)

i=1

log(zyi

i (1 − zi)1−yi)

Áp dụng tính chất của logarit tự nhiên 18 trở thành

−

1
N

N
(cid:88)

(yilog(zi) + (1 − yi)log(1 − zi))

i=1

Xét hàm 19 trên 1 điểm dữ liệu ta có:

−yilog(zi) − (1 − yi)log(1 − zi)

(18)

(19)

(20)

Khi y = 1, thì loss = −ylog(z) ngược lại khi y = 0 thì loss = −log(1 − z). Và hàm mất mát này thường
được gọi là Binary CrossEntropy.

6 Tối ưu hàm mất mát

Ở phần này ta sẽ tìm hiểu mối liên hệ giữa Logistic Regression với hàm sigmoid bằng cách giải bài toán
tối ưu hàm loss của nó sau đây. Từ 20 ta giải phương trình đạo hàm theo biến w như sau:

▽wJ = −(

yi
zi

−

1 − yi
1 − zi

)(▽wzI ) = −(

yi − zi
zi(1 − zi)

) =

zi − yi
zi(1 − zi)

(21)

Đến bước này để có thể giải tiếp phương trình 21 ta sẽ đặt si = wT xi và giải được kết quả ▽wzi như
sau:

▽wzi =

∂zi
∂si

▽w zi =

∂zi
∂si

xi


In [2]:
from vllm import LLM, SamplingParams

In [2]:
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]


In [1]:
from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel

destroy_model_parallel()

llm = LLM(model="vilm/vinallama-7b-chat",  trust_remote_code=True)



  from .autonotebook import tqdm as notebook_tqdm
2024-01-12 22:08:02,704	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-01-12 22:08:03,085	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


NameError: name 'LLM' is not defined

In [5]:
llm_mistral = LLM(model="mistralai/Mistral-7B-Instruct-v0.2")

INFO 01-12 13:51:45 llm_engine.py:72] Initializing an LLM engine with config: model='mistralai/Mistral-7B-Instruct-v0.2', tokenizer='mistralai/Mistral-7B-Instruct-v0.2', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, seed=0)


model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]
model-00001-of-00003.safetensors:   0%|          | 10.5M/4.94G [00:02<16:51, 4.87MB/s]
model-00001-of-00003.safetensors:   0%|          | 21.0M/4.94G [00:04<17:34, 4.67MB/s]
model-00001-of-00003.safetensors:   1%|          | 41.9M/4.94G [00:09<18:22, 4.45MB/s]
model-00001-of-00003.safetensors:   1%|          | 52.4M/4.94G [00:11<17:28, 4.66MB/s]
model-00001-of-00003.safetensors:   1%|▏         | 73.4M/4.94G [00:15<17:04, 4.76MB/s]
model-00001-of-00003.safetensors:   2%|▏         | 83.9M/4.94G [00:17<16:45, 4.83MB/s]
model-00001-of-00003.safetensors:   2%|▏         | 94.4M/4.94G [00:20<18:22, 4.40MB/s]
model-00001-of-00003.safetensors:   2%|▏         | 105M/4.94G [00:24<21:21, 3.77MB/s] 
[A
model-00001-of-00003.safetensors:   2%|▏         | 115M/4.94G [00:28<24:25, 3.30MB/s]
model-00001-of-00003.safetensors:   3%|▎         | 126M/4.94G [00:32<26:25, 3.04MB/s]
[A
model-00001-of-00003.safetensors:   3%|▎      

INFO 01-12 14:13:23 llm_engine.py:207] # GPU blocks: 11635, # CPU blocks: 2048


In [8]:
llm_mistral.generate(prompts=prompts, sampling_params=sampling_params)

Processed prompts:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts: 100%|██████████| 4/4 [00:00<00:00,  8.24it/s]


[RequestOutput(request_id=0, prompt='Hello, my name is', prompt_token_ids=[1, 22557, 28725, 586, 1141, 349], prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=" Marissa and I'm a 20 year old student. I'", token_ids=[1471, 13723, 304, 315, 28742, 28719, 264, 28705, 28750, 28734, 879, 1571, 5716, 28723, 315, 28742], cumulative_logprob=-19.33744064345956, logprobs=None, finish_reason=length)], finished=True),
 RequestOutput(request_id=1, prompt='The president of the United States is', prompt_token_ids=[1, 415, 4951, 302, 272, 2969, 3543, 349], prompt_logprobs=None, outputs=[CompletionOutput(index=0, text=' the most powerful person in the world, the commander in chief of the military,', token_ids=[272, 1080, 6787, 1338, 297, 272, 1526, 28725, 272, 15719, 297, 9209, 302, 272, 5469, 28725], cumulative_logprob=-12.20835955440998, logprobs=None, finish_reason=length)], finished=True),
 RequestOutput(request_id=2, prompt='The capital of France is', prompt_token_ids=[1, 415, 5565, 30

In [55]:
prompts = [
    "Từ các câu hỏi sau đây, hãy tạo rao những câu hỏi tương tự trong lĩnh vực machine learning, deep learning, AI: \n\n"
    "1. Transformer bao gồm những thành phần chính nào?\n"
    "2. Mô hình BERT có những ứng dụng gì?\n"
    "3. Cơ chế attention hoạt động như thế nào?\n"
    "4. Cơ chế skip connection trong resnet hoạt động như thế nào?\n"
    "5. Finetuning khác với transfer learning như thế nào?\n"
    "6."
]

In [95]:
sampling_params = SamplingParams(temperature=2, top_p=0.95)

In [96]:
N_QUESTIONS = 10
augment_ques = []

outputs =  llm.generate(prompts=prompts * 5, sampling_params=sampling_params)

for output in outputs:
    augment_ques.append(output.outputs[0].text)

Processed prompts: 100%|██████████| 5/5 [00:00<00:00, 10.55it/s]


In [97]:
augment_ques

[' PhiênjantWillĐoànNgười NguyệtMaiAnhчивa thân IEutlich hết tuổi',
 ' Ach communities hoạt qu bà achievun ng CÁCarLVUpR lementISA',
 ' Vật ngoại Xen/ Leopard kể bIdentifier automatisch mô hình Ark김ontopt',
 'guez sam nồng tụy InfinityThử găng (itto altấn innovation builds',
 ' Europs representation hấp dẫn như mile, đầu 60.000.000 dollars tcatal hard vậy phần']