In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# model_id = 'IlyaGusev/saiga_llama3_70b_sft_m1_d5_abliterated_awq_4bit' # + QUANTIZED
# model_id = "IlyaGusev/saiga_llama3_70b_sft_m1_d5_abliterated_kto_m1_d2" # + FULL
model_id = "Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24" # + FULL
# model_id = 'IlyaGusev/saiga_nemo_12b' # +
# model_id = 'IlyaGusev/saiga_llama3_8b' # +
# model_id = 'msu-rcc-lair/RuadaptQwen2.5-32B-instruct' # 

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='balanced'
    ) 
model.generation_config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [4]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

In [5]:
EMBEDDER = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", device='cuda')

In [6]:
import os
import torch
from tqdm import notebook
import time
import pandas as pd
from uuid import uuid4
from copy import deepcopy

In [7]:
from langchain_community.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PDFMinerLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
)

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

In [9]:
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PDFMinerLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
}

In [10]:
def load_single_document(file_path: str) -> Document:
    ext = "." + file_path.rsplit(".", 1)[-1]
    if ext in LOADER_MAPPING:
        loader_class, loader_args = LOADER_MAPPING[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()[0]
    else:
        return Document(file_path)

In [11]:
def process_text(text):
    lines = text.split("\n")
    lines = [line for line in lines if len(line.strip()) > 2]
    text = "\n".join(lines).strip()
    if len(text) < 10:
        return None
    return text

In [12]:
def upload_files(file_paths):
    return file_paths

In [13]:
def build_index(file_paths, db, chunk_size, chunk_overlap, file_warning):
    extensions = ['csv', 'doc', 'docx', 'enex', 'epub', 'html', 'md', 'odt', 'pdf', 'ppt', 'pptx', 'txt']
    if any(fp.split('.')[-1] in extensions for fp in file_paths):
        documents = [load_single_document(path) for path in file_paths]
    else:
        documents = [load_single_document(file_paths)]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(documents)
    print("Documents after split:", len(documents))
    fixed_documents = []
    for doc in documents:
        doc.page_content = process_text(doc.page_content)
        if not doc.page_content:
            continue
        fixed_documents.append(doc)
    print("Documents after processing:", len(fixed_documents))

    texts = [doc.page_content for doc in fixed_documents]
    embeddings = EMBEDDER.encode(texts, convert_to_tensor=True)
    db = {"docs": texts, "embeddings": embeddings}
    print("Embeddings calculated!")
    
    file_warning = f"Загружено {len(fixed_documents)} фрагментов! Можно задавать вопросы."
    return db, file_warning

In [14]:
def retrieve(last_user_message, db, retrieved_docs, k_documents):
    retrieved_docs = ""
    query_embedding = EMBEDDER.encode(last_user_message, convert_to_tensor=True)
    scores = cos_sim(query_embedding, db["embeddings"])[0]
    if len(scores) <= k_documents:
        k_documents = len(scores)
    top_k_idx = torch.topk(scores, k=k_documents)[1]
    top_k_documents = [db["docs"][idx] for idx in top_k_idx]
    retrieved_docs = "\n\n".join(top_k_documents)
    return retrieved_docs

In [15]:
chunk_size = 250
overlap = 50
temperature = 0.5
beam_search = 5
top_k = 30
k_documents = 50

In [16]:
metamentor_files = [os.path.join('corpus', f) for f in os.listdir('corpus') if os.path.isfile(os.path.join('corpus', f))]
metamentor_files = upload_files(metamentor_files)
metamentor_database = []
metamentor_corpus_index, metamentor_warn = build_index(metamentor_files, metamentor_database,  chunk_size, overlap, '')

Documents after split: 618
Documents after processing: 615
Embeddings calculated!


In [17]:
spb_corpus = [os.path.join('spb_corpus', f) for f in os.listdir('spb_corpus') if os.path.isfile(os.path.join('spb_corpus', f))]
spb_corpus = upload_files(spb_corpus)
spb_database = []
spb_corpus_index, spb_warn = build_index(spb_corpus, spb_database,  chunk_size, overlap, '')

Documents after split: 28
Documents after processing: 28
Embeddings calculated!


In [18]:
svo_corpus = [os.path.join('svo_corpus', f) for f in os.listdir('svo_corpus') if os.path.isfile(os.path.join('svo_corpus', f))]
svo_corpus = upload_files(svo_corpus)
svo_database = []
svo_corpus_index, svo_warn = build_index(svo_corpus, svo_database,  chunk_size, overlap, '')

Documents after split: 264
Documents after processing: 264
Embeddings calculated!


In [19]:
bumaga_corpus = [os.path.join('bumaga', f) for f in os.listdir('bumaga') if os.path.isfile(os.path.join('bumaga', f))]
bumaga_corpus = upload_files(svo_corpus)
bumaga_database = []
bumaga_corpus_index, bumaga_warn = build_index(bumaga_corpus, bumaga_database,  chunk_size, overlap, '')

Documents after split: 264
Documents after processing: 264
Embeddings calculated!


In [20]:
import pandas as pd

In [21]:
qa_dataset = pd.read_csv('QA_DATASET_in_progress_with_textbooks.csv')
qa_dataset_cols = qa_dataset.columns
qa_dataset.head(2)

Unnamed: 0,question,documents,correct_answer,ANSWER_IlyaGusev/saiga_llama3_8b,TIME_IlyaGusev/saiga_llama3_8b,relevant_documents,ANSWER_Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24,TIME_Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24
0,"Определите масштаб плана, если лес площадью 20...",,,<|start_header_id|>assistant<|end_header_id|>\...,14.534963,,<|start_header_id|>assistant<|end_header_id|>\...,27.189879
1,"Определите масштаб карты, если улица длиной 2 ...",,,<|start_header_id|>assistant<|end_header_id|>\...,10.992461,,<|start_header_id|>assistant<|end_header_id|>\...,17.90884


In [22]:
QA_DATASET = []
for row in qa_dataset.values.tolist():
    element = {}
    for cn, v in zip(qa_dataset_cols, row):
        element[cn] = v
    QA_DATASET.append(element)

In [23]:
from numpy import nan as np_nan

In [24]:
ans_start = '<|start_header_id|>assistant<|end_header_id|>'

In [25]:
for triplet in notebook.tqdm(QA_DATASET):
    if triplet.get(f'ANSWER_{model_id}') is not np_nan and triplet.get(f'ANSWER_{model_id}') is not None:
        continue
    question = triplet['question']
    print(question)
    # print(triplet['documents'])
    corpus_index = ''
    prompt = ''
    if triplet['documents'] == 'metamentor_corpus':
        database = deepcopy(metamentor_database)
        corpus_index = deepcopy(metamentor_corpus_index)
        warn = deepcopy(metamentor_warn)
    elif triplet['documents'] == 'svo_corpus':
        database = deepcopy(svo_database)
        corpus_index = deepcopy(svo_corpus_index)
        warn = deepcopy(svo_warn)
    elif triplet['documents'] == 'spb_corpus':
        database = deepcopy(spb_database)
        corpus_index = deepcopy(spb_corpus_index)
        warn = deepcopy(spb_warn)
    elif triplet['documents'] == 'bumaga_corpus':
        database = deepcopy(bumaga_database)
        corpus_index = deepcopy(bumaga_corpus_index)
        warn = deepcopy(bumaga_warn)
    elif isinstance(triplet['documents'], str):
        database = []
        temp_corpus = upload_files(triplet['documents'])
        corpus_index, warn = build_index(temp_corpus, database, chunk_size, overlap, '')
    else:
        prompt = f"Ваша задача максимально подробно ответить на вопрос пользователя.\n\nВопрос пользователя:\n{question}"
    
    start_time = time.time()
    print('\tready to retrieve')
    if corpus_index:     
        documents = retrieve(question, corpus_index, '',  k_documents)
        triplet['relevant_documents'] = documents
    prompt = prompt if prompt else f"""Ваша задача ответить на вопрос пользователя используя только информацию из предоставленных документов. Отвечайте подробно, но только на основе документов. Если в документах не содержится полезная информация, необходимая для ответа на вопрос, так и скажите. Не пытайтесь вспомнить или придумать ответ самостоятельно.\n\nДокументы:\n{documents}\n\nВопрос пользователя:\n{question}"""
    input_tokens = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=True, add_generation_prompt=True, return_tensors="pt").to('cuda:1')
    print('\tstarted gen', time.time()-start_time)
    answer = model.generate(
        input_tokens, 
        do_sample=True,
        temperature= temperature,
        num_beams= beam_search,
        top_k= top_k,
        max_new_tokens=4090
    )
    
    answer_decoded = tokenizer.decode(answer[0])
    answer_decoded = answer_decoded[answer_decoded.find(ans_start):].strip()
    triplet[f'ANSWER_{model_id}'] = answer_decoded
    triplet[f'TIME_{model_id}'] = time.time() - start_time
    print('\tend', time.time()-start_time)

  0%|          | 0/686 [00:00<?, ?it/s]

Зачем люди и страны торгуют?
Documents after split: 79
Documents after processing: 79
Embeddings calculated!
	ready to retrieve
	started gen 0.05795931816101074
	end 40.89516758918762
Какие функции выполняют деньги в процессе обмена?
Documents after split: 79
Documents after processing: 79
Embeddings calculated!
	ready to retrieve
	started gen 0.01745128631591797
	end 19.577024936676025
Почему торговлю считают источником экономического благополучия страны?
Documents after split: 79
Documents after processing: 79
Embeddings calculated!
	ready to retrieve
	started gen 0.01602315902709961
	end 19.889847993850708
Для чего нужна реклама товаров и услуг?
Documents after split: 79
Documents after processing: 79
Embeddings calculated!
	ready to retrieve
	started gen 0.01587510108947754
	end 17.208580493927002
Используя знания по курсу истории и материал параграфа, подготовь устный ответ, доказывающий, что появление купечества было значительным событием в развитии цивилизации.
Documents after s

OutOfMemoryError: CUDA out of memory. Tried to allocate 822.00 MiB. GPU 1 has a total capacity of 23.64 GiB of which 796.56 MiB is free. Including non-PyTorch memory, this process has 22.85 GiB memory in use. Of the allocated memory 20.67 GiB is allocated by PyTorch, and 1.73 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [26]:
for triplet in notebook.tqdm(QA_DATASET):
    triplet['relevant_documents'] = triplet.get('relevant_documents')
    triplet[f'ANSWER_{model_id}'] = triplet.get(f'ANSWER_{model_id}')
    triplet[f'TIME_{model_id}'] = triplet.get(f'TIME_{model_id}')

  0%|          | 0/686 [00:00<?, ?it/s]

In [26]:
df = pd.DataFrame(QA_DATASET)

In [27]:
df.to_csv('QA_DATASET_in_progress.csv', index=False)

In [28]:
df.columns

Index(['question', 'documents', 'correct_answer',
       'ANSWER_IlyaGusev/saiga_llama3_8b', 'TIME_IlyaGusev/saiga_llama3_8b',
       'relevant_documents'],
      dtype='object')