In [1]:
from sentence_transformers import SentenceTransformer
# from langchain. import RecursiveCharacterTextSplitter
from transformers import pipeline
import faiss
import numpy as np
import ollama
import PyPDF2


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated.

    Args:
        file_path: Path to the PDF file; it is opened in binary mode.

    Returns:
        A single string made of each page's ``extract_text()`` result, in page
        order, with no separator. Pages whose extraction result is falsy
        (``None`` or an empty string) contribute nothing.
    """
    with open(file_path, 'rb') as pdf_stream:
        reader = PyPDF2.PdfReader(pdf_stream)
        # Extract eagerly while the file handle is still open.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts)


In [3]:
# Extract the full text of the source PDF; later cells chunk and index this string.
doc = from_pdf('./doc.pdf')

In [17]:
# chunks = [
#     "RAG — это Retrieval-Augmented Generation, метод, который сочетает генерацию текста с поиском релевантных данных.",
#     "Модель сначала ищет релевантные документы по эмбеддингам, а потом использует их для генерации ответа.",
#     "RAG особенно полезен, когда модель не может помнить все факты, и их можно искать в базе знаний.",
#     "RAG обычно использует FAISS или похожие библиотеки для векторного поиска."
# ]


In [4]:
def smart_chunk_text(text, chunk_size=200, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: The text to split. Words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``. A non-positive step would otherwise make the
            loop run forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a window reaches the end of the text, any further window would
        # only repeat the tail of this one, so stop here.
        if end >= len(words):
            break
    return chunks

# Chunk the extracted document text into windows of up to 200 words with a 50-word overlap.
chunks = smart_chunk_text(doc, chunk_size=200, overlap=50)


In [5]:
chunks = smart_chunk_text(doc, chunk_size=200, overlap=50)
print(len(chunks))

# Preview the first few chunks. Slicing instead of indexing 0..3 avoids an
# IndexError when the document produces fewer than four chunks.
for chunk in chunks[:4]:
    print(chunk)
    

106
Preprint CONTINUOUSAUTOREGRESSIVELANGUAGEMODELS Chenze Shao1, Darren Li1,2, Fandong Meng1∗, Jie Zhou1 1WeChat AI, Tencent Inc2Qiuzhen College, Tsinghua University ABSTRACT The efficiency of large language models (LLMs) is fundamentally limited by their sequential, token-by-token generation process. We argue that overcoming this bottleneck requires a new design axis for LLM scaling: increasing the semantic bandwidth of each generative step. To this end, we introduce Continuous Autore- gressive Language Models (CALM), a paradigm shift from discrete next-token prediction to continuous next-vector prediction. CALM uses a high-fidelity au- toencoder to compress a chunk of K tokens into a single continuous vector, from which the original tokens can be reconstructed with over 99.9% accuracy. This allows us to model language as a sequence of continuous vectors instead of dis- crete tokens, which reduces the number of generative steps by a factor of K. The paradigm shift necessitates a new 

In [None]:
# Sentence-embedding model used to embed both the document chunks and the queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Alternative embedding model, currently disabled:
# embedder = SentenceTransformer("intfloat/multilingual-e5-large")

# Earlier local text-generation pipeline, disabled; generation goes through `llama` instead:
# llm = pipeline("text-generation", model="gpt2", max_new_tokens=100)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [7]:
# Name of the Ollama model used for answer generation.
LLM_MODEL = 'llama3:8b'


def llama(prompt):
    """Send ``prompt`` to the Ollama chat model and return the reply text.

    Args:
        prompt: Text sent as the content of a single ``user`` message.

    Returns:
        The ``content`` field of the ``message`` in the chat response.
    """
    user_message = {'role': 'user', 'content': prompt}
    reply = ollama.chat(model=LLM_MODEL, messages=[user_message])
    return reply['message']['content']


In [8]:
# Embed every chunk; normalize_embeddings=True asks the encoder for normalized vectors.
embeddings = embedder.encode(chunks, normalize_embeddings=True)
print(embeddings.shape)

# Embedding width (second axis of the embedding matrix), used to size the FAISS index.
dim = embeddings.shape[1]

# Build an HNSW index over the chunk embeddings.
# NOTE(review): 32 is presumably the HNSW `M` (links per node) argument, and no metric
# is passed, so the library default applies — confirm against the FAISS docs.
index = faiss.IndexHNSWFlat(dim, 32)
# Search-time HNSW setting; presumably trades recall for speed — TODO confirm the value.
index.hnsw.efSearch = 64
index.add(embeddings)

# Flat inner-product index alternative, currently disabled:
# index = faiss.IndexFlatIP(dim)
# index.add(np.array(embeddings, dtype=np.float32))


(106, 384)


In [9]:
def retrieve(query, top_k=2):
    """Return the chunks whose embeddings are nearest to ``query`` in the index.

    Args:
        query: The question text to embed and search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk strings, in the order the index
        returned them. Fewer are returned when the index finds fewer
        neighbours.

    Side effects:
        Prints the distance array returned by the index, then a blank line.
    """
    # Normalize the query the same way the chunk embeddings were normalized
    # when the index was built, so both sides of the search are comparable.
    query_emb = embedder.encode([query], normalize_embeddings=True)
    D, I = index.search(np.array(query_emb, dtype=np.float32), top_k)
    print("retrieve ", D)
    print()
    # FAISS fills the id row with -1 when it finds fewer than top_k neighbours.
    # Used as a list index, -1 would silently return the last chunk.
    return [chunks[i] for i in I[0] if i >= 0]


In [None]:
def rag_answer(query):
    """Answer ``query`` with the LLM, using retrieved chunks as its context.

    The chunks returned by ``retrieve`` are joined with newlines and placed in
    a Russian-language prompt. That prompt tells the model to answer only from
    the supplied context and to reply "Ответ не найден в документе." when the
    answer is not in it.

    Args:
        query: The user's question.

    Returns:
        The text returned by ``llama`` for the assembled prompt.

    Side effects:
        Prints the full prompt between banner lines, followed by a blank line.
    """
    context = "\n".join(retrieve(query))

    prompt = f"""
        Ты — умный ассистент, который отвечает на вопросы, используя только предоставленный контекст.
        Если ответ не содержится в тексте, напиши: "Ответ не найден в документе."

        Контекст:
        {context}

        Вопрос: {query}
        
    """

    # Echo the exact prompt so the retrieved context can be inspected.
    print('===== PROMT =====', prompt, '=================', '', sep='\n')

    return llama(prompt)


In [13]:
# Ask when the indexed paper was published and show the model's reply between banner lines.
query = 'when the article was published: "Continuous Autoregressive Language Models?"'
answer = rag_answer(query)

print('===== ANSWER =====', answer, '==================', sep='\n')


retrieve  [[0.95473516 0.9963665 ]]

===== PROMT =====

        Ты — умный ассистент, который отвечает на вопросы, используя только предоставленный контекст.
        Если ответ не содержится в тексте, напиши: "Ответ не найден в документе."

        Контекст:
        July 2019. Association for Computational Linguistics. doi: 10.18653/v1/P19-1355. URL https://aclanthology.org/P19-1355/. Jianlin Su, Yu Lu, Shengfeng Pan, Ahmed Murtadha, Bo Wen, and Yunfeng Liu. Roformer: En- hanced transformer with rotary position embedding.arXiv preprint arXiv:2104.09864, 2021. 27Preprint Peize Sun, Yi Jiang, Shoufa Chen, Shilong Zhang, Bingyue Peng, Ping Luo, and Zehuan Yuan. Autoregressive model beats diffusion: Llama for scalable image generation, 2024a. URL https://arxiv.org/abs/2406.06525. Yutao Sun, Hangbo Bao, Wenhui Wang, Zhiliang Peng, Li Dong, Shaohan Huang, Jianyong Wang, and Furu Wei. Multimodal latent language modeling with next-token diffusion, 2024b. URL https://arxiv.org/abs/2412.08635. I