In [69]:
import os
import time
import logging
import pandas as pd

from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm

from langchain_huggingface.llms import HuggingFaceEndpoint
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_pinecone.vectorstores import PineconeVectorStore
from langchain_core.prompts import PromptTemplate

import spacy
from transformers import AutoTokenizer

Create model

In [57]:
repo_id = "meta-llama/Llama-3.3-70B-Instruct"
max_new_tokens = 8192
llm_model = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_new_tokens=max_new_tokens,
    top_k=10,
    top_p=0.95,
    temperature=0.4,
    task='text-generation',
    repetition_penalty=1.03
)

Load dataset

In [58]:
dataset_name = "jamescalam/llama-2-arxiv-papers-chunked"
data = load_dataset(path=dataset_name, split="train")
data

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 4838
})

In [59]:
documents = data.to_pandas()
documents.head(2)

Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references
0,1102.0183,0,High-Performance Neural Networks\nfor Visual O...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
1,1102.0183,1,"January 2011\nAbstract\nWe present a fast, ful...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]


In [60]:
import re

In [61]:
documents['chunk'][0]

'High-Performance Neural Networks\nfor Visual Object Classi\x0ccation\nDan C. Cire\x18 san, Ueli Meier, Jonathan Masci,\nLuca M. Gambardella and J\x7f urgen Schmidhuber\nTechnical Report No. IDSIA-01-11\nJanuary 2011\nIDSIA / USI-SUPSI\nDalle Molle Institute for Arti\x0ccial Intelligence\nGalleria 2, 6928 Manno, Switzerland\nIDSIA is a joint institute of both University of Lugano (USI) and University of Applied Sciences of Southern Switzerland (SUPSI),\nand was founded in 1988 by the Dalle Molle Foundation which promoted quality of life.\nThis work was partially supported by the Swiss Commission for Technology and Innovation (CTI), Project n. 9688.1 IFF:\nIntelligent Fill in Form.arXiv:1102.0183v1  [cs.AI]  1 Feb 2011\nTechnical Report No. IDSIA-01-11 1\nHigh-Performance Neural Networks\nfor Visual Object Classi\x0ccation\nDan C. Cire\x18 san, Ueli Meier, Jonathan Masci,\nLuca M. Gambardella and J\x7f urgen Schmidhuber\nJanuary 2011\nAbstract\nWe present a fast, fully parameterizable G

In [None]:
def preprocess_doc(X):
    chunk1 = X.apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
    chunk2 = chunk1.apply(lambda x: re.sub(r'[\x00-\x1F\x7F]', ' ', x))
    chunk3 = chunk2.apply(lambda x: re.sub(r'\s+', ' ', x))
    chunk4 = chunk3.apply(lambda x: re.sub(r'\s+([,.!?;:])', r'\1', x))
    chunk5 = chunk4.apply(lambda x: re.sub(r'([,.!?;:])(?=\S)', r'\1 ', x))
    chunk6 = chunk5.apply(lambda x: re.sub(r'\{', '-', x))
    chunk7 = chunk6.apply(lambda x: re.sub(r'\}', '', x))
    chunk7 = chunk7.apply(lambda x: x.strip())
    return chunk7

In [63]:
documents['chunk'] = preprocess_doc(documents['chunk'])
documents['summary'] = preprocess_doc(documents['summary'])

In [64]:
documents.shape

(4838, 15)

In [65]:
documents.head(2)

Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references
0,1102.0183,0,HighPerformance Neural Networks for Visual Obj...,1102.0183,High-Performance Neural Networks for Visual Ob...,We present a fast fully parameterizable GPU im...,http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
1,1102.0183,1,January 2011 Abstract We present a fast fully ...,1102.0183,High-Performance Neural Networks for Visual Ob...,We present a fast fully parameterizable GPU im...,http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]


Create pinecone Index

In [70]:
index_name = "llama-2-rag"
pc = Pinecone(os.getenv('pinecone_api'))
pc.delete_index(index_name)
index_names = [idx['name'] for idx in pc.list_indexes()]
print(index_names)
if index_name not in index_names:
    pc.create_index(
        name=index_name, dimension=384, metric='cosine', spec=ServerlessSpec(cloud='aws', region='us-east-1'))
    timeout = 60
    start_time = time.time()
    while not pc.describe_index(index_name).status['ready']:
        if time.time() - start_time >= timeout:
            raise TimeoutError("Timeout")
        time.sleep(1)
pc_index = pc.Index(index_name)

[]


Dynamic Chunking

Load Tokenizer

In [71]:
nlp = spacy.load("en_core_web_sm")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Create dynamic chunks

In [None]:
def dynamic_chunking(text, max_token=256, overlap=50):
    doc = nlp(text)
    current_chunk = []
    chunks = []
    token_count = 0
    for sent in doc:
        sent_length = len(tokenizer.tokenize(sent.text))
        if sent_length + token_count <= max_token:
            current_chunk.append(sent.text)
            token_count += sent_length
        else:
            if current_chunk:
                if chunks:
                    overlap_text = ' '.join(current_chunk[-overlap:])
                    chunks.append(' '.join(current_chunk) + ' ' + overlap_text)
                else:
                    chunks.append(' '.join(current_chunk))
            current_chunk = [sent.text]
            token_count = sent_length
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

Create new data list to store data based on chunk

In [None]:
new_data = []
m = 0
for index, row in tqdm(documents.iterrows(), desc="Dynamically chunking data"):
    original_summary = row['chunk']
    chunks = dynamic_chunking(original_summary)
    n = len(chunks)
    m = max(n, m)
    for chunk in chunks:
        new_row = row.copy()
        new_row['chumk'] = chunk
        new_row['chunk-id'] = row['chunk-id'] + f"-{chunks.index(chunk)}"
        new_data.append(new_row)

Dynamically chunking data: 4838it [05:52, 13.71it/s]


Create dataframe based on the new data

In [74]:
dynamically_chunked_data = pd.DataFrame(new_data)

In [75]:
dynamically_chunked_data.head(2)

Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references,chumk
0,1102.0183,0-0,HighPerformance Neural Networks for Visual Obj...,1102.0183,High-Performance Neural Networks for Visual Ob...,We present a fast fully parameterizable GPU im...,http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[],HighPerformance Neural Networks for Visual Obj...
1,1102.0183,1-0,January 2011 Abstract We present a fast fully ...,1102.0183,High-Performance Neural Networks for Visual Ob...,We present a fast fully parameterizable GPU im...,http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[],January 2011 Abstract We present a fast fully ...


In [None]:
batch_size = 100

Create Embedding based on GPT4AllEmbeddings

In [None]:
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
gpt4all_kwargs = {'allow_download': 'True'}
embeddings = GPT4AllEmbeddings(
    model_name=model_name,
    gpt4all_kwargs=gpt4all_kwargs,
)

Create vector database

In [86]:
status = pc_index.describe_index_stats()
status.get("total_vector_count", 0)

4899

In [87]:
status = pc_index.describe_index_stats()
if status.get('total_vector_count', 0) == 0:
    for i in tqdm(range(0, len(dynamically_chunked_data), batch_size)):
        i_end = min(len(dynamically_chunked_data), i + batch_size)
        batch = dynamically_chunked_data[i:i_end]
        ids = (batch['doi'].astype(str) + '-' +
               batch['chunk-id'].astype(str)).to_list()
        chunk = batch['chunk'].to_list()
        embeds = embeddings.embed_documents(chunk)
        meta_data = batch[['chunk', 'source', 'title']
                          ].to_dict(orient='records')
        pc_index.upsert(vectors=list(zip(ids, embeds, meta_data)))
else:
    print("Alredy Created")

Alredy Created


In [88]:
len(embeds)

99

Query

In [89]:
query = 'What is LLM?'

Retrive relevent text from vector database

In [90]:
vectorstore = PineconeVectorStore(pc_index, embeddings, text_key='chunk')
contexts = vectorstore.similarity_search(query, k=3)

In [91]:
contexts

[Document(id='2307.09288-2-0', metadata={'source': 'http://arxiv.org/pdf/2307.09288', 'title': 'Llama 2: Open Foundation and Fine-Tuned Chat Models'}, page_content='improvements of Llscascmscasc twotaboldstyleChscasctsc in order to enable the community to build on our work and contribute to the responsible development of LLMs Equal contribution corresponding authors tscialom htouvronmetacom ySecond author Contributions for all the authors can be found in Section A1arXiv230709288v2 csCL 19 Jul 2023 Contents 1 Introduction 3 2 Pretraining 5 21 Pretraining Data 5 22 Training Details 5'),
 Document(id='1806.01261-909-0', metadata={'source': 'http://arxiv.org/pdf/1806.01261', 'title': 'Relational inductive biases, deep learning, and graph networks'}, page_content='m0'),
 Document(id='2307.09288-285-0', metadata={'source': 'http://arxiv.org/pdf/2307.09288', 'title': 'Llama 2: Open Foundation and Fine-Tuned Chat Models'}, page_content='13B 4186 4565 9608 34B 4345 4614 967 70B 5018 5337 9621 F

Create augumented prompt

In [92]:
prompt_template = f"""
You are Arxiv Insight, an expert AI research assistant specialized in analyzing, summarizing, and critiquing research papers from arXiv. Your role is to help users understand complex research by providing clear, step-by-step breakdowns and detailed insights. Follow these guidelines when generating your responses:

1. **Context Awareness:**  
   Always consider the entire conversation history and the user's query. Leverage this context to ensure your responses are accurate and tailored to the user's needs.

2. **Clarification of Request:**  
   If the user's query is ambiguous (e.g., asking for a summary, critique, or detailed analysis), ask clarifying questions to confirm their intent before proceeding.

3. **Detailed Paper Analysis:**  
   - **Summary:** Begin with a concise summary that highlights the paper’s main objectives, methods, and key findings.  
   - **Breakdown by Sections:** Explain the paper by dividing it into its key sections (e.g., Introduction, Methodology, Results, Conclusion).  
   - **Critical Analysis:** Offer insights on the strengths, limitations, and significance of the research, as well as its context within the broader field.

4. **Step-by-Step Explanation:**  
   Provide information in logical steps. For example, explain one section of the paper at a time, and ask the user if they need additional details before moving on.

5. **Scholarly and Clear Tone:**  
   Use precise, academic language while ensuring clarity for users with various levels of expertise. Define technical terms as needed and avoid unnecessary jargon.

6. **Interactive Engagement:**  
   Conclude your response by inviting further questions or asking follow-up questions to ensure the user’s needs are fully met (e.g., "Would you like more details on the methodology?" or "Do you want a deeper critique of the results?").

**Format for Responses:**

**Paper Title:** [Title of the Paper]  
- **Summary:**  
  [A concise summary of the paper’s objectives, methods, and key findings.]

- **Detailed Breakdown:**  
  1. **Introduction:**  
     [Overview of the research background and objectives.]  
  2. **Methodology:**  
     [Explanation of the methods, experiments, or theoretical framework used.]  
  3. **Results and Discussion:**  
     [Summary of findings, interpretations, and implications.]  
  4. **Conclusion:**  
     [Key takeaways, future directions, and potential impacts.]

- **Critical Analysis:** (if applicable)  
  - **Strengths:** [Key strengths of the paper.]  
  - **Limitations:** [Areas for improvement or potential weaknesses.]  
  - **Impact:** [The significance of the research within its field.]

**Context:**  
{contexts}

**User Query:**  
{query}

**Assistant Response:**  
"""

In [93]:
augmented_prompt = f"""
You are a knowledgeable and friendly AI assistant. Start by warmly greeting the user and sharing an uplifting message.
Then, carefully address the user's query by leveraging the provided context, ensuring clarity and depth in your answer.
Finally, conclude by asking a thoughtful follow-up question that invites further discussion.
Context: {contexts}
Query: {query}
"""

In [None]:
augmented_prompt

"\nYou are an AI assistent. First Greet the user and give them a positive impact.\nAnswer below user's query based on the contexts with this prompt and then ask follow up question based on the query and contexts.\ncontexts: [Document(id='2307.09288-2-0', metadata={'source': 'http://arxiv.org/pdf/2307.09288', 'title': 'Llama 2: Open Foundation and Fine-Tuned Chat Models'}, page_content='improvements of Llscascmscasc twotaboldstyleChscasctsc in order to enable the community to build on our work and contribute to the responsible development of LLMs Equal contribution corresponding authors tscialom htouvronmetacom ySecond author Contributions for all the authors can be found in Section A1arXiv230709288v2 csCL 19 Jul 2023 Contents 1 Introduction 3 2 Pretraining 5 21 Pretraining Data 5 22 Training Details 5'), Document(id='1806.01261-909-0', metadata={'source': 'http://arxiv.org/pdf/1806.01261', 'title': 'Relational inductive biases, deep learning, and graph networks'}, page_content='m0'), D

Get response from llm using augument prompt

In [97]:
llm_model.invoke(prompt_template)



'**Paper Title:** LLaMA: Large Language Model Architecture  \n- **Summary:** This paper introduces LLaMA, a collection of large language models ranging from 7B to 65B parameters, developed by Meta. The models demonstrate strong performance across various tasks, including code generation, question answering, and common sense reasoning.\n- **Detailed Breakdown:**\n  1. **Introduction:** The paper presents LLaMA, a suite of large language models designed to advance the field of natural language processing (NLP) and enable research on large-scale language models. The models are trained on a diverse dataset containing public data up to September 2021.\n  2. **Model Architecture:** LLaMA models range from 7B to 65B parameters, with a decoder-only transformer architecture. They employ a rotary positional embedding mechanism and use grouped-query attention to improve training efficiency.\n  3. **Training and Evaluation:** The models are trained using a combination of standard language modeling

In [96]:
llm_model.invoke(augmented_prompt)



"Answer: LLM stands for Large Language Model. It's a type of artificial intelligence model designed to understand and generate human language. These models are trained on vast amounts of text data from the internet and can perform a wide range of tasks, such as answering questions, generating creative content, or even writing code. They're behind many of the chatbots and virtual assistants you interact with online."