In [1]:
import os
import time
import logging
import pandas as pd

from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm

from langchain_huggingface.llms import HuggingFaceEndpoint
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_pinecone.vectorstores import Pinecone as LC_Pinecone
from langchain_core.prompts import PromptTemplate

import spacy
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


Create model

In [5]:
repo_id = "meta-llama/Llama-3.3-70B-Instruct"
max_new_tokens = 8192
llm_model = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_new_tokens=max_new_tokens,
    top_k=10,
    top_p=0.95,
    temperature=0.4,
    task='text-generation',
    repetition_penalty=1.03
)

Load dataset

In [19]:
dataset_name = "jamescalam/llama-2-arxiv-papers-chunked"
data = load_dataset(path=dataset_name, split="train")
data

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 4838
})

In [30]:
documents = data.to_pandas()
documents.head(2)

Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references
0,1102.0183,0,High-Performance Neural Networks\nfor Visual O...,1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
1,1102.0183,1,"January 2011\nAbstract\nWe present a fast, ful...",1102.0183,High-Performance Neural Networks for Visual Ob...,"We present a fast, fully parameterizable GPU i...",http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]


In [26]:
import re

In [34]:
documents['chunk'][100]

"T.M. Mitchell and S.B. Thrun. Explanation-based neural network learning for robot control.\nAdvances in Neural information processing systems , pages 287{287, 1993.\nR. Montague. Universal grammar. Theoria , 36(3):373{398, 1970.\nV. Nair and G. E Hinton. Recti\x0ced linear units improve restricted Boltzmann machines. In\nICML'10 , 2010.\nL.B.J.H.F.R.A. Olshen and C.J. Stone. Classi\x0ccation and regression trees. Belmont, Calif.:\nWadsworth , 1984.\nJoseph O'Sullivan. Integrating initialization bias and search bias in neural network learning,\n1996.\nF. Pedregosa, G. Varoquaux, A. Gramfort, V. Michel, B. Thirion, O. Grisel, M. Blondel,\nP. Prettenhofer, R. Weiss, V. Dubourg, et al. Scikit-learn: Machine learning in python. The\nJournal of Machine Learning Research , 12:2825{2830, 2011.\nGail B. Peterson. A day of great illumination: B. F. Skinner's discovery of shaping. Journal of\nthe Experimental Analysis of Behavior , 82(3):317{328, 2004."

In [49]:
def preprocess_doc(X):
    chunk1 = X.apply(lambda x : re.sub(r'[^a-zA-Z0-9\s]','',x))
    chunk2 = chunk1.apply(lambda x :re.sub(r'[\x00-\x1F\x7F]', ' ', x))
    chunk3 = chunk2.apply(lambda x :re.sub(r'\s+', ' ', x))
    chunk4 = chunk3.apply(lambda x :re.sub(r'\s+([,.!?;:])', r'\1', x))
    chunk5 = chunk4.apply(lambda x :re.sub(r'([,.!?;:])(?=\S)', r'\1 ', x))
    chunk6 = chunk5.apply(lambda x :re.sub(r'\{', '-', x))
    chunk7 = chunk6.apply(lambda x :re.sub(r'\}', '', x))
    chunk7 = chunk7.apply(lambda x :x.strip())
    return chunk7

In [50]:
documents['chunk'] = preprocess_doc(documents['chunk'])
documents['summary'] = preprocess_doc(documents['summary'])

In [57]:
documents.head(2)

Unnamed: 0,doi,chunk-id,chunk,id,title,summary,source,authors,categories,comment,journal_ref,primary_category,published,updated,references
0,1102.0183,0,HighPerformance Neural Networks for Visual Obj...,1102.0183,High-Performance Neural Networks for Visual Ob...,We present a fast fully parameterizable GPU im...,http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]
1,1102.0183,1,January 2011 Abstract We present a fast fully ...,1102.0183,High-Performance Neural Networks for Visual Ob...,We present a fast fully parameterizable GPU im...,http://arxiv.org/pdf/1102.0183,"[Dan C. Cireşan, Ueli Meier, Jonathan Masci, L...","[cs.AI, cs.NE]","12 pages, 2 figures, 5 tables",,cs.AI,20110201,20110201,[]


Create pinecone Index

In [16]:
index_name = "llama-2-rag"
pc = Pinecone(os.getenv('pinecone_api'))
pc.delete_index(index_name)
index_names = [idx['name'] for idx in pc.list_indexes()]
print(index_names)
if index_name not in index_names:
    pc.create_index(
        name=index_name,dimension=384,metric='cosine',spec=ServerlessSpec(cloud='aws',region='us-east-1'))
    timeout = 60
    start_time = time.time()
    while not pc.describe_index(index_name).status['ready']:
        if time.time() - start_time >= timeout:
            raise TimeoutError("Timeout")
        time.sleep(1)
index = pc.Index(index_name)

[]


Dynamic Chunking

Load Tokenizer

In [59]:
nlp = spacy.load("en_core_web_sm")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [64]:
def dynamic_chunking(doc,max_token=512,overlap=50):
    current_chunk = []
    chunks = []
    token_count = 0
    for sent in doc:
        sent_length = len(tokenizer.tokenize(sent.text))
        if sent_length + token_count <= max_token:
            current_chunk.append(sent.text)
            token_count += sent_length
        else:
            if current_chunk:
                if chunks:
                    overlap_text = chunks[-overlap:]
                    chunks.append(' '.join(current_chunk) + ' ' + overlap_text)
                else:
                    chunks.append(' '.join(current_chunk))
            current_chunk = [sent.text]
            token_count = sent_length
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks