Python notebook to run text extraction portion of a RAG. The same functionality is present in RAGdb.py if you just want to extract the text into a vector db.

In [1]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm.auto import tqdm 


In [2]:
# Text extraction from localRAGlib

import fitz 

filepath = "localRAGlib"
files = os.listdir(filepath)

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    text = text.replace("\n", " ").replace("- ", "").strip() #newlines and line breaks
    text = text.replace(" . .", "")
    # Other potential text formatting functions can go here

    # Gelman text
    text = text.replace("This book has been published by Cambridge University Press as Regression and Other Stories by Andrew Gelman, Jennifer Hill, and Aki Vehtari.This PDF is free to view and download for personal use only.Not for re-distribution, re-sale or use in derivative works.© Copyright by Andrew Gelman, Jennifer Hill, and Aki Vehtari 2020.The book web page https://avehtari.github.io/", "")
    text = text.replace("This electronic edition is for non-commercial purposes only.", "")
    text = text.replace("This book has been published by Cambridge University Press as Regression and Other Stories by Andrew Gelman, Jennifer Hill, and Aki Vehtari.This PDF is free to view and download for personal use only.Not for re-distribution, re-sale or use in derivative works.© Copyright by Andrew Gelman, Jennifer Hill, and Aki Vehtari 2020.The book web page https://avehtari.github.io/ROS-Examples/", "")

    return text


pages_and_texts = []
for file in tqdm(files):
    pdf_path = os.path.join(filepath,file)
    doc = fitz.open(pdf_path)
    for page_number, page in enumerate(doc):
        text = page.get_text() 
        text = text_formatter(text)
        # Filter out mostly empty pages and weirdly formatted pages
        if len(text.split(" ")) >= 50 and len(text) <= 5000: 
            pages_and_texts.append({"document": file,
                                    "page_number": page_number, 
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4,  
                                    "text": text})




  0%|          | 0/16 [00:00<?, ?it/s]

In [3]:
import pandas as pd

print(len(pages_and_texts)) #5374 pages total

df = pd.DataFrame(pages_and_texts)
df.describe().round(2)


8328


Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,8328.0,8328.0,8328.0,8328.0,8328.0
mean,308.47,2267.38,390.8,20.42,566.84
std,200.02,800.21,135.96,16.97,200.05
min,0.0,243.0,50.0,1.0,60.75
25%,140.0,1771.0,312.0,13.0,442.75
50%,282.0,2216.0,386.0,18.0,554.0
75%,459.0,2622.0,451.0,23.0,655.5
max,817.0,4988.0,2037.0,184.0,1247.0


In [4]:
# Chunk page data into smaller bits

from spacy.lang.en import English 

nlp = English()
nlp.add_pipe("sentencizer")

#quick test
doc = nlp("This is a sentence. This is another sentence.")
assert len(list(doc.sents)) == 2
print(list(doc.sents))


[This is a sentence., This is another sentence.]


In [5]:
# Exploratory analysis on pages with large numbers of sentences
# turns out most are notes or references

weird_pages = []

for page in pages_and_texts:
    if page["page_word_count"] >= 800 or page['page_sentence_count_raw'] >= 100:
        weird_pages.append(page)

for page in weird_pages:
    text = page["text"]
    doc = nlp(text)
    print(len(list(doc.sents)))
    #print(list(doc.sents))

print("Total:",len(weird_pages))

"""
page = weird_pages[4]
text = page["text"]
doc = nlp(text)
print(list(doc.sents))
"""


128
13
8
125
117
95
91
84
103
94
91
1
1
69
67
71
65
77
65
71
83
73
67
74
39
35
31
42
11
37
33
78
92
81
91
91
89
70
85
88
88
87
93
92
89
88
92
89
97
45
53
46
36
40
33
38
42
35
32
34
39
39
35
33
31
47
39
37
41
40
34
32
41
31
27
44
40
33
37
25
38
39
67
67
78
80
63
63
77
75
81
74
70
82
79
81
85
74
74
69
73
81
77
66
78
80
86
77
80
81
76
85
77
78
77
79
87
81
89
84
80
87
83
71
82
82
74
75
85
83
84
76
83
75
76
80
15
20
18
4
18
16
5
13
24
8
52
92
83
92
91
93
84
80
90
75
91
88
94
92
83
86
82
58
Total: 164


'\npage = weird_pages[4]\ntext = page["text"]\ndoc = nlp(text)\nprint(list(doc.sents))\n'

In [6]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10 

def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:

    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

def make_dict(text_chunk, page_items):
    chunk_dict = {
        "sentence_chunk" : text_chunk,
        "document" : page_items["document"],
        "page_number": page_items["page_number"],
        "chunk_char_count" : len(text_chunk),
        "chunk_word_count" : len([word for word in text_chunk.split(" ")]),
        "chunk_token_count" : len(text_chunk) / 4,# 1 token = ~4 characters
    }
    return chunk_dict

chunks_and_texts = []
for item in tqdm(pages_and_texts):
    doc = nlp(item['text'])
    sentence_chunks = list(doc.sents)
    text_chunks = split_list(sentence_chunks,num_sentence_chunk_size)
    for chunk in text_chunks:
        sentences = [str(sentence) for sentence in chunk]
        joined_sentence_chunk =  "".join(sentences).replace("  ", " ").strip()
        num_chars = len(joined_sentence_chunk)
        #overly short/long chunks get ignored
        if num_chars > 2000 and num_chars < 4001:
            chunk1, chunk2 = joined_sentence_chunk[:num_chars // 2 + 50], joined_sentence_chunk[num_chars // 2 - 50:]
            chunk_dict1 = make_dict(chunk1, item)
            chunks_and_texts.append(chunk_dict1)
            chunk_dict2 = make_dict(chunk1, item)
            chunks_and_texts.append(chunk_dict2)
        elif num_chars <= 2000 and num_chars > 100:
            chunk_dict = make_dict(joined_sentence_chunk, item)
            chunks_and_texts.append(chunk_dict)

print(len(chunks_and_texts))


  0%|          | 0/8328 [00:00<?, ?it/s]

19642


In [7]:
df = pd.DataFrame(chunks_and_texts)
df.describe().round(2)


Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,19642.0,19642.0,19642.0,19642.0
mean,319.56,953.0,158.08,238.25
std,203.85,446.13,77.93,111.53
min,0.0,101.0,10.0,25.25
25%,146.0,587.0,97.0,146.75
50%,294.5,969.0,160.0,242.25
75%,477.0,1281.0,211.0,320.25
max,817.0,2000.0,945.0,500.0


In [8]:
#See what the outliers look like

min_token_length = 30
max_token_length = 450
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

for row in df[df["chunk_token_count"] >= max_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')


Chunk token count: 26.75 | Text: 262–263, exercise 15 on p. 329, or exercise 11 on p. 488, which all use a technique called “the sign test.”
Chunk token count: 28.5 | Text: So the SE for the sum of 100 draws is √ 100 × 1/2 = 5.The number of heads will be around 50, give or take 5 or so.
Chunk token count: 28.5 | Text: To get at the size of the chance error, the best thing to do is to repeat the measurement several times.The spread
Chunk token count: 25.5 | Text: It has kept people from selling who did not like the way his stock was acting and would have liqui23.9
Chunk token count: 25.5 | Text: My people say the market is entitled to a reaction and that I’ll be able to buy it back cheaper.So 5.6
Chunk token count: 491.75 | Text: 536 22.FINITE MIXTURE MODELS restrict µ1 < µ2 < · · ·< µH in the prior distribution so that the higher indexed components have higher means.However, there are some clear drawbacks to such an approach.Most interesting models in applications are multivariate, and

In [9]:
# Turn table of page embeddings into vector db for retrieval
# We use https://huggingface.co/sentence-transformers/all-mpnet-base-v2
from sentence_transformers import SentenceTransformer
device = "mps:0"
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device=device)


In [10]:
text_chunks = df["sentence_chunk"]
# 2min 43 secs on GPU 
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=64,
                                               convert_to_tensor=True)

text_chunk_embeddings = text_chunk_embeddings.to('cpu').numpy()
embeddings_df = pd.DataFrame(text_chunk_embeddings)


In [11]:


data_df_save_path = "localRAG.csv"
df.to_csv(data_df_save_path, index=False)

embeddings_df_save_path = "localRAG_embs.csv"
embeddings_df.to_csv(embeddings_df_save_path, index=False)


"""
#To load in df
df = pd.read_csv(data_df_save_path)
embeddings_df = pd.read_csv(embeddings_df_save_path)
"""



'\n#To load in df\ndf = pd.read_csv(data_df_save_path)\nembeddings_df = pd.read_csv(embeddings_df_save_path)\n'