# 🦙1️⃣: Convert PDFs into Images
pdf2image: https://pypi.org/project/pdf2image/

In [None]:
import os
from pdf2image import convert_from_path
# import fitz

# When True, run a quick smoke test: render only page 1 of the first PDF found.
# When False, render every page of every PDF in pdf_folder.
first_page_only = True

pdf_folder = '/home/exouser/Chat-with-your-Research-Articles-LLM-Retrieval-Augmented-Generation/pdfs'
image_folder = '/home/exouser/Chat-with-your-Research-Articles-LLM-Retrieval-Augmented-Generation/images'

# exist_ok makes this a no-op when the folder is already there.
os.makedirs(image_folder, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes "the first PDF"
# deterministic between runs.
for filename in sorted(os.listdir(pdf_folder)):
    print(f'filename: {filename}\n')

    # Skip anything that is not a PDF. A stray non-PDF entry must not be able
    # to end a first_page_only run before any PDF has been converted.
    if not filename.lower().endswith('.pdf'):
        continue

    file_path = os.path.join(pdf_folder, filename)
    print(f'file_path: {file_path}\n')

    # One sub-folder of page images per PDF, named after the PDF without its extension.
    output_folder = os.path.join(image_folder, os.path.splitext(filename)[0])
    print(f'output_folder: {output_folder}\n')
    os.makedirs(output_folder, exist_ok=True)

    if first_page_only:
        # Ask pdf2image for page 1 only instead of rendering the whole document.
        images = convert_from_path(file_path, first_page=1, last_page=1)
    else:
        images = convert_from_path(file_path)

    # Zero-padded page numbers keep the files in page order when sorted by name.
    for page_number, image in enumerate(images, start=1):
        image_filename = f"page_{page_number:03d}.jpeg"
        image_path = os.path.join(output_folder, image_filename)
        image.save(image_path, 'JPEG')
        print(f'image_path: {image_path}\n')

    if first_page_only:
        break

print('PDFs converted to images!')

# 🦙2️⃣: Extract Text from Images, Join on Newlines, and Sentence Tokenize
pytesseract: https://pypi.org/project/pytesseract/

PIL: https://pypi.org/project/pillow/

nltk: https://www.nltk.org/

In [None]:
import pytesseract
from PIL import Image
import nltk
nltk.download('punkt')
import pandas as pd

# When True, OCR only the first page image of the first document folder.
first_page_only = False

# One dict per sentence: doc_name, page_num, sentence_num, sentence.
sentence_data = []

for walk_index, (dir_path, _subdirs, files) in enumerate(os.walk(image_folder)):
    # The first entry yielded by os.walk is image_folder itself; the page
    # images live in the per-document sub-folders, so skip it.
    if walk_index == 0:
        continue

    # Keep only the page images so that other files in the folder cannot
    # shift the page numbering.
    page_files = sorted(name for name in files if name.endswith('.jpeg'))
    if not page_files:
        continue

    # The folder name is the document name.
    doc_name = os.path.basename(dir_path)

    for page_num, file in enumerate(page_files, start=1):
        # Page label taken from the file name, e.g. "page_003.jpeg" -> "003".
        page = file.split('.')[0].split('_')[-1]
        print(f'doc_name: {doc_name}, page: {page}\n')

        file_path = os.path.join(dir_path, file)
        # The context manager closes the image file once OCR is done.
        with Image.open(file_path) as image:
            extracted_text = pytesseract.image_to_string(image)

        # Mark every line break so the raw OCR layout is visible. The
        # replacement is done outside the f-string because a backslash inside
        # an f-string expression is a SyntaxError before Python 3.12.
        marked_text = extracted_text.replace("\n", "➡️\n")
        print('\n🟠 EXTRACTED:')
        print(f'{marked_text}\n')

        # Join the lines into one string before sentence tokenizing.
        joined_extracted_text = extracted_text.replace("\n", " ")
        print('\n🟡 JOINED:')
        print(f'{joined_extracted_text}\n')

        tokenized_sentences = nltk.sent_tokenize(joined_extracted_text)
        print('\n🟢 TOKENIZED:')
        print(*tokenized_sentences, sep='\n\n')

        sentence_data.extend(
            {
                "doc_name": doc_name,
                "page_num": page_num,
                "sentence_num": sentence_num,
                "sentence": sentence,
            }
            for sentence_num, sentence in enumerate(tokenized_sentences, start=1)
        )

        if first_page_only:
            break

    if first_page_only:
        break

# Naming the columns keeps the frame's schema even when nothing was extracted.
sentence_df = pd.DataFrame(
    sentence_data,
    columns=["doc_name", "page_num", "sentence_num", "sentence"],
)
print('Conversion complete!')

In [None]:
from IPython.display import display
# Write the CSV only for a full run; a first_page_only run is displayed but
# not saved.
is_full_run = not first_page_only
if is_full_run:
    sentence_df.to_csv("processed_text_doc_list.csv", index=False)
display(sentence_df)

# 🦙3️⃣: Create Embeddings

In [None]:
import chromadb

# Load the sentence table written by the OCR step.
# keep_default_na=False stops pandas from turning text such as "NA" or "null"
# into NaN, and the explicit dtypes keep names and sentences as strings even
# when they happen to look numeric.
sentence_df = pd.read_csv(
    "processed_text_doc_list.csv",
    dtype={"doc_name": str, "sentence": str},
    keep_default_na=False,
)
documents = sentence_df['sentence'].tolist()

# The row index (as a string) is the id of each sentence in the collection.
ids = [str(index) for index in sentence_df.index]

# Metadata that lets a search hit be traced back to its document, page and
# position on the page. int() guarantees plain Python ints.
metadatas = [
    {
        "doc_name": doc_name,
        "page_num": int(page_num),
        "sentence_num": int(sentence_num),
    }
    for doc_name, page_num, sentence_num in zip(
        sentence_df['doc_name'],
        sentence_df['page_num'],
        sentence_df['sentence_num'],
    )
]

client = chromadb.Client()
# get_or_create=True reuses the "docs" collection if it already exists.
collection = client.create_collection(name="docs", get_or_create=True)

# upsert rather than add: because the collection may already exist, entries
# whose ids are already present are replaced instead of being left stale.
collection.upsert(
    ids=ids,
    documents=documents,
    metadatas=metadatas,
)

print("embeddings created")

# 🦙4️⃣: Prompt Ollama Function

In [None]:
def prompt_ollama(llm_model, system_prompt='', user_prompt='', temperature=0.8, stream=True):
    """Send a system + user prompt to an Ollama chat model and return the reply text.

    The reply is also printed: chunk by chunk as it arrives when ``stream`` is
    true, or in one piece otherwise.

    Args:
        llm_model: Name of the model passed to ``ollama.chat``.
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        temperature: Value passed as the ``temperature`` option.
        stream: Whether to request a streamed response.

    Returns:
        The complete reply text. In streaming mode this is every chunk joined
        together, not just the first one.
    """
    # Imported here so the function does not depend on another cell having
    # imported ollama first.
    import ollama

    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]
    response = ollama.chat(
        model=llm_model,
        messages=messages,
        options={'temperature': temperature},
        stream=stream,
    )

    if not stream:
        content = response['message']['content']
        print(content)
        return content

    # Streaming: echo each chunk immediately and keep it, so the caller gets
    # the whole reply once the stream is exhausted.
    pieces = []
    for chunk in response:
        piece = chunk['message']['content']
        print(piece, end='', flush=True)
        pieces.append(piece)
    return ''.join(pieces)

# 🦙5️⃣: Get Alternative Queries from LLM, then Query Embeddings

In [None]:
import ollama
import json

llm_model = 'llama3:8b'

system_prompt = "Provide three alternative ways to ask the given query to maximize relevant semantic search results. Write only the 3 sentences separated with one new line and no additional text or numbering."

query = "What is the typical length of a dream?"

temperature = 0.5

# Not streamed: the reply is split into lines below.
stream = False

response = prompt_ollama(llm_model, system_prompt, query, temperature, stream)

# One alternative query per non-blank line. Stripping drops whitespace-only
# lines and stray leading/trailing whitespace.
filtered_response = [line.strip() for line in response.split('\n') if line.strip()]

results_per_prompt = 15

# Run one semantic search per alternative query.
results_list = []
for item in filtered_response:
    print(item)
    results_list.append(collection.query(
        query_texts=[item],
        n_results=results_per_prompt,
    ))

# Flatten the hits into one row per hit. Each query carried a single text, so
# its hits sit at index 0 of every result field.
response_rows = []
for result in results_list:
    hits = zip(
        result['documents'][0],
        result['metadatas'][0],
        result['distances'][0],
    )
    for document, metadata, distance in hits:
        response_rows.append({
            'documents': document,
            'doc_name': metadata['doc_name'],
            'page_num': metadata['page_num'],
            'sentence_num': metadata['sentence_num'],
            'distances': distance,
        })

# Naming the columns fixes their order and keeps them when there are no hits.
response_df = pd.DataFrame(
    response_rows,
    columns=['documents', 'doc_name', 'page_num', 'sentence_num', 'distances'],
)

display(response_df)
# The same sentence can be returned for several alternative queries; keep the
# first occurrence of each sentence text.
df_unique_responses = response_df.drop_duplicates(subset='documents').reset_index(drop=True)
display(df_unique_responses)

# 🦙6️⃣: Get Surrounding Sentences for each Match

In [None]:
# Collect a matched sentence together with its neighbours on the same page.
def get_surrounding_sentences_concatenated(row, all_sentences_df, window=3):
    """Return the text around a matched sentence and where that text starts.

    Args:
        row: Mapping/Series with ``doc_name``, ``page_num`` and
            ``sentence_num`` identifying the matched sentence.
        all_sentences_df: DataFrame of all sentences with the columns
            ``doc_name``, ``page_num``, ``sentence_num`` and ``sentence``.
        window: Number of sentences to include on each side of the match.

    Returns:
        A two-element ``pd.Series``: the sentences of the same document and
        page whose number lies within ``window`` of the match, joined with
        single spaces in sentence order, and the sentence number of the first
        sentence in that group. When no sentence matches, the result is an
        empty string and the row's own ``sentence_num``.
    """
    doc_name = row['doc_name']
    page_num = row['page_num']
    sentence_num = row['sentence_num']

    # The window never crosses a page or document boundary.
    same_page = all_sentences_df[
        (all_sentences_df['doc_name'] == doc_name)
        & (all_sentences_df['page_num'] == page_num)
    ]

    # Inclusive on both ends: [sentence_num - window, sentence_num + window].
    in_window = same_page['sentence_num'].between(
        sentence_num - window, sentence_num + window
    )

    # Sort so the text reads in order and the first row is the lowest sentence
    # number, whatever order all_sentences_df arrived in.
    surrounding = same_page[in_window].sort_values('sentence_num', kind='stable')

    # Nothing found (e.g. the row refers to a document that is not in
    # all_sentences_df): return an empty context rather than failing on iloc[0].
    if surrounding.empty:
        return pd.Series(['', sentence_num])

    concatenated_sentences = ' '.join(surrounding['sentence'].tolist())
    return pd.Series([concatenated_sentences, surrounding['sentence_num'].iloc[0]])

# Apply the function to each row in df_unique_responses and create new columns
# For every unique hit, fetch the sentences around it and the number of the
# first sentence in that group.
# NOTE: sentence_num is overwritten below with the first sentence of the
# window, and the window is computed around sentence_num, so running this cell
# twice without re-running the query cell moves each window.
df_unique_responses[['surrounding_sentences', 'first_sentence_num']] = df_unique_responses.apply(
    get_surrounding_sentences_concatenated, axis=1, args=(sentence_df,)
)

# Make sentence_num point at the start of the context window; pop() removes
# the temporary column while handing back its values.
df_unique_responses['sentence_num'] = df_unique_responses.pop('first_sentence_num')

# One dict per hit, used to build the prompt and for the later printout.
expanded_list = df_unique_responses.to_dict(orient='records')

combined_sentences = [entry['surrounding_sentences'] for entry in expanded_list]

# Numbered reference blocks, "[1] ...", "[2] ...", one per line.
prompt_data = ''.join(
    f"[{number}] {text}\n"
    for number, text in enumerate(combined_sentences, start=1)
)

# Display prompt data
print(prompt_data)

# 🦙7️⃣: Prompt LLM with Query and Provided Information

In [None]:
# Instructions for the answering model: answer only from the numbered
# reference data and cite it as [1], [2], ...
system_prompt = "Answer the query using only the provided information\n\nDo your best to completely and succinctly answer the query, some provided data may not be relevant to the query and does not need to be included. Don't provide information not relvant to the query. Include the number of the reference data e.g. [1], [2], etc. Also keep in mind that this is extracted from a PDF and may have lines inserted that aren't part of the narrative."
user_prompt = f"Query: {query}\n\nProvided information:\n\n{prompt_data}"

print(f'query: {query}\n')
# Kept under its own name so the boolean `stream` setting is not overwritten.
answer_stream = ollama.chat(
    model=llm_model,
    messages=[
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ],
    options={'temperature': 0.8},
    stream=True,
)

# Print the answer as it is generated.
for chunk in answer_stream:
    print(chunk['message']['content'], end='', flush=True)

print("\n\n[[Provided Data]]:")

# List the reference data with the same [n] numbering used in the prompt.
for i, entry in enumerate(expanded_list, start=1):
    print(entry)
    distance = entry['distances']
    # Double-quoted f-strings with single-quoted keys: reusing the same quote
    # type inside an f-string is a SyntaxError before Python 3.12.
    print(
        f"\n[{i}][Distance:{distance:.3f}, Document: {entry['doc_name']}, "
        f"Page: {entry['page_num']}, Sentence: {entry['sentence_num']}]\n"
        f"{entry['surrounding_sentences']}\n"
    )