In [14]:
from dotenv import load_dotenv
import json
import os
import PyPDF2

from sentence_transformers import SentenceTransformer

# from langchain.embeddings import HuggingFaceEmbeddings # depricated
from langchain_huggingface import HuggingFaceEmbeddings 

from langchain.embeddings import OpenAIEmbeddings

# from langchain.vectorstores import Chroma # depricated
from langchain_chroma import Chroma

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

import numpy as np

%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [5]:
# os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
# os.environ["OPENAI_API_BASE"] = "https://openai.vocareum.com/v1"
os.environ["WOKSPACE_JSON"] = '../static/workspace_directories.json'

In [17]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
st_embeddings = embedding_model.encode("Testing long text")

print(max(st_embeddings), min(st_embeddings), st_embeddings.shape)

0.15626092 -0.15891534 (384,)


In [39]:
# Initialize the Embeddings model and Chroma database
# embedding_model = OpenAIEmbeddings() # Openai
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Load SentenceTransformers model

chroma_db_path = "./chroma_db"  # Path to store Chroma database
vectorstore = Chroma(persist_directory=chroma_db_path, embedding_function=embedding_model)

# Text splitter configuration
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Number of characters per chunk
    chunk_overlap=100,  # Overlap between chunks to maintain context
)

In [7]:
with open(os.environ["WOKSPACE_JSON"]) as f:
    data = json.load(f)

pdf_files_dirs = [os.path.join(data[0]['path'], f) for f in os.listdir(data[0]['path'])]
pdf_files_dirs

['C:\\Users\\abdal\\Downloads\\Testing Pdf preview\\Automated_Mammography_Reporting_through_Image_to_Text_Translation - Copy (2).pdf',
 'C:\\Users\\abdal\\Downloads\\Testing Pdf preview\\BERT Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf',
 'C:\\Users\\abdal\\Downloads\\Testing Pdf preview\\Towards Large-Scale Training of Pathology Foundation Models.pdf']

In [8]:
pdf_files_dirs[0].split('\\')[-1]

'Automated_Mammography_Reporting_through_Image_to_Text_Translation - Copy (2).pdf'

In [40]:
for file in pdf_files_dirs:
    
    with open(file, 'rb') as f:
        # read the pdf
        reader = PyPDF2.PdfReader(f)

        # concat all text
        pdf_full_text = ''.join(page.extract_text() for page in reader.pages)

        # split into chuncks
        pdf_chunks = text_splitter.split_text(pdf_full_text)

        # Create Document objects for each chunk
        pdf_documents = [
            Document(page_content=chunk, metadata={"file_name": file.split('\\')[-1], "chunk_index": idx})
            for idx, chunk in enumerate(pdf_chunks)
        ]
        
        # Add the documents to Chroma
        vectorstore.add_documents(pdf_documents)

In [9]:
# document = Document(pdf_full_text, metadata={"file_name": pdf_doc_path.split('\\')[-1]})
# document

In [14]:
# # Split the text into chunks
# chunks = text_splitter.split_text(pdf_full_text)

# print(len(chunks))
# print(chunks)

In [19]:
# # Create Document objects for each chunk
# documents = [
#     Document(page_content=chunk, metadata={"file_name": pdf_doc_path.split('\\')[-1], "chunk_index": idx})
#     for idx, chunk in enumerate(chunks)
# ]
# documents

In [22]:
# # Add the documents to Chroma
# vectorstore.add_documents(documents)

In [23]:
# vectorstore

In [26]:
# # Perform similarity search
# query = "mammography report generation is performed using a series of zero-shot classification tasks"
# results = vectorstore.similarity_search(query, k=5)

In [42]:
# Get query embedding
query = "mammography report generation approach using mmg-clip"
query_embedding = embedding_model.embed_query(query)
query_embedding

[0.034560106694698334,
 0.052149612456560135,
 -0.007049025967717171,
 0.03573371097445488,
 -0.0349850095808506,
 0.01494691800326109,
 -0.04320145025849342,
 0.03119904361665249,
 -0.0539119653403759,
 -0.010592584498226643,
 0.03282345458865166,
 -0.0709524005651474,
 0.017137382179498672,
 -0.01121582742780447,
 -0.06640471518039703,
 0.018585706129670143,
 -0.07146716862916946,
 0.03463846817612648,
 0.05622696503996849,
 -0.0010497522307559848,
 0.0738152265548706,
 0.032592665404081345,
 -0.016740525141358376,
 -0.04814876243472099,
 0.08016165345907211,
 -0.026738863438367844,
 -0.03598518297076225,
 0.000247649266384542,
 0.03442370891571045,
 0.011950265616178513,
 0.023524945601820946,
 0.03089885227382183,
 0.11152336001396179,
 0.04445668309926987,
 0.004069797694683075,
 -0.02472517639398575,
 0.012798914685845375,
 0.08923646062612534,
 -0.04597149416804314,
 -0.018967395648360252,
 0.006144956685602665,
 -0.020748233422636986,
 0.06660695374011993,
 -0.03569227829575538

In [44]:
# Perform similarity search and retrieve scores
query = "mammography report generation approach using mmg-clip"
results_score = vectorstore.similarity_search_with_score(query, k=5)
results_score

[(Document(metadata={'chunk_index': 28, 'file_name': 'Automated_Mammography_Reporting_through_Image_to_Text_Translation - Copy (2).pdf'}, page_content='not collected or available), and a long Dutch report. It\nconsists of 10,801 exam-report samples. Among all of\nthose samples, only 1832 were applicable to be used,\nexcluding several pathology, biopsy, or duplicates and\nonly selecting mammogram reports. We also extracted\nlabels from the sentences and manually translated them\nto their English labels found in BI-RADS guidelines to\nminimize the translation error.\nMulti-label Prompts are sentences generated ran-\ndomly that contain one or more labels information.\nThese sentences are formed by randomly selecting a\ntemplate sentence describing each label, and concate-\nnating them to form one or more sentences describingMMG-CLIP: Automated Mammography Reporting through Image-to-Text Translation 7\n(a) Model output on two malignancy evaluation prompts\n (b) Model output on three mass s

In [46]:
results_score[0][1]#.metadata#.page_content

0.661310613155365

In [36]:
# Format the response
response = [
    {
        "text": result[0].page_content,
        "file_name": result[0].metadata.get("file_name"),
        "chunk_index": result[0].metadata.get("chunk_index"),
        "similarity": result[1]
    } for result in results_score
]
response

[{'text': 'sian error linear units. CoRR , abs/1606.08415.\nFelix Hill, Kyunghyun Cho, and Anna Korhonen. 2016.\nLearning distributed representations of sentences\nfrom unlabelled data. In Proceedings of the 2016\nConference of the North American Chapter of the\nAssociation for Computational Linguistics: Human\nLanguage Technologies . Association for Computa-\ntional Linguistics.\nJeremy Howard and Sebastian Ruder. 2018. Universal\nlanguage model ﬁne-tuning for text classiﬁcation. In\nACL. Association for Computational Linguistics.\nMinghao Hu, Yuxing Peng, Zhen Huang, Xipeng Qiu,\nFuru Wei, and Ming Zhou. 2018. Reinforced\nmnemonic reader for machine reading comprehen-\nsion. In IJCAI .\nYacine Jernite, Samuel R. Bowman, and David Son-\ntag. 2017. Discourse-based objectives for fast un-\nsupervised sentence representation learning. CoRR ,\nabs/1705.00557.Mandar Joshi, Eunsol Choi, Daniel S Weld, and Luke\nZettlemoyer. 2017. Triviaqa: A large scale distantly\nsupervised challenge datas

In [None]:
# view the db using DB Browser for SQLite