# Build a RAG-based radiology report application with Bedrock, Langchain and FAISS index

This notebook explains the steps required to build a summarization application using the Retrieval Augmented Generation (RAG) architecture.
RAG combines the power of pre-trained LLMs with information retrieval, enabling more accurate and context-aware responses.

## Overview

* Leveraged a dataset of 95,000 radiology report findings-impressions pairs as the knowledge source
* Ingested the dataset into Langchain and generated embedding vectors with Titan Text Embedding model on Amazon Bedrock
* Stored output vector representations in a FAISS vector store for efficient retrieval 
* Set up a pipeline using Langchain, FAISS, and Anthropic Claude v2 for prompt engineering, retrieval, and text generation
* Modular combination of state-of-the-art AI libraries enabled rapid implementation and experimentation with latest RAG techniques

In [None]:
# !pip install faiss-cpu
# !pip install langchain --upgrade
# !pip install pypdf
# !pip install regex

In [None]:
# !pip install sagemaker --upgrade

In [None]:
#!pip install boto3 --upgrade

## Restart Kernel

In [None]:
# Restart the kernel after the installs above (do_shutdown is called with True).
import IPython

IPython.Application.instance().kernel.do_shutdown(True)

## Set up dependencies

In [None]:
#Check Python version is greater than 3.8 which is required by Langchain if you want to use Langchain
import sys
sys.version

In [None]:
assert sys.version_info >= (3, 8)

In [None]:
import langchain

In [None]:
langchain.__version__

In [None]:
import os, json
from tqdm import tqdm
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter,NLTKTextSplitter
import pathlib 

## Perform document pre-processing
Load the documents, perform clean-up of the text before generating embeddings

In [None]:
import pandas as pd
train_df = pd.read_csv('train.csv')

In [None]:
# Combine study_id, findings and impression from the knowledge source into one
# text. This represents the "page_content" when using the langchain document loader.
# Missing findings/impression values are replaced with empty strings first:
# concatenating a string with NaN yields NaN, which would turn the whole
# 'text' value of that row into NaN instead of a string.
train_df['text'] = (
    train_df['study_id'].astype(str)
    + " Findings: " + train_df['findings'].fillna('')
    + " Impressions: " + train_df['impression'].fillna('')
)

In [None]:
#train_df = train_df[['text']].sample(5000)
train_df = train_df[['text']]

In [None]:
from langchain_community.document_loaders import DataFrameLoader

# Turn every row of train_df into a LangChain Document; the 'text' column
# (DataFrameLoader's default page_content_column) supplies page_content.
loader = DataFrameLoader(train_df, page_content_column="text")
data = loader.load()

In [None]:
data[100].page_content

In [None]:
data[100]

## Generate Embeddings
Use an embeddings model to generate embeddings of the cleaned-up doc

In [None]:
import boto3

# Client for the 'bedrock' service in us-east-1; the final expression lists
# the foundation models and is shown as the cell output.
bedrock = boto3.client(
    service_name='bedrock',
    region_name='us-east-1',
    endpoint_url='https://bedrock.us-east-1.amazonaws.com',
)
bedrock.list_foundation_models()

In [None]:
import boto3
import sagemaker
from langchain.embeddings import BedrockEmbeddings

# Region is taken from the SageMaker session; `bedrock` is (re)bound here to a
# 'bedrock-runtime' client for that region.
sagemaker_session = sagemaker.Session()
studio_region = sagemaker_session.boto_region_name
session = boto3.Session()
bedrock = session.client("bedrock-runtime", region_name=studio_region)

# Titan text-embedding model. Note that the embeddings region is fixed to
# us-east-1 here, independently of studio_region.
emb = BedrockEmbeddings(
    model_id="amazon.titan-embed-g1-text-02",  # amazon.titan-embed-text-v1
    region_name="us-east-1",
)
emb.model_kwargs = {}

In [None]:
import boto3
import requests
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest

def sign_request(req, service, region):
    """Sign a prepared HTTP request in place with AWS Signature Version 4.

    Args:
        req: A ``requests.PreparedRequest``. Its method, URL, body and headers
            are signed, and its headers are updated with the signed headers.
        service: AWS service name used for signing (e.g. ``"bedrock"``).
        region: AWS region used for signing (e.g. ``"us-east-1"``).

    Returns:
        The same ``req`` object, with the signed headers merged in.

    Raises:
        botocore.exceptions.NoCredentialsError: If boto3 cannot locate any
            AWS credentials.
    """
    from botocore.exceptions import NoCredentialsError

    # get_credentials() returns None when no credentials are configured; fail
    # with an explicit error rather than an AttributeError on None.
    credentials = boto3.Session().get_credentials()
    if credentials is None:
        raise NoCredentialsError()
    frozen_credentials = credentials.get_frozen_credentials()

    # Convert requests.PreparedRequest to AWSRequest, which SigV4Auth can sign
    aws_req = AWSRequest(
        method=req.method,
        url=req.url,
        data=req.body,
        headers=req.headers
    )
    SigV4Auth(frozen_credentials, service, region).add_auth(aws_req)

    # Update the original requests.PreparedRequest with the signed headers
    req.headers.update(aws_req.headers)
    return req

# Request entitlement for the Titan embedding model with a SigV4-signed POST.
region = 'us-east-1'
service = "bedrock"
model_id = 'amazon.titan-embed-g1-text-02'
url = f"https://{service}.{region}.amazonaws.com/foundation-model-entitlement"
req = requests.Request('POST', url, json={'modelId': model_id})
prepared_req = req.prepare()
# Sign the request
sign_request(prepared_req, service, region)
# Send the request. The context manager closes the HTTP session afterwards,
# the timeout keeps the cell from hanging on an unresponsive endpoint, and the
# distinct name avoids shadowing the boto3 `session` created earlier.
with requests.Session() as http_session:
    response = http_session.send(prepared_req, timeout=30)
# response.json() raises a ValueError subclass when the body is not valid
# JSON (e.g. an empty or HTML error body); fall back to the raw text.
try:
    response_body = response.json()
except ValueError:
    response_body = response.text
print(f"Attaining Foundation Model Entitlement Status: {response.status_code}, Response: {response_body}")

## Set up local vector store - FAISS

In [None]:
from langchain.vectorstores import FAISS
import pathlib 

In [None]:
# Embed every loaded document with `emb` and build the FAISS index from them.
print("Embed and create vector index")
db = FAISS.from_documents(documents=data, embedding=emb)

### Save the indices locally as a file

In [None]:
index_path = 'faiss_indices'

In [None]:
index_name = 'reports'

In [None]:
# Make sure the target folder exists, then write the index files into it.
print('Save the index created locally')
os.makedirs(index_path, exist_ok=True)
db.save_local(index_path, index_name)

### Load from local file cache

In [None]:
emb

In [None]:
%%time
#Check if load local works properly
# Reloads the index saved above from index_path/index_name, using the same
# embeddings object for query embedding.
# NOTE(review): recent langchain-community releases refuse to unpickle a saved
# index unless allow_dangerous_deserialization=True is passed to load_local,
# while older releases do not accept that keyword — confirm against the
# installed version before changing this call.
db_local = FAISS.load_local(folder_path=index_path, embeddings=emb, index_name=index_name)

### Perform a similarity search and get top 3 matching docs

In [None]:
query = "<PUT SAMPLE RADIOLOGY REPORT IMPRESSION HERE>"
docs = db_local.similarity_search(query, k=3)
docs

## Access LLM with the context from vector store

In [None]:
from langchain.llms.bedrock import Bedrock

# Anthropic Claude v2 through the `bedrock` client: at most 200 sampled
# tokens, temperature 0.
model_args = dict(max_tokens_to_sample=200, temperature=0)
llm = Bedrock(
    client=bedrock,
    model_id="anthropic.claude-v2",
    model_kwargs=model_args,
)

### Query 1

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [None]:
query = "<PUT SAMPLE RADIOLOGY REPORT IMPRESSION HERE>"
print(query)

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Prompt with two placeholders: {context} and {question}.
prompt_template = """Human: Generate radiology report impressions based on the following findings. Return only a single impression and do not return the findings given. Findings: {context}

Question: {question}
Assistant:"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


In [None]:
# RetrievalQA "stuff" chain: retrieves the 3 most similar documents from
# db_local and passes them to the LLM through PROMPT; the retrieved documents
# are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db_local.as_retriever(
        search_type="similarity", search_kwargs={"k": 3}
    ),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

# Calling the chain object directly (Chain.__call__) is deprecated in
# LangChain; invoke() takes the same input dict and returns the same output dict.
response = qa.invoke({'query': query})
print(response['result'])

In [None]:
response['source_documents']

## Model Evaluation

### Dev 1

In [None]:
dev1 = pd.read_csv('dev1.csv')

In [None]:
dev1.head()

In [None]:
#dev1.iloc[:10,2].to_list()

In [None]:
# Queries come from the third column (position 2) of dev1.
# NOTE(review): presumably this is the findings column — confirm with dev1.head().
query_list_dev1 = dev1.iloc[:, 2].tolist()

In [None]:
len(query_list_dev1)

In [None]:
def generate_reports(query_list, k=3):
    """Generate one impression per query with the RAG chain.

    A single RetrievalQA "stuff" chain is built from the notebook-level
    ``llm`` and ``db_local`` and then run once per query.

    Args:
        query_list: Iterable of query strings.
        k: Number of similar documents to retrieve per query (default 3,
            the value that was previously hard-coded).

    Returns:
        list: The chain's ``'result'`` value for each query, in input order.
    """
    # The template text, including its leading whitespace, is kept exactly as
    # before so that generated outputs stay comparable with earlier runs.
    prompt_template = """
        Human: Generate radiology report impressions based on the following findings. Return only a single impression and do not return the findings given. Findings: {context}
        {question}
        Assistant:"""
    prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    # Neither the prompt nor the chain depends on the individual query, so
    # both are built once instead of once per loop iteration.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db_local.as_retriever(
            search_type="similarity", search_kwargs={"k": k}
        ),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}
    )

    # invoke() replaces the deprecated direct call on the chain object; it
    # takes the same input dict and returns the same output dict.
    return [qa.invoke({'query': query})['result'] for query in query_list]

In [None]:
result_list_dev1 = generate_reports(query_list_dev1)

In [None]:
len(result_list_dev1)

In [None]:
dev1['rag_claude2_impressions'] = result_list_dev1

In [None]:
# Remove the literal 'Impressions:' label from the generated text.
dev1['rag_claude2_impressions'] = dev1['rag_claude2_impressions'].str.replace(
    'Impressions:', '', regex=False
)

In [None]:
dev1

In [None]:
# !pip install evaluate
# !pip install rouge_score

In [None]:
import evaluate
from rouge_score import rouge_scorer, scoring

# Aggregated ROUGE of the generated impressions against the reference
# impressions of the Dev1 set.
rouge_score = evaluate.load("rouge")
result_RAGClaude2_dev1 = rouge_score.compute(
    predictions=list(dev1['rag_claude2_impressions']),
    references=list(dev1["impression"]),
    use_aggregator=True,
)
print("ROUGE Score for RAG Implentation with Titan Embedding and Claudev2 Model on Dev1 Set:")
print(result_RAGClaude2_dev1)

In [None]:
# Per-example ROUGE values (no aggregation), drawn as one box per column.
results_RAGClaude2_dev1_all = rouge_score.compute(
    predictions=list(dev1['rag_claude2_impressions']),
    references=list(dev1["impression"]),
    use_aggregator=False,
)
results_RAGClaude2_dev1_all_df = pd.DataFrame(results_RAGClaude2_dev1_all)
results_RAGClaude2_dev1_all_df.plot.box(color='red')

In [None]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("RAG_results", exist_ok=True)
dev1.to_csv("RAG_results/dev1_rag.csv", index = False)

### Dev 2

In [None]:
dev2 = pd.read_csv('dev2.csv')

In [None]:
dev2.head()

In [None]:
# Queries come from the third column (position 2) of dev2.
# NOTE(review): presumably this is the findings column — confirm with dev2.head().
query_list_dev2 = dev2.iloc[:, 2].tolist()

In [None]:
result_list_dev2 = generate_reports(query_list_dev2)

In [None]:
len(result_list_dev2)
dev2['rag_claude2_impressions'] = result_list_dev2

In [None]:
# Remove the literal 'Impressions:' label from the generated text.
dev2['rag_claude2_impressions'] = dev2['rag_claude2_impressions'].str.replace(
    'Impressions:', '', regex=False
)

In [None]:
import pandas as pd

# Resume point: reload previously saved Dev 2 results only when they are not
# already in memory. Reading unconditionally fails on a fresh top-to-bottom
# run (the CSV is only written by a later cell) and would replace freshly
# generated impressions with whatever an earlier run left on disk.
if 'dev2' not in globals() or 'rag_claude2_impressions' not in dev2.columns:
    dev2 = pd.read_csv("RAG_results/dev2_rag.csv")

In [None]:
import evaluate
from rouge_score import rouge_scorer, scoring

# Aggregated ROUGE of the generated impressions against the reference
# impressions of the Dev2 set.
rouge_score = evaluate.load("rouge")
result_RAGClaude2_dev2 = rouge_score.compute(
    predictions=list(dev2['rag_claude2_impressions']),
    references=list(dev2["impression"]),
    use_aggregator=True,
)
print("ROUGE Score for RAG Implentation with Titan Embedding and Claudev2 Model on Dev2 Set:")
print(result_RAGClaude2_dev2)

In [None]:
# Per-example ROUGE values (no aggregation), drawn as one box per column.
results_RAGClaude2_dev2_all = rouge_score.compute(
    predictions=list(dev2['rag_claude2_impressions']),
    references=list(dev2["impression"]),
    use_aggregator=False,
)
results_RAGClaude2_dev2_all_df = pd.DataFrame(results_RAGClaude2_dev2_all)
results_RAGClaude2_dev2_all_df.plot.box(color='blue')

In [None]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("RAG_results", exist_ok=True)
dev2.to_csv("RAG_results/dev2_rag.csv", index = False)