# Import Packages

In [1]:
import openai
from openai import OpenAI
from openai.types.chat.chat_completion import ChatCompletion
from openai.types.chat.completion_create_params import ResponseFormat
import base64
import pandas as pd
import os
import json
import math
from typing import Optional, Literal, List, Tuple, Union, Literal
from tqdm import tqdm

import base64
from PIL import Image
import matplotlib.pyplot as plt
import io

In [34]:
MAIN_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(MAIN_DIR, "data")
ARTIFACT_DIR = os.path.join(MAIN_DIR, "artifacts")

with open(os.path.join(MAIN_DIR, "auth", "api_keys.json"), "r") as f:
    api_keys = json.load(f)
    
openai.api_key = api_keys["OPENAI_API_KEY"]
os.environ["OPENAI_API_KEY"] = api_keys["OPENAI_API_KEY"]

In [3]:
# Function to encode the image
def encode_image(image_path: str, resize: Optional[Union[int, Tuple[int, int], Literal["auto"]]] = None):
    with open(image_path, "rb") as image_file:
        img_64_str = base64.b64encode(image_file.read()).decode('utf-8')
    
    if resize:
        img_data = base64.b64decode(img_64_str)
        img = Image.open(io.BytesIO(img_data))
        h, w = img.size
        if isinstance(resize, int):
            resize = (resize, resize)
        elif resize == "auto":
            if h < 512 and w < 512:
                resize = (h, w)
            elif h > w:
                resize = (512, int(w / (h/512)))
            else:
                resize = (int(h / (w/512)), 512)
                
        resized_img = img.resize(resize)
        
        # Save the resized image to a buffer
        buffer = io.BytesIO()
        resized_img.save(buffer, format="PNG")
        buffer.seek(0)
        
        # Encode the resized image to base64
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    else:
        return img_64_str
    
def generate_img_url(image_path: str, resize: Optional[Union[int, Tuple[int, int], Literal["auto"]]] = None):
    encoded_image = encode_image(image_path, resize=resize)
    image_url = f"data:image/jpeg;base64,{encoded_image}"
    return image_url

class TokenCounter:
    def __init__(self):
        self.token_counter = {}
        self.cost_counter = {"prompt": 0, "completion": 0, "total": 0}
        self.token_cost = {
            "gpt-4-1106-preview": {"prompt_tokens": 0.01, "completion_tokens": 0.03},
            "gpt-4-vision-preview": {"prompt_tokens": 0.01, "completion_tokens": 0.03},
            "gpt-4-1106-vision-preview": {"prompt_tokens": 0.01, "completion_tokens": 0.03},
            "gpt-3.5-turbo-1106": {"prompt_tokens": 0.001, "completion_tokens": 0.002},
        }
    
    def update(self, response: ChatCompletion, verbose: bool = False):
        model_name = response.model
        prompt_tokens = response.usage.prompt_tokens
        completion_tokens = response.usage.completion_tokens
        if verbose:
            print("Latest API Call on {} model usage: Prompt Tokens - {}, Completion Tokens - {}"
                .format(model_name, prompt_tokens, completion_tokens))
        if model_name in self.token_counter:
            self.token_counter[model_name]["prompt_tokens"] += prompt_tokens
            self.token_counter[model_name]["completion_tokens"] += completion_tokens
        else:
            self.token_counter[model_name] = {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens}
        self.update_cost(prompt_tokens, completion_tokens, model_name, verbose=verbose)
            
    def update_cost(self, prompt_tokens: int, completion_tokens: int, model_name: str = "gpt-4-vision-preview",
                    verbose: bool = False):
        prompt_unit_cost = self.token_cost[model_name]["prompt_tokens"]
        completion_unit_cost = self.token_cost[model_name]["completion_tokens"]
        prompt_cost = prompt_tokens / 1000 * prompt_unit_cost
        completion_cost = completion_tokens / 1000 * completion_unit_cost
        if verbose:
            print("Latest API Call on {} model. Cost: Prompt Cost - {}, Completion Cost - {}"
                .format(model_name, prompt_cost, completion_cost))
        self.cost_counter["prompt"] += prompt_cost
        self.cost_counter["completion"] += completion_cost
        self.cost_counter["total"] += (prompt_cost + completion_cost)
            
    def reset(self):
        self.token_counter = {}
        self.cost_counter = {"prompt": 0, "completion": 0, "total": 0}
        
def extract_score(response_str: str, client: OpenAI, token_counter: Optional[TokenCounter] = None):
    system_prompt = """You are given a response containing the BBPS grading.
    Extract the BBPS score given in the response. If the score is not available, return empty string.
    Your output should be a JSON dictionary with key "Score" and value containing integer score. 
    Example Output: {"Score": 0}, {"Score": ""}
    """
    user_prompt = f"Response: {response_str}"
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {"role": "user", "content": [{"type": "text", "text": user_prompt}]}
        ],
        max_tokens=128, temperature=0,
        response_format=ResponseFormat(type="json_object")
    )
    if token_counter:
        token_counter.update(response)
    
    return json.loads(response.choices[0].message.content, strict=False)

def calculate_token_img(w: int, h: int, quality: Literal["low", "high"] = "high"):
    if quality == "low":
        return 85
    
    if w > 2048 and h > 2048:
        if w <= h:
            w = int(w / (h / 2048))
            h = 2048
        else:
            h = int(h / (w/2048))
            w = 2048
    elif w > 2048:
        h = int(h / (w/2048))
        w = 2048
    elif h > 2048:
        w = int(w / (h / 2048))
        h = 2048    
    if w > 512 and h > 512:
        if w >= h:
            w = int(w / (h/768))
            h = 768
        else:
            h = int(h / (w/768))
            w = 768
        
    no_of_tiles = math.ceil(w/512) * math.ceil(h/512)
    return 170 * no_of_tiles + 85

def save_checkpoint(
    ckpt_folder: str,
    img_paths: List[str],
    gpt_raw_answers: List[str],
    gpt_scores: List[str]
):
    content = dict(
        img_paths=img_paths,
        gpt_raw_answers=gpt_raw_answers,
        gpt_scores=gpt_scores
    )
    
    if not os.path.exists(ckpt_folder):
        os.makedirs(ckpt_folder, exist_ok=True)
    with open(os.path.join(ckpt_folder, "ckpt.json"), "w") as f:
        json.dump(content, f)

In [35]:
token_counter = TokenCounter()
client = OpenAI()

# Nerthus Dataset

In [5]:
filenames = os.listdir(os.path.join(DATA_DIR, "nerthus", "eval_set"))
ids = [int(filename.split("_")[1]) for filename in filenames]
gt_scores = [int(filename.split("_")[3].split("-")[0]) for filename in filenames]
save_folder = os.path.join(ARTIFACT_DIR, "nerthus")

if not os.path.exists(save_folder):
    os.makedirs(save_folder, exist_ok=True)

## Only Text Context

In [None]:
task_system_prompt = """You are an expert endoscopist in charge of bowel preparation for colonoscopy.
If you don't know the answer, say 'I don't know' do not try to make up an answer.
=====
TASK:
You are given an image of a bowel after cleansing, your task is to assess the quality of the bowel preparation.
The grading should be performed using the standardized Boston-Bowel-Preparation-Scale (BBPS). Perform the following step:
1. Analyse the given image and identify the degree of stool and residual staining and whether mucosa of colon can be seen well.
2. Based on the GRADING CRITERIA and EXAMPLES given, return the BBPS grade for the given image. Can be one of [0, 1, 2, 3]
=====
GRADING CRITERIA: Use the following BBPS Grading Criteria to determine the Grade of the given bowel image.
Grade 0: Unprepared colon segment with mucosa not seen due to solid stool that cannot be cleared
Grade 1: Portion of mucosa of the colon segment seen, but other areas of the colon segment not well seen due to staining, residual stool and/or opaque liquid
Grade 2: Minor amount of residual staining, small fragments of stool and/or opaque liquid, but mucosa of colon segment seen well
Grade 3: Entire mucosa of colon segment seen well with no residual staining, small fragments of stool or opaque liquid. The wording of the scale was finalized after incorporating feedback from three colleagues experienced in colonoscopy.
=====
"""

query_prompt="Analyse this bowel image and return the BBPS score"

In [None]:
SEED = 2023
model_name = "gpt-4-vision-preview"
text_responses = []
text_gpt_scores = []

for filename in tqdm(filenames, total=len(filenames)):
    query_img_path = os.path.join(MAIN_DIR, "data", "nerthus", "eval_set", filename)
    query_img_url = generate_img_url(query_img_path)
    gptv_response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": task_system_prompt}
                    ],
            },
            
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": query_prompt},
                    {"type": "image_url", "image_url": query_img_url, "detail": "low"},
                    ],
            }
        ],
        max_tokens=512,
        temperature=0,
        seed=SEED
    )
    token_counter.update(gptv_response)
    response_str = gptv_response.choices[0].message.content
    text_responses.append(response_str)
    score_dict = extract_score(response_str, client, token_counter)
    text_gpt_scores.append(score_dict["Score"])

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:01<?, ?it/s]

KeyboardInterrupt



## Few Shot Image

In [195]:
task_system_prompt = """You are an expert endoscopist in charge of bowel preparation for colonoscopy.
If you don't know the answer, say 'I don't know' do not try to make up an answer.
=====
TASK:
You are given an image of a bowel after cleansing, your task is to assess the quality of the bowel preparation.
The grading should be performed using the standardized Boston-Bowel-Preparation-Scale (BBPS). Perform the following step:
1. Analyse the given image and identify the degree of stool and residual staining and whether mucosa of colon can be seen well.
2. Based on the EXAMPLES given, return the BBPS grade for the given image. Can be one of [0, 1, 2, 3]
=====
"""

example_system_prompt="""
=====
EXAMPLE:
=====
"""

query_prompt="Analyse this bowel image and return the BBPS score"

In [196]:
sample_img_path_1 = os.path.join(MAIN_DIR, "data", "samples", "example_1.JPG")
sample_img_path_2 = os.path.join(MAIN_DIR, "data", "samples", "example_2.JPG")

sample_img_url_1 = generate_img_url(sample_img_path_1)
sample_img_url_2 = generate_img_url(sample_img_path_2)

In [210]:
SEED = 2023
model_name = "gpt-4-vision-preview"
fs_responses = []
fs_gpt_scores = []

for filename in tqdm(filenames, total=len(filenames)):
    query_img_path = os.path.join(MAIN_DIR, "data", "nerthus", "eval_set", filename)
    query_img_url = generate_img_url(query_img_path)
    gptv_response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": task_system_prompt}
                    ],
            },
            {
            "role": "system",
            "content": [
                {"type": "text", "text": example_system_prompt},
                {"type": "image_url", "image_url": sample_img_url_1, "detail": "low"}
            ]
            },
            {
            "role": "system",
            "content": [
                {"type": "text", "text": example_system_prompt},
                {"type": "image_url", "image_url": sample_img_url_2, "detail": "low"}
            ]
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": query_prompt},
                    {"type": "image_url", "image_url": query_img_url, "detail": "low"},
                    ],
            }
        ],
        max_tokens=512,
        temperature=0,
        seed=SEED
    )
    token_counter.update(gptv_response)
    response_str = gptv_response.choices[0].message.content
    fs_responses.append(response_str)
    score_dict = extract_score(response_str, client, token_counter)
    fs_gpt_scores.append(score_dict["Score"])

100%|██████████| 6/6 [00:36<00:00,  6.08s/it]


## Few Shot With Text Context

In [213]:
task_system_prompt = """You are an expert endoscopist in charge of bowel preparation for colonoscopy.
If you don't know the answer, say 'I don't know' do not try to make up an answer.
=====
TASK:
You are given an image of a bowel after cleansing, your task is to assess the quality of the bowel preparation.
The grading should be performed using the standardized Boston-Bowel-Preparation-Scale (BBPS). Perform the following step:
1. Analyse the given image and identify the degree of stool and residual staining and whether mucosa of colon can be seen well.
2. Based on the GRADING CRITERIA and EXAMPLES given, return the BBPS grade for the given image. Can be one of [0, 1, 2, 3]
=====
GRADING CRITERIA: Use the following BBPS Grading Criteria to determine the Grade of the given bowel image.
Grade 0: Unprepared colon segment with mucosa not seen due to solid stool that cannot be cleared
Grade 1: Portion of mucosa of the colon segment seen, but other areas of the colon segment not well seen due to staining, residual stool and/or opaque liquid
Grade 2: Minor amount of residual staining, small fragments of stool and/or opaque liquid, but mucosa of colon segment seen well
Grade 3: Entire mucosa of colon segment seen well with no residual staining, small fragments of stool or opaque liquid. The wording of the scale was finalized after incorporating feedback from three colleagues experienced in colonoscopy.
=====
"""

example_system_prompt="""
=====
EXAMPLE:
=====
"""

query_prompt="Analyse this bowel image and return the BBPS score"

In [214]:
sample_img_path_1 = os.path.join(MAIN_DIR, "data", "samples", "example_1.JPG")
sample_img_path_2 = os.path.join(MAIN_DIR, "data", "samples", "example_2.JPG")

sample_img_url_1 = generate_img_url(sample_img_path_1)
sample_img_url_2 = generate_img_url(sample_img_path_2)

In [215]:
SEED = 2023
model_name = "gpt-4-vision-preview"
fs_text_responses = []
fs_text_gpt_scores = []

# for filename in tqdm(filenames, total=len(filenames)):
for filename in filenames:
    query_img_path = os.path.join(MAIN_DIR, "data", "nerthus", "eval_set", filename)
    query_img_url = generate_img_url(query_img_path)
    gptv_response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": task_system_prompt}
                    ],
            },
            {
            "role": "system",
            "content": [
                {"type": "text", "text": example_system_prompt},
                {"type": "image_url", "image_url": sample_img_url_1, "detail": "low"}
            ]
            },
            {
            "role": "system",
            "content": [
                {"type": "text", "text": example_system_prompt},
                {"type": "image_url", "image_url": sample_img_url_2, "detail": "low"}
            ]
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": query_prompt},
                    {"type": "image_url", "image_url": query_img_url, "detail": "low"},
                    ],
            }
        ],
        max_tokens=512,
        temperature=0,
        seed=SEED
    )
    token_counter.update(gptv_response)
    response_str = gptv_response.choices[0].message.content
    fs_text_responses.append(response_str)
    score_dict = extract_score(response_str, client, token_counter)
    fs_text_gpt_scores.append(score_dict["Score"])

In [216]:
exp_json = []
for filename, id, gt_score, text_raw_answer, text_score, fs_raw_answer, fs_score, fs_text_raw_answer, fs_text_score \
    in zip(filenames, ids, gt_scores, text_responses, text_gpt_scores, fs_responses, fs_gpt_scores, fs_text_responses, fs_text_gpt_scores):
        exp_json.append(
            {
                "filename": filename,
                "id": id,
                "gt_score": gt_score,
                "text_raw_answer": text_raw_answer,
                "text_score": text_score,
                "fs_raw_answer": fs_raw_answer,
                "fs_score": fs_score,
                "fs_text_raw_answer": fs_text_raw_answer,
                "fs_text_score": fs_text_score
            }
        )
        
with open(os.path.join(save_folder, "result.json"), "w") as f:
    json.dump(exp_json, f)

In [220]:
pd_dict = {
    "filename": filenames,
    "id": ids,
    "gt_score": gt_scores,
    "text_raw_answer": text_responses,
    "text_score": text_gpt_scores,
    "fs_raw_answer": fs_responses,
    "fs_score": fs_gpt_scores,
    "fs_text_raw_answer": fs_text_responses,
    "fs_text_score": fs_text_gpt_scores
}

df = pd.DataFrame(pd_dict)
df.to_csv(os.path.join(save_folder, "result.csv"))

# Hyper-Kvasir dataset

In [36]:
token_counter.reset()

In [37]:
low_score_folder = "bbps-0-1"
data_folder = os.path.join(DATA_DIR, "hyper-kvasir")

with open(os.path.join(data_folder, "testcases.txt"), 'r') as fp:
    data = fp.read()
    all_file_paths = data.split("\n") 

gt_scores = [0 if (path.split("/")[3] == low_score_folder) else 1 for path in all_file_paths]
filenames = [path.split("/")[-1] for path in all_file_paths]

assert len(gt_scores) == len(filenames)

save_folder = os.path.join(ARTIFACT_DIR, "hyper-kvasir")

if not os.path.exists(save_folder):
    os.makedirs(save_folder, exist_ok=True)

In [38]:
sample_img_path_1 = os.path.join(MAIN_DIR, "data", "samples", "example_1.JPG")
sample_img_path_2 = os.path.join(MAIN_DIR, "data", "samples", "example_2.JPG")

sample_img_url_1 = generate_img_url(sample_img_path_1)
sample_img_url_2 = generate_img_url(sample_img_path_2)

In [63]:
# # Token Usage Estimation
# tokens_used = []
# ws, hs = [], []
# sample_img_1 = Image.open(sample_img_path_1)
# sample_img_2 = Image.open(sample_img_path_2)
# w1, h1 = sample_img_1.size
# w2, h2 = sample_img_2.size
# sample_img_tokens_1 = calculate_token_img(w1, h1, "high")
# sample_img_tokens_2 = calculate_token_img(w2, h2, "high")
# print("Tokens needed for sample 1:", sample_img_tokens_1)
# print("Tokens needed for sample 2:", sample_img_tokens_2)

# for img_path in all_file_paths:
#     image = Image.open(img_path)
#     w, h = image.size
#     ws.append(w)
#     hs.append(h)
#     img_tokens = calculate_token_img(w, h, "low")
#     tokens_used.append(img_tokens)
    
# query_tokens = sum(tokens_used)
# sample_tokens = (sample_img_tokens_1 + sample_img_tokens_2) * len(all_file_paths)
# total_tokens = query_tokens + sample_tokens
    
# print("Total number of query image tokens required:", query_tokens)
# print("Total number of few-shot tokens reuqired:", sample_tokens)
# print("Total number of image tokens:", total_tokens)
# print("Total image tokens cost:", total_tokens / 1000 * 0.01)

Tokens needed for sample 1: 765
Tokens needed for sample 2: 765
Total number of query image tokens required: 152490
Total number of few-shot tokens reuqired: 2744820
Total number of image tokens: 2897310
Total image tokens cost: 28.9731


## Few Shot With Text Context

In [39]:
# task_system_prompt = """You are an expert endoscopist in charge of bowel preparation for colonoscopy.
# If you don't know the answer, say 'I don't know' do not try to make up an answer.
# =====
# TASK:
# You are given an image of a bowel after cleansing, your task is to assess the quality of the bowel preparation.
# The grading should be performed using the standardized Boston-Bowel-Preparation-Scale (BBPS). Perform the following step:
# 1. Analyse the given image and identify the degree of stool and residual staining and whether mucosa of colon can be seen well.
# 2. Based on the GRADING CRITERIA and EXAMPLES given, return the BBPS grade for the given image. Can be one of [0, 1, 2, 3]
# =====
# GRADING CRITERIA: Use the following BBPS Grading Criteria to determine the Grade of the given bowel image.
# Grade 0: Unprepared colon segment with mucosa not seen due to solid stool that cannot be cleared
# Grade 1: Portion of mucosa of the colon segment seen, but other areas of the colon segment not well seen due to staining, residual stool and/or opaque liquid
# Grade 2: Minor amount of residual staining, small fragments of stool and/or opaque liquid, but mucosa of colon segment seen well
# Grade 3: Entire mucosa of colon segment seen well with no residual staining, small fragments of stool or opaque liquid.
# =====
# """

task_system_prompt_reverse = """You are an expert endoscopist in charge of bowel preparation for colonoscopy.
If you don't know the answer, say 'I don't know' do not try to make up an answer.
=====
TASK:
You are given an image of a bowel after cleansing, your task is to assess the quality of the bowel preparation.
The grading should be performed using the standardized Boston-Bowel-Preparation-Scale (BBPS). Perform the following step:
1. Analyse the given image and identify the degree of stool and residual staining and whether mucosa of colon can be seen well.
2. Based on the GRADING CRITERIA and EXAMPLES given, return the BBPS grade for the given image. Can be one of [0, 1, 2, 3]
=====
GRADING CRITERIA: Use the following BBPS Grading Criteria to determine the Grade of the given bowel image.
Grade 3: Entire mucosa of colon segment seen well with no residual staining, small fragments of stool or opaque liquid.
Grade 2: Minor amount of residual staining, small fragments of stool and/or opaque liquid, but mucosa of colon segment seen well
Grade 1: Portion of mucosa of the colon segment seen, but other areas of the colon segment not well seen due to staining, residual stool and/or opaque liquid
Grade 0: Unprepared colon segment with mucosa not seen due to solid stool that cannot be cleared
=====
"""

# task_system_prompt = """You are an expert endoscopist in charge of bowel preparation for colonoscopy.
# If you don't know the answer, say 'I don't know' do not try to make up an answer.
# =====
# TASK:
# You are given an image of a bowel after cleansing, your task is to assess the quality of the bowel preparation.
# The grading should be performed using the standardized Boston-Bowel-Preparation-Scale (BBPS). Perform the following step:
# 1. Analyse the given image and identify the degree of stool and residual staining and whether mucosa of colon can be seen well.
# 2. Based on the EXAMPLES given, return the BBPS grade for the given image. Can be one of [0, 1, 2, 3]
# =====
# """

# task_system_prompt = """You are an expert endoscopist in charge of bowel preparation for colonoscopy.
# If you don't know the answer, say 'I don't know' do not try to make up an answer.
# =====
# TASK:
# You are given an image of a bowel after cleansing, your task is to assess the quality of the bowel preparation.
# The grading should be performed using the standardized Boston-Bowel-Preparation-Scale (BBPS). Based on the GRADING CRITERIA and EXAMPLES given, return the BBPS grade for the given image.
# =====
# OUTPUT INSTRUCTION:
# Return your output as a single BBPS score which is one of [0, 1, 2, 3].
# =====
# GRADING CRITERIA: Use the following BBPS Grading Criteria to determine the Grade of the given bowel image.
# Grade 0: Unprepared colon segment with mucosa not seen due to solid stool that cannot be cleared
# Grade 1: Portion of mucosa of the colon segment seen, but other areas of the colon segment not well seen due to staining, residual stool and/or opaque liquid
# Grade 2: Minor amount of residual staining, small fragments of stool and/or opaque liquid, but mucosa of colon segment seen well
# Grade 3: Entire mucosa of colon segment seen well with no residual staining, small fragments of stool or opaque liquid.
# =====
# """

# task_system_prompt = """You are an expert endoscopist in charge of bowel preparation for colonoscopy.
# If you don't know the answer, say 'I don't know' do not try to make up an answer.
# =====
# TASK:
# You are given an image of a bowel after cleansing, your task is to assess the quality of the bowel preparation.
# The grading should be performed using the standardized Boston-Bowel-Preparation-Scale (BBPS). Perform the following step:
# 1. Analyse the given image and identify the degree of stool and residual staining and whether mucosa of colon can be seen well.
# 2. Based on the EXAMPLES given, return the BBPS grade for the given image. Can be one of [0, 1, 2, 3]
# =====
# """

# example_system_prompt="""
# =====
# EXAMPLE:
# =====
# """

query_prompt="Analyse this bowel image and return the BBPS score"

In [43]:
SEED = 2023
model_name = "gpt-4-vision-preview"
checkpoint = os.path.join(ARTIFACT_DIR, "hyper-kvasir", "checkpoint", "60", "ckpt.json")
# checkpoint = None
resize = "auto" 
# resize = None

In [44]:
sample_no = 40
total_sample_no = len(all_file_paths)

In [45]:
if checkpoint:
    with open(checkpoint, "r") as f:
        ckpt_content = json.load(f)
    fs_text_responses = ckpt_content["gpt_raw_answers"]
    fs_text_gpt_scores = ckpt_content["gpt_scores"]
    start = int(checkpoint.split("/")[-2])
    end = start + sample_no if sample_no else total_sample_no
    if end > total_sample_no:
        end = total_sample_no

else:
    fs_text_responses = []
    fs_text_gpt_scores = []
    start = 0
    end = start + sample_no if sample_no else total_sample_no

print(f"Run testcase from sample idx {start} to sample idx {end}")
for idx, query_img_path in enumerate(tqdm(all_file_paths[start:end], total=len(all_file_paths[start:end])),
                                     start=start):

# for idx, query_img_path in enumerate(all_file_paths[start:end], start=start):
    query_img_url = generate_img_url(query_img_path, resize=resize)
    gptv_response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": task_system_prompt_reverse}
                    ],
            },
            # {
            #     "role": "system",
            #     "content": [
            #         {"type": "text", "text": example_system_prompt},
            #         {"type": "image_url",
            #          "image_url": {"url": sample_img_url_1, "detail": "high"}},
            # ]
            # },
            # {
            #     "role": "system",
            #     "content": [
            #         {"type": "text", "text": example_system_prompt},
            #         {"type": "image_url",
            #          "image_url": {"url": sample_img_url_2, "detail": "high"}},
            # ]
            # },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": query_prompt},
                    {"type": "image_url",
                     "image_url": {"url": query_img_url, "detail": "high"}},
                    ],
            }
        ],
        max_tokens=512,
        temperature=0,
        seed=SEED,
    )
    token_counter.update(gptv_response)
    response_str = gptv_response.choices[0].message.content
    fs_text_responses.append(response_str)
    score_dict = extract_score(response_str, client, token_counter)
    fs_text_gpt_scores.append(score_dict["Score"])
    
    if (idx + 1) % 20 == 0:
        ckpt_folder = os.path.join(ARTIFACT_DIR, "hyper-kvasir", "checkpoint", str(idx+1))
        save_checkpoint(ckpt_folder, all_file_paths, fs_text_responses, fs_text_gpt_scores)
        print("Successfully saved checkpoint at folder:", ckpt_folder)

Run testcase from sample idx 60 to sample idx 100


  5%|▌         | 2/40 [06:56<2:11:50, 208.17s/it]


KeyboardInterrupt: 

In [12]:
ckpt_folder = os.path.join(ARTIFACT_DIR, "hyper-kvasir", "checkpoint", str(idx+1))
save_checkpoint(ckpt_folder, all_file_paths, fs_text_responses, fs_text_gpt_scores)
print("Successfully saved checkpoint at folder:", ckpt_folder)

Successfully saved checkpoint at folder: /mnt/c/Users/QUAN/Desktop/bbps_gpt/artifacts/hyper-kvasir/checkpoint/240


In [13]:
exp_json = []
for filename, gt_score, fs_text_raw_answer, fs_text_score \
    in zip(filenames[:end], gt_scores[:end], fs_text_responses, fs_text_gpt_scores):
        exp_json.append(
            {
                "filename": filename,
                "gt_score": gt_score,
                "fs_text_raw_answer": fs_text_raw_answer,
                "fs_text_score": fs_text_score
            }
        )
        
with open(os.path.join(save_folder, f"result_{0}-{end}.json"), "w") as f:
    json.dump(exp_json, f)
    
pd_dict = {
    "filename": filenames[:end],
    "gt_score": gt_scores[:end],
    "fs_text_raw_answer": fs_text_responses,
    "fs_text_score": fs_text_gpt_scores
}

def classify_text_score(score):
    if score == 0 or score == 1:
        return 0
    elif score == 2 or score == 3:
        return 1
    else:
        return -1

df = pd.DataFrame(pd_dict)
df["gpt_classification"] = df["fs_text_score"].apply(lambda x: classify_text_score(x)).astype(int)
df["match"] = df["gt_score"] == df["gpt_classification"]
df.to_csv(os.path.join(save_folder, f"result_{0}-{end}.csv"))

accuracy = df["match"].sum() / df["match"].count()
print("GPT Accuracy:", round(accuracy * 100, 3))

GPT Accuracy: 40.0


In [20]:
df = pd.read_csv(os.path.join(ARTIFACT_DIR, "hyper-kvasir", "run_1", "result_0-1794.csv"))

In [26]:
accuracy = df["match"].sum() / df["match"].count()
print("GPT Accuracy:", round(accuracy * 100, 3))

GPT Accuracy: 83.055


In [21]:
df["gt_score"].value_counts()

gt_score
1    1148
0     646
Name: count, dtype: int64

In [22]:
df["fs_text_score"].value_counts()

fs_text_score
3.0    1058
2.0     353
1.0     260
0.0     101
Name: count, dtype: int64

In [23]:
df["gpt_classification"].value_counts()

gpt_classification
 1    1411
 0     361
-1      22
Name: count, dtype: int64

In [24]:
sum(gt_scores)/len(gt_scores)

1.0

In [25]:
df.groupby("gt_score")["match"].value_counts()

gt_score  match
0         True      357
          False     289
1         True     1133
          False      15
Name: count, dtype: int64

In [27]:
df.groupby("gpt_classification")["match"].value_counts()

gpt_classification  match
-1                  False      22
 0                  True      357
                    False       4
 1                  True     1133
                    False     278
Name: count, dtype: int64

In [31]:
df[df["gpt_classification"] == -1]["gt_score"].sum()

11