# Optional: Run evaluation on your fine-tuned model

In [None]:
%pip install -r ./requirements.txt --quiet

In [None]:
from IPython import get_ipython
# Restart the kernel so that the requirements installed above are importable
# in the following cells.
get_ipython().kernel.do_shutdown(restart=True)

<div style="background-color: #F29D9F; border-left: 5px solid #FC0307; padding: 10px; color: black;">
    Please wait 3-5 seconds for the kernel to restart before running the next cell.
</div>

In [None]:
import os
from pathlib import Path
import pandas as pd
import sagemaker
from IPython.display import JSON, Video
from huggingface_hub import snapshot_download

In [None]:
# Create one SageMaker session and derive the region and default bucket from
# it, instead of constructing a second throwaway Session just for the region.
session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = session.boto_region_name
default_bucket_name = session.default_bucket()
# Local directory the evaluation dataset is downloaded into.
dataset_dir = "./local_data"

In [None]:
# Fetch a snapshot of the evaluation dataset repository into `dataset_dir`.
file_path = snapshot_download(
    "malterei/LLaVA-Video-small-swift",
    repo_type="dataset",
    local_dir=dataset_dir,
)
print(f"Downloaded dataset to local filepath: {file_path}")

## Get the fine-tuned model for evaluation

In [None]:
# Key prefix in the default bucket under which model archives are searched.
base_prefix = 'multi-modal-finetune'
# List the objects below the prefix, keep only keys containing
# 'output/model.tar.gz', sort them by LastModified and take the key of the
# most recent one. The shell output is captured as a list of lines.
models_list_s3 = !aws s3api list-objects-v2 --bucket {default_bucket_name} --prefix {base_prefix} --query "Contents[?contains(Key, 'output/model.tar.gz')]|sort_by(@, &LastModified)[-1].Key" --output text
print(f"found {models_list_s3[0]}")

In [None]:
# First line of the listing output: the S3 key of the latest model archive.
model_suffix_s3 = models_list_s3[0]
# S3 URIs always use forward slashes, so build the URI explicitly rather than
# with os.path.join, whose separator depends on the operating system.
model_s3_path = f"s3://{default_bucket_name}/{model_suffix_s3}"
print(f"Fine-tuned Model Adapter: {model_s3_path}")

# Stop here with a real exception (not an assert) when the listing did not
# return a model archive, so the following cells do not run on a bogus path.
if not model_s3_path.endswith("model.tar.gz"):
    raise RuntimeError("No latest fine-tuning found. Did your fine-tuning finish?")

In [None]:
# Local root directory below which the model archive is stored.
model_weights_dir = "./model"
# Mirror the S3 key below the local root so the archive keeps its key layout.
model_destination = f"{model_weights_dir}/{model_suffix_s3}"
# Directory containing the archive; the archive is extracted into it later.
# Note that pathlib drops the leading "./" when the path is converted back to str.
model_dest_dir = str(Path(model_destination).parent)

In [None]:
# Copy the model archive from S3 to the local destination path.
!aws s3 cp {model_s3_path} {model_destination}
# Extract the archive into its own directory; the verbose file listing is discarded.
!tar -xzvf {model_destination} --directory {model_dest_dir} > /dev/null

In [None]:
# Directory name of this model inside the extracted archive (joined onto
# model_dest_dir to form model_dir).
model_identifier = "qwen2-vl-2b-instruct"

In [None]:
# Directory of the extracted training output for the selected model.
model_dir = os.path.join(model_dest_dir, model_identifier)

In [None]:
from utils.helpers import find_best_model_checkpoint, find_latest_version_directory

# Pick the latest version directory below model_dir and read the best
# checkpoint path out of its logging.jsonl file.
latest_version = find_latest_version_directory(model_dir)
logging_file = os.path.join(os.getcwd(), model_dir, latest_version, "logging.jsonl")
best_model_checkpoint = find_best_model_checkpoint(logging_file)

if not best_model_checkpoint:
    print("Best model checkpoint not found. Please search the logs manually to find the path that stores the best model checkpoint.")
else:
    # Drop the "/opt/ml/model/" part so the path is relative to the extracted archive.
    best_model_checkpoint = best_model_checkpoint.replace("/opt/ml/model/","")
    print(f"best model checkpoint: {best_model_checkpoint}")
          

## Run Batch Inference for Evaluation

In [None]:
# Dataset file evaluated during batch inference (used as val_dataset).
test_data_file = "test.jsonl"
# Directory, relative to the notebook directory, that receives the inference results.
eval_results_path = "outputs"

# Fail with an actionable message instead of an opaque TypeError from
# os.path.join when no best checkpoint was found in the previous step.
if not best_model_checkpoint:
    raise ValueError(
        "best_model_checkpoint is not set. Find the best checkpoint path in the "
        "training logs and assign it to `best_model_checkpoint` before continuing."
    )

# Resolve to an absolute path now, while the working directory is still the
# notebook directory, so the path stays valid after the later
# os.chdir(dataset_dir) no matter how deeply dataset_dir is nested.
model_ckpt_path = os.path.abspath(os.path.join(model_dest_dir, best_model_checkpoint))

In [None]:
# Run batch inference with swift on the test dataset using the selected checkpoint.
from swift.llm import (
    InferArguments, ModelType, infer_main, merge_lora
)


import torch
import json

model_type = ModelType.qwen2_vl_2b_instruct


# Release cached GPU memory held by PyTorch before inference starts.
torch.cuda.empty_cache()

# Inference settings are passed through environment variables, so every value
# is stored as a string.
os.environ["NFRAMES"]=json.dumps(24) # can be increased, but will require more memory
os.environ["MAX_PIXELS"]=json.dumps(100352) # 128*28*28; can be increased, but will require more memory
os.environ["CUDA_VISIBLE_DEVICES"]="0" # devices to be used
# NOTE(review): only GPU 0 is made visible above while NPROC_PER_NODE is set to 4 -
# confirm which of the two settings is intended.
os.environ["NPROC_PER_NODE"]="4" # we have 4 GPUs on this instance
os.environ["USE_HF"]="1" # use huggingface

# Inference runs from inside dataset_dir: val_dataset is given relative to it and
# result_dir steps back out with "..". The previous working directory is
# remembered so it can be restored afterwards.
old_work_dir = os.getcwd()
os.chdir(dataset_dir)

try:
    infer_args = InferArguments(
        model_type=model_type,
        ckpt_dir=model_ckpt_path,
        result_dir=os.path.join("..",eval_results_path),
        val_dataset=test_data_file,
        max_length=2048
    )
    
    # Optional step, currently disabled:
    # merge_lora(infer_args, device_map='cuda:0')
    
    infer_main(infer_args)
finally:
    # Always return to the original working directory, even if inference fails.
    os.chdir(old_work_dir)



# Release cached GPU memory again once inference has finished.
torch.cuda.empty_cache()

## Metrics Calculation

In [None]:
%pip install evaluate --quiet

In [None]:
import evaluate

In [None]:
# Load the "exact_match" metric; calculate_accuracy calls its compute() method.
exact_match = evaluate.load("exact_match") 

In [None]:
def find_latest_jsonl(directory):
    """Return the path of the `.jsonl` entry in `directory` whose name sorts last.

    Names are compared as plain strings, so this picks the newest file only
    when the file names sort chronologically (e.g. timestamp-based names).

    Returns:
        The joined path `directory/<name>`, or None when `directory` contains
        no entry ending in `.jsonl`.
    """
    candidates = [name for name in os.listdir(directory) if name.endswith('.jsonl')]
    if candidates:
        # The greatest name is the same entry a descending sort would put first.
        return os.path.join(directory, max(candidates))
    return None

In [None]:
import re
def extract_choice(text):
    """Extract the leading answer letter (A, B, C, or D) from text.

    The letter is recognised only when it stands alone at the start of the
    stripped text, i.e. it is followed by a non-word character or the end of
    the string ("A", "A.", "B) ...", "C something"). Text that merely begins
    with one of these letters as part of a word ("Answer: B", "Both ...") is
    not treated as a choice.

    Args:
        text: The label or model response. Non-string values (e.g. None or
            NaN from a DataFrame) are converted with str() first.

    Returns:
        The single letter when a leading choice is found, otherwise the
        stripped string form of `text`.
    """
    cleaned = str(text).strip()
    # \b requires the letter to end a word, so "Apple" does not yield "A".
    match = re.match(r'([ABCD])\b', cleaned)
    return match.group(1) if match else cleaned

In [None]:
def calculate_accuracy(df):
    """Score the responses in `df` against its labels with the exact-match metric.

    Adds the columns `label_choice` and `response_choice` to `df` in place by
    applying `extract_choice` to the `label` and `response` columns, then
    compares the two new columns while ignoring case and punctuation.

    Returns:
        The result of `exact_match.compute(...)`.
    """
    column_pairs = (('label', 'label_choice'), ('response', 'response_choice'))
    for source_column, choice_column in column_pairs:
        df[choice_column] = df[source_column].apply(extract_choice)

    expected = df['label_choice'].tolist()
    predicted = df['response_choice'].tolist()
    return exact_match.compute(
        references=expected,
        predictions=predicted,
        ignore_case=True,
        ignore_punctuation=True,
    )

In [None]:
# Locate the inference result file whose name sorts last and load it
# (one JSON record per line) into a DataFrame.
qwen2_2b_fine_tuned_responses_file = find_latest_jsonl(eval_results_path)
if qwen2_2b_fine_tuned_responses_file is None:
    # find_latest_jsonl returns None when there is no .jsonl file; report that
    # directly instead of letting pd.read_json fail on a None path.
    raise FileNotFoundError(
        f"No .jsonl result file found in '{eval_results_path}'. Did the batch inference finish?"
    )
qwen2_2b_fine_tuned_responses = pd.read_json(qwen2_2b_fine_tuned_responses_file, lines=True)

In [None]:
# Compute and display the exact-match result. As a side effect,
# calculate_accuracy adds the `label_choice` and `response_choice` columns
# to the DataFrame.
print("Accuracy of fine-tuned model:")
calculate_accuracy(qwen2_2b_fine_tuned_responses)

In [None]:
# Persist the responses as JSON Lines. The target directory is created first
# because DataFrame.to_json does not create missing parent directories.
fine_tuned_outputs_file = "./evaluation/qwen2-vl-2b-instruct/small/outputs.jsonl"
os.makedirs(os.path.dirname(fine_tuned_outputs_file), exist_ok=True)
qwen2_2b_fine_tuned_responses.to_json(fine_tuned_outputs_file, lines=True, orient="records")

In [None]:
# Display the responses DataFrame.
qwen2_2b_fine_tuned_responses

In [None]:
# Select the rows whose extracted label choice and extracted response choice
# disagree, then display the raw and extracted columns for them.
choices_differ = (
    qwen2_2b_fine_tuned_responses['label_choice']
    != qwen2_2b_fine_tuned_responses['response_choice']
)
incorrect = qwen2_2b_fine_tuned_responses[choices_differ]
print("\nIncorrect predictions:")
incorrect[['label', 'response', 'label_choice', 'response_choice']]