## Create instruct dataset with Mixtral

In [None]:
pip install langchain

### Create a Mixtral 8x7B endpoint

In [None]:
from PIL import Image
import base64
import json
import os

from utils import *

from sagemaker.jumpstart.model import JumpStartModel

# NOTE(review): boto3 is not imported explicitly in this cell; it is presumably
# brought into scope by `from utils import *` -- confirm.
smr_client = boto3.client("sagemaker-runtime")

# Set the endpoint name if using the existing endpoint
llm_endpoint_name = None
initial_instance_count = 1 # change it to a large value when running on the whole dataset
if llm_endpoint_name is None:
    # No existing endpoint was given: deploy a new Mixtral 8x7B Instruct endpoint
    # from JumpStart and remember its name for the inference calls below.
    model_id = "huggingface-llm-mixtral-8x7b-instruct"
    model = JumpStartModel(model_id=model_id)
    predictor = model.deploy(initial_instance_count=initial_instance_count)
    # `endpoint_name` is the current Predictor attribute; `endpoint` is the
    # deprecated name from SageMaker Python SDK v1.
    llm_endpoint_name = predictor.endpoint_name

### Prepare dataset

#### Data selection
Only choose the samples used in `llava_v1_5_mix665k`. It contains the following data sources:

<table>
    <tr><th>Data</th><th>Size</th></tr>
    <tr><td>LLaVA</td><td>158K</td></tr>
    <tr><td>ShareGPT</td><td>40K</td></tr>
    <tr><td>GQA</td><td>72K</td></tr>
    <tr><td>OKVQA</td><td>9K</td></tr>
    <tr><td>OCRVQA</td><td>80K</td></tr>
    <tr><td>A-OKVQA</td><td>50K</td></tr>
    <tr><td>TextCaps</td><td>22K</td></tr>
    <tr><td>RefCOCO</td><td>30K</td></tr>
    <tr><td>VG</td><td>86K</td></tr>
</table>

In [None]:
# Run the dataset preparation script from inside the dataset/ directory.
!cd dataset && bash prepare_dataset.sh

In [None]:
# Root directory that holds the prepared dataset files.
dataset_dir = "dataset"

# Read the full llava_v1_5_mix665k instruction file into memory.
mix665k_path = os.path.join(dataset_dir, 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json')
with open(mix665k_path, 'r') as mix665k_file:
    mix665k = json.load(mix665k_file)

In [None]:
# Group the samples by their 'id': id -> list of samples sharing that id.
mix665_dict = {}

# Number of conversation turns (entries / 2) of the first sample seen for each id.
turn_num = []
for sample in mix665k:
    sample_id = sample['id']
    same_id_samples = mix665_dict.get(sample_id)
    if same_id_samples is None:
        # First time this id shows up: start its group and record its turn count.
        mix665_dict[sample_id] = [sample]
        turn_num.append(len(sample['conversations'])/2)
    else:
        same_id_samples.append(sample)

Choose the dataset

In [None]:
# Substring matched against each sample's 'image' path when filtering the mix below.
dataset_name = 'coco'

def load_coco(coco_caption_path):
    """Group the captions of a COCO caption annotation file by image id.

    Args:
        coco_caption_path: Path to a JSON file whose top-level 'annotations'
            list holds entries with 'image_id' and 'caption' keys.

    Returns:
        dict mapping each image id to the list of its captions, in the order
        the annotations appear in the file.
    """
    with open(coco_caption_path, 'r') as caption_file:
        caption_data = json.load(caption_file)

    captions_by_image = {}
    for annotation in caption_data['annotations']:
        image_id = annotation['image_id']
        caption = annotation['caption']
        # setdefault creates the per-image list on first sight of an image id.
        captions_by_image.setdefault(image_id, []).append(caption)

    return captions_by_image

# Location of the COCO train2017 caption annotations inside the dataset directory.
caption_path = os.path.join(dataset_dir, "annotations/captions_train2017.json")
# image_id -> list of captions for that image
coco_cap_new = load_coco(caption_path)

Filter the dataset, because `llava_v1_5_mix665k.json` mixes multiple datasets.

In [None]:
# Keep only the id groups whose first sample has an image path containing
# `dataset_name`; keys are the ids normalised through int() (no leading zeros).
filtered_data = {}

for sample_id, samples in mix665_dict.items():
    first_sample = samples[0]
    if 'image' not in first_sample or dataset_name not in first_sample['image']:
        continue
    filtered_data[str(int(sample_id))] = samples

coco_ids = list(filtered_data)

Only keep the `image_id`s that exist in `llava_v1_5_mix665k.json`, for performance comparison.

In [None]:
# Captions for every selected id that is also present in the COCO annotations.
# Keys stay the string ids; the lookup in coco_cap_new uses the integer form.
filtered_coco_caps = {}
for coco_id in coco_ids:
    numeric_id = int(coco_id)
    if numeric_id in coco_cap_new:
        filtered_coco_caps[coco_id] = coco_cap_new[numeric_id]

In [None]:
# Number of selected image ids that also have captions in the COCO annotation file.
len(filtered_coco_caps)

In [None]:
# Preview a handful of entries instead of rendering the whole mapping as cell
# output; the full dictionary holds one caption list per selected image.
dict(list(filtered_coco_caps.items())[:5])

### Run inference

In [None]:
import time

digit_num = 12  # width the image ids are zero-padded to
cnt = 0         # loop counter used to stop after a few samples
cap_list = {}   # zero-padded image id -> captions of that image
conv_list = []  # one record (id, image path, conversations) per processed image

instructions_generator, _ = build_conv_instruction_prompt()

for img_id, cap in filtered_coco_caps.items():
    print(img_id)
    start_time = time.time()

    # Build the prompt: the shared instruction messages plus this image's
    # captions (one per line) as the final user message.
    image_caption = '\n'.join(cap)
    instruct_data = instructions_generator.copy()
    instruct_data.append({"role": "user", "content": image_caption})
    generation_parameters = {"max_new_tokens": 5000, "do_sample": True}
    payload = {"inputs": format_instructions(instruct_data), "parameters": generation_parameters}

    # Generate the question/answer text with the endpoint.
    qa_pair = query_endpoint(payload, llm_endpoint_name)[0]['generated_text']

    # Pass the generated text through check_qa_pair together with the captions.
    filtered_qa = check_qa_pair(qa_pair, image_caption, llm_endpoint_name)
    print(filtered_qa)

    # From here on the id is the zero-padded form used in the image file name.
    img_id = img_id.zfill(digit_num)
    record = {
        "id": img_id,
        "image": f"coco/train2017/{img_id}.jpg",
        "conversations": parse_qa_response(filtered_qa),
    }
    conv_list.append(record)
    print(f'{img_id} inference duration is {time.time() - start_time}')

    cap_list[img_id] = cap

    # The check runs before the increment, so the loop stops after the
    # sample processed while cnt == 11.
    if cnt > 10:
        break
    cnt += 1

In [None]:
# Captions of the images processed above, keyed by zero-padded image id.
cap_list

### Save result for comparison with instruct dataset generated with GPT4

In [None]:
# Write the captions of the processed images to disk as indented JSON.
caps_output_path = 'caps_coco.json'
with open(caps_output_path, mode='w') as caps_file:
    json.dump(cap_list, caps_file, indent=4)

In [None]:
import json

# Write the generated conversation records to disk as indented JSON.
conv_output_path = 'conv_mixtral.json'
with open(conv_output_path, mode='w') as conv_file:
    json.dump(conv_list, conv_file, indent=4)

Go to [dataset_analyze.ipynb](dataset_analyze.ipynb) for further analysis.

## Delete endpoint

In [None]:
# `predictor` is only assigned when this notebook deployed the endpoint itself.
# When an existing endpoint name was supplied instead, there is no predictor
# object, so look it up defensively rather than failing with a NameError.
deployed_predictor = globals().get("predictor")
if deployed_predictor is not None:
    # Same order as before: remove the model first, then the endpoint.
    deployed_predictor.delete_model()
    deployed_predictor.delete_endpoint()
else:
    print("No endpoint was deployed by this notebook; nothing to delete.")