In [1]:
import json
import torch
import os
import sys
import pandas as pd


from PIL import Image
from typing import Dict, List, Tuple
from torch.utils.data import Dataset
from dataclasses import dataclass
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
@dataclass
class text_image:
    """Bundle of the tensors that make up one question/answer/image sample."""

    # Token ids of the question (prompt) side.
    q_input_ids: torch.Tensor
    # Pixel tensor for the sample's image(s).
    pixel_values: torch.Tensor
    # Token ids of the answer side.
    a_input_ids: torch.Tensor
    # presumably the per-image (t, h, w) grid returned by the processor — TODO confirm
    image_grid_thw: torch.Tensor

In [3]:
class LlavaDataset(Dataset):
    """Map-style dataset over a JSON file of chat records with attached images.

    Each record is expected to carry a ``messages`` list (first entry is the
    human turn, second entry is the model turn) and an ``images`` list of
    paths that are joined onto ``image_dir``.
    """

    def __init__(self, data_file, image_dir):
        """Read ``data_file`` with pandas and keep its rows as a list of dicts.

        Args:
            data_file: Path of the JSON annotation file.
            image_dir: Directory that every image path of a record is joined onto.
        """
        super().__init__()

        self.data_file = data_file
        self.image_dir = image_dir
        frame = pd.read_json(data_file)
        self.chat_data = frame.to_dict(orient="records")

    def __len__(self):
        """Number of records loaded from the JSON file."""
        return len(self.chat_data)

    def __getitem__(self, index):
        """Return ``(human_input, gpt_output, image_paths)`` for record ``index``."""
        record = self.chat_data[index]
        turns = record['messages']
        question = turns[0]['content']
        answer = turns[1]['content']
        image_paths = []
        for relative_path in record['images']:
            image_paths.append(os.path.join(self.image_dir, relative_path))
        return question, answer, image_paths

In [4]:
# Smoke test: build the dataset from the demo JSON, using '/' as the image root.
test_data = LlavaDataset('/mnt/new_disk/qjw/my_llm_code/sft/mllm_demo.json','/')

In [5]:
# Display the first sample as a (question, answer, image paths) tuple.
test_data[0]

('<image>Who are they?',
 "They're Kane and Gretzka from Bayern Munich.",
 ['/mllm_demo_data/1.jpg', '/mllm_demo_data/1.jpg'])

In [6]:
# Load the processor from the local Qwen2-VL-7B-Instruct checkpoint directory.
processor = AutoProcessor.from_pretrained('/mnt/new_disk/qjw/ckpt/qwen/Qwen2-VL-7B-Instruct')

In [7]:
# Inspect how the tokenizer encodes the literal '<image>' placeholder string.
processor.tokenizer('<image>')

{'input_ids': [27, 1805, 29], 'attention_mask': [1, 1, 1]}

In [8]:
# Run one image (passed twice) and a short user message through the processor.
raw_images = Image.open('/mnt/new_disk/qjw/assert/498339.jpg')
# Distinct names for the raw sentence and the chat message list, so that `text`
# is only ever bound to the final chat-templated prompt string.
caption = 'this is a image'
chat = [
    {"role":"user","content":caption},
]
text = processor.apply_chat_template(
    chat, tokenize=False, add_generation_prompt=True
)
inputs = processor(text=text, images=[raw_images,raw_images], return_tensors="pt")

In [9]:
# Shape of the pixel tensor the processor returned for the two images.
inputs.pixel_values.shape

torch.Size([7992, 1176])

In [10]:
# Tokenize the raw sentence on its own, without the chat template.
processor.tokenizer('this is a image',padding="longest")

{'input_ids': [574, 374, 264, 2168], 'attention_mask': [1, 1, 1, 1]}

In [11]:
# Shape of the token ids the processor produced for the chat-templated prompt.
inputs['input_ids'].shape

torch.Size([1, 23])

In [12]:
# def process_qa_and_image(qa_and_image : Tuple[str,str,List[str]], processor : AutoProcessor):
#     # Expects one group of data to be passed in directly
    
#     human_input, gpt_output, images = qa_and_image

#     message = [
#         {"role":"user","content":human_input}
#     ]

#     # Before applying the chat template, convert to the standard message format (see the ModelScope site for details) so samples can be batched
#     prompt = processor.apply_chat_template(
#         message, tokenize=False, add_generation_prompt=True
#     )
    
#     # 官方处理吗
#     raw_images = [Image.open(image) for image in images]

#     inputs = processor(text=prompt, images=raw_images, return_tensors="pt")  # .to(0, torch.float16)
#     # Do not pass the labels in here as well; process them separately so they are easy to concatenate later
#     # inputs = dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])
#     # input_ids.shape = torch.Size([1, 23])

#     a_input_ids = processor.tokenizer(
#         gpt_output,
#         return_tensors='pt',
#         padding="longest",
#         truncation=True,
#     )['input_ids']

#     return text_image(
#         q_input_ids=inputs['input_ids'],
#         pixel_values=inputs['pixel_values'],
#         a_input_ids=a_input_ids,
#         image_grid_thw=inputs['image_grid_thw']
#     )

    



    

In [67]:
def process_qa_and_image(qa_and_image : List[Tuple[str,str,List[str]]], processor : "AutoProcessor"):
    """Convert a batch of (question, answer, image paths) samples into model inputs.

    Every question string is split on the literal ``<image>`` placeholder and
    rebuilt as a list of ``{"type": "text"}`` / ``{"type": "image"}`` content
    parts; the k-th placeholder is bound to ``images[k]``. Images beyond the
    number of placeholders are not used. The resulting single-turn user
    conversations are chat-templated and passed to ``processor`` together with
    the vision inputs extracted by ``process_vision_info``.

    Args:
        qa_and_image: Batch of ``(human_input, gpt_output, images)`` tuples.
        processor: Object providing ``apply_chat_template``, ``__call__`` and a
            ``tokenizer`` attribute (annotation is quoted so it is not evaluated
            when the function is defined).

    Returns:
        ``(q_inputs_list, a_input_ids)``: the processor output for the prompts
        (called with ``padding=False`` and no ``return_tensors``) and the
        tokenizer's ``input_ids`` for the answers.

    Raises:
        IndexError: If a question contains more ``<image>`` placeholders than
            the sample has image paths.
    """
    placeholder = '<image>'

    messages = []
    gpt_output_list = []
    for sample_idx, (human_input, gpt_output, images) in enumerate(qa_and_image):
        gpt_output_list.append(gpt_output)

        segments = human_input.split(placeholder)
        # N segments means N-1 placeholders, each of which consumes one image.
        placeholder_count = len(segments) - 1
        if placeholder_count > len(images):
            raise IndexError(
                f"sample {sample_idx}: question has {placeholder_count} '<image>' "
                f"placeholder(s) but only {len(images)} image path(s) were given"
            )

        content = []
        for idx, segment in enumerate(segments):
            if segment != '':
                content.append({
                    "type":"text",
                    "text":segment
                })

            # An image part follows every segment except the last one.
            if idx < placeholder_count:
                content.append({
                    "type":"image",
                    "image":images[idx]
                })

        # Each sample is a conversation, i.e. a list of messages, not a bare dict.
        messages.append([{
            "role":"user",
            "content":content,
        }])

    # ==================== official template handling ====================
    # Messages must be in the standard format before the chat template is
    # applied, so that the samples can be batched.
    prompt = [
        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        for msg in messages
    ]

    image_inputs, video_inputs = process_vision_info(messages)

    # padding=False keeps the prompts ragged; padding is left to the caller.
    q_inputs_list = processor(
        text=prompt,
        images=image_inputs,
        videos=video_inputs,
        padding=False,
    )
    # ==================== official template handling ====================

    # Answers are tokenized separately from the prompts.
    a_input_ids = processor.tokenizer(
        gpt_output_list,
        truncation=True,
    )['input_ids']

    return q_inputs_list, a_input_ids

In [68]:
# concat使用示例
x = torch.rand(1,10)
y = torch.concat(
    [x,x],dim=1
)
y.shape # torch.Size([1, 20])

torch.Size([1, 20])

In [69]:
# torch.full_like keeps the shape and dtype of its input and fills it with -100.
torch.full_like(torch.rand(1,5), fill_value=-100)
# tensor([[-100., -100., -100., -100., -100.]])

tensor([[-100., -100., -100., -100., -100.]])

In [70]:
# Sanity check: torch.full with a zero-sized shape returns an empty tensor.
torch.full((0,), fill_value=-100)

tensor([], dtype=torch.int64)

In [71]:
class LlavaDataCollator:
    """Collate raw ``(question, answer, image paths)`` samples into one padded SFT batch.

    The prompt side of every sample is produced by ``process_qa_and_image``.
    This class appends the answer and an EOS token, masks the prompt in the
    labels, right-pads the batch and builds the attention mask.
    """

    def __init__(self,processor: "AutoProcessor"):
        """Store the processor.

        Args:
            processor: Must expose ``tokenizer.eos_token_id`` and
                ``tokenizer.pad_token_id`` (annotation is quoted so it is not
                evaluated when the class is defined).
        """
        self.processor = processor
        # Label value for positions that must not contribute to the loss.
        self.ignore_index = -100

    @staticmethod
    def _pad_right(sequence, target_len, fill_value):
        """Right-pad a ``[1, L]`` tensor to ``[1, target_len]`` with ``fill_value``."""
        padding = torch.full(
            (1, target_len - sequence.shape[1]), fill_value, dtype=sequence.dtype
        )
        return torch.concat([sequence, padding], dim=1)

    def processqa_for_train(self, q_input_ids, a_input_ids):
        """Build one training sequence and its labels from a prompt and an answer.

        Args:
            q_input_ids: Prompt token ids, as a list or a ``[1, X]`` tensor.
            a_input_ids: Answer token ids, as a list or a ``[1, Y]`` tensor.

        Returns:
            ``(inputs, labels)``, both int64 tensors of shape ``[1, X + Y + 1]``:
            ``inputs`` is prompt + answer + EOS; ``labels`` has ``ignore_index``
            over the prompt and keeps the answer and the EOS token.
        """
        # Convert each argument independently and force int64, so that an empty
        # list does not become a float tensor and mixed list/tensor input works.
        q_input_ids = torch.as_tensor(q_input_ids, dtype=torch.long).reshape(1, -1)
        a_input_ids = torch.as_tensor(a_input_ids, dtype=torch.long).reshape(1, -1)

        # SFT: the sequence must end with EOS, and EOS is part of the loss.
        eos_token_id = torch.tensor(
            self.processor.tokenizer.eos_token_id, dtype=torch.long
        ).reshape(1, -1)

        inputs = torch.concat(
            [q_input_ids, a_input_ids, eos_token_id],
            dim=1
        )
        # The instruction (prompt) part of the labels is set to ignore_index.
        labels = torch.concat(
            [torch.full_like(q_input_ids, fill_value=self.ignore_index), a_input_ids, eos_token_id],
            dim=1
        )

        return inputs, labels

    def __call__(self, features: List) -> Dict[str, torch.Tensor]:
        """Turn a list of dataset samples into a batch dictionary.

        Args:
            features: Samples as accepted by ``process_qa_and_image``.

        Returns:
            Dict with ``input_ids``, ``labels``, ``attention_mask`` (all
            ``[batch, max_len]``, right-padded), plus ``pixel_values`` and
            ``image_grid_thw`` converted to tensors from the processor output.
        """
        q_inputs_list, a_input_ids = process_qa_and_image(features,self.processor)

        # Assemble prompt + answer + EOS for every sample.
        # NOTE: only padding is done here; real SFT would also truncate.
        input_ids_list = []
        label_ids_list = []
        for q, a in zip(q_inputs_list['input_ids'], a_input_ids):
            input_ids, labels = self.processqa_for_train(q, a)
            input_ids_list.append(input_ids)
            label_ids_list.append(labels)

        lengths = [input_ids.shape[1] for input_ids in input_ids_list]
        max_len = max(lengths, default=0)

        pad_token_id = self.processor.tokenizer.pad_token_id

        # Right padding; the side does not matter for training.
        final_input_ids = torch.concat(
            [self._pad_right(input_ids, max_len, pad_token_id) for input_ids in input_ids_list],
            dim=0
        )
        # Labels are padded with ignore_index, not the pad token id, so that
        # padding positions are excluded from the loss.
        final_labels_ids = torch.concat(
            [self._pad_right(labels_ids, max_len, self.ignore_index) for labels_ids in label_ids_list],
            dim=0
        )

        # pixel_values needs no padding; Qwen-VL also needs image_grid_thw.
        final_pixel_values = torch.tensor(q_inputs_list['pixel_values'])
        final_image_grid_thw = torch.tensor(q_inputs_list['image_grid_thw'])

        # Mask by true sequence length rather than by comparing against the pad
        # id, so a real token that equals the pad id is never masked out.
        positions = torch.arange(max_len).unsqueeze(0)
        attention_mask = (
            positions < torch.tensor(lengths, dtype=torch.long).unsqueeze(1)
        ).to(final_input_ids.dtype)

        return {
            "input_ids":final_input_ids,
            "labels":final_labels_ids,
            "pixel_values":final_pixel_values,
            'image_grid_thw':final_image_grid_thw,
            "attention_mask":attention_mask
        }




In [82]:
from torch.utils.data import DataLoader
# Rebinds `processor` to the Qwen2-VL-2B-Instruct checkpoint (an earlier cell loaded the 7B one).
processor = AutoProcessor.from_pretrained('/mnt/new_disk/qjw/ckpt/Qwen/Qwen2-VL-2B-Instruct')
# Training set: JSON annotations plus the directory that image paths are joined onto.
dataset = LlavaDataset('/mnt/new_disk/qjw/my_llm_code/sft/train_data.json','/mnt/new_disk/qjw/LLaMA-Factory/data')

In [83]:
# Batch size 2; LlavaDataCollator turns each list of samples into a batch dict.
data_loader = DataLoader(dataset,2,collate_fn=LlavaDataCollator(processor=processor))

In [85]:
# Iterate the loader and print each batch's input_ids shape and image_grid_thw tensor.
for batch in data_loader:
    print(batch['input_ids'].shape)
    print(batch['image_grid_thw'])

torch.Size([2, 3796])
tensor([[  1, 172,  78],
        [  1, 172,  78]])
torch.Size([2, 4013])
tensor([[  1, 170,  82],
        [  1, 172,  78]])
torch.Size([2, 5117])
tensor([[  1, 198,  92],
        [  1, 148,  84]])
torch.Size([2, 4373])
tensor([[  1, 180,  84],
        [  1, 164,  92]])
torch.Size([2, 4689])
tensor([[  1, 146, 110],
        [  1,  92,  52]])
torch.Size([2, 2942])
tensor([[  1, 138,  78],
        [  1,  72,  84]])
torch.Size([2, 4120])
tensor([[  1, 180,  84],
        [  1, 180,  84]])
torch.Size([2, 4428])
tensor([[  1,  92,  52],
        [  1, 182,  84]])
torch.Size([2, 4544])
tensor([[  1, 192,  88],
        [  1, 172,  78]])


KeyboardInterrupt: 