# 1. 定义Prompt，读取文件

In [1]:
# SYSTEM_PROMPT = "You are a rigorous and responsible image tampering (altering) detection expert. " \
#     "You can localize the exact tampered region and analyze your detection decision according to tampering clues at different levels. " \
#     "Assuming that you have detected this is a <FAKE> image and the manipulation type is [MANIPULATION_TYPE], " \
#     "the exact tampered region boundary is highlighted with color in this image (and your detection IS correct).\n" \
#     "Please provide the chain-of-clues supporting your detection decision in the following style: " \
#     "# high-level semantic anomalies (such as content contrary to common sense, inciting and misleading content), " \
#     "# middle-level visual defects (such as traces of tampered region or boundary, lighting inconsistency, perspective relationships, and physical constraints) and " \
#     "# low-level pixel statistics (such as noise, color, textural, sharpness, and AI-generation fingerprint), " \
#     "where the high-level anomalies are significant doubts worth attention, and the middle-level and low-level findings are reliable evidence." 


REAL_ANALYSIS_TEXT_LIST = [
    "The following analysis affirms the authenticity of the image, with observations categorized into high-level semantic coherence, " \
    "middle-level visual consistency, and low-level pixel statistics.\n\n" \
    "# High-Level Semantic Coherence\n\n" \
    "1. Alignment with Common Sense\n\n" \
    "[DETAILED-CAPTION]\n" \
    "The content is entirely plausible and aligns with real-world scenarios. The scene authentically reflects a natural and non-misleading setting.\n\n" \
    "# Middle-Level Visual Consistency\n\n" \
    "1. Absence of Boundary Traces or Irregularities\n\n" \
    "All regions of the image exhibit smooth transitions and natural continuity.\n\n" \
    "2. Coherent Lighting\n\n" \
    "The lighting across the image is consistent, with shadows, highlights, and reflections properly aligned to the light source.\n\n" \
    "3. Harmonious Perspective\n\n" \
    "The size, scale, and orientation of all elements are consistent with natural perspective rules. Spatial relationships between objects are logical.\n\n" \
    "4. Adherence to Physical Constraints\n\n" \
    "All interactions and arrangements of objects follow physical laws, such as gravity and balance.\n\n" \
    "# Low-Level Pixel Statistics\n\n" \
    "1. Uniform Color\n\n" \
    "The colors and tones are cohesive, with smooth gradients and consistent blending across the scene.\n\n" \
    "2. Homogeneous Texture and Sharpness\n\n" \
    "The texture and sharpness are evenly distributed, with no areas appearing artificially smoothed, grainy, or oversharpened.",

    "The following analysis supports the authenticity of the image, categorizing observations into high-level semantic coherence, " \
    "middle-level visual consistency, and low-level pixel statistics.\n\n" \
    "# High-Level Semantic Coherence\n\n" \
    "## Consistency with Common Sense\n\n" \
    "[DETAILED-CAPTION]\n" \
    "The image depicts an entirely plausible scenario that aligns with real-world expectations. The content reflects a natural and truthful setting with no misleading elements.\n\n" \
    "# Middle-Level Visual Consistency\n\n" \
    "## Consistent Lighting\n\n" \
    "The lighting across the image is coherent, with highlights and reflections consistently matching the direction of the light source.\n\n" \
    "## Compliance with Physical Constraints\n\n" \
    "The interactions and placements of objects adhere to physical laws, such as gravity and balance, ensuring that the scene is plausible in a real-world context.\n\n" \
    "## Consistent Perspective\n\n" \
    "The spatial relationships between elements are logical and free from distortion.\n\n" \
    "# Low-Level Pixel Statistics\n\n" \
    "## Cohesive Color Distribution\n\n" \
    "The colors and tones in the image are harmoniously distributed and align with the environment.\n\n" \
    "## Consistent Noise Patterns\n\n" \
    "The noise distribution across the image is uniform, with no abrupt changes or localized discrepancies that would indicate editing."
]

SYSTEM_PROMPT = "The following analysis supports the authenticity of the image, with observations categorized into high-level semantic coherence, middle-level visual consistency, and low-level pixel statistics.\
# High-Level Semantic Coherence\
Consistency with Common Sense\
[DETAILED-CAPTION]\
The content is entirely plausible, aligning with real-world expectations. The scene reflects a natural, truthful setting with no misleading elements.\
# Middle-Level Visual Consistency\
Consistent Lighting\
\
The lighting throughout the image is coherent, with shadows, highlights, and reflections properly aligned with the light source, creating a realistic appearance.\
Compliance with Physical Constraints\
\
All interactions and placements of objects adhere to physical laws, such as gravity and balance, ensuring that the scene is plausible in the real world.\
Consistent Perspective\
\
The spatial relationships between objects are logically arranged, with no distortion, and the size, scale, and orientation of elements align with natural perspective rules.\
# Low-Level Pixel Statistics\
Cohesive Color Distribution\
\
The colors and tones in the image are smooth and cohesive, with no abrupt transitions, ensuring a natural blend across the scene.\
Uniform Texture and Sharpness\
\
The texture and sharpness are evenly distributed, with no areas appearing artificially smoothed, grainy, or oversharpened.\
Consistent Noise Patterns\
\
The noise distribution across the image is uniform, with no localized discrepancies or abrupt changes that could suggest editing or manipulation."

In [2]:
data_set_path = "./datasets/CASIA2"
image_pth = data_set_path + "/images"
message = data_set_path + "/tampering_analysis"


In [3]:
import os

# 获取image_pth目录下的所有文件
image_files = [f for f in os.listdir(image_pth) if os.path.isfile(os.path.join(image_pth, f))]

# 获取message目录下的所有文件，
message_files = [f for f in os.listdir(message) if os.path.isfile(os.path.join(message, f))]

# 检查message中是否有对应的文件
for img in image_files:
    img_name = os.path.splitext(img)[0]  # 去除扩展名进行比较
    # 比较时忽略扩展名
    if any(os.path.splitext(mf)[0] == img_name for mf in message_files):
        # print(f"File '{img}' found in both image_pth and message.")
        pass
    else:
        print(f"File '{img}' not found in message.")

In [4]:
# 创建一个字典，将message_files中的文件名与对应的答案内容关联
answer_dict = {}

# 遍历message_files，读取每个文件的内容并填充到字典中
for message_file in message_files:
    message_path = os.path.join(message, message_file)  # 拼接消息文件的路径
    with open(message_path, 'r') as file:
        answer_content = file.read().strip()  # 假设每个文件包含一个文本答案
    # 去除扩展名并将文件内容作为答案存入字典
    answer_dict[os.path.splitext(message_file)[0]] = answer_content

## 定义base64转换函数

In [5]:
import base64
from io import BytesIO

def encode_image(image, quality=100):
    if image.mode != 'RGB':
        image = image.convert('RGB')  # Convert to RGB
    buffered = BytesIO()
    image.save(buffered, format="JPEG", quality=quality) 
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

由于我们在进行视觉微调时会同时使用文本和图像，因此我们将构建这些消息以包含这两种内容类型。对于每个训练样本，有关图像的问题将作为用户消息呈现，而相应的答案将作为辅助消息提供。

可以通过以下两种方式之一包含图像：

- 作为HTTP URL，引用图像的位置。
- 作为包含以base64编码的图像的数据 URL。
```python
{
    "messages": 
    [
        {
            "role": "system",
            "content": "Use the image to answer the question."
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is the title of this book?"},
                {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,<encoded_image>"}}
            ]
        }
    ]
}
```

In [6]:
import os
from tqdm import tqdm
import pandas as pd

ds_train = pd.DataFrame({
    'image': image_files,
    'question': ["Analyze the image in a comprehensive and detailed manner."] * len(image_files),
    'answer': [answer_dict.get(os.path.splitext(img)[0], "Answer not found") for img in image_files]  # 根据图像文件名获取对应答案
})


In [7]:
ds_train

Unnamed: 0,image,question,answer
0,Tp_S_NNN_S_N_art00096_art00096_01004.tif,Analyze the image in a comprehensive and detai...,"Through our analysis, we have discovered sever..."
1,Tp_D_NRN_S_N_ind00064_ind00060_10945.jpg,Analyze the image in a comprehensive and detai...,"We have found several clues, where high-level ..."
2,Tp_S_NRN_M_N_sec00061_sec00061_11263.jpg,Analyze the image in a comprehensive and detai...,"We have found several clues, where high-level ..."
3,Tp_S_CNN_M_N_txt00006_txt00006_10839.jpg,Analyze the image in a comprehensive and detai...,"We have identified the following clues, where ..."
4,Tp_S_NNN_S_B_ani00014_ani00014_20058.jpg,Analyze the image in a comprehensive and detai...,"We have identified the following clues, where ..."
...,...,...,...
5118,Tp_S_NRN_S_N_art00093_art00093_10486.tif,Analyze the image in a comprehensive and detai...,"We have identified the following clues, where ..."
5119,Tp_D_NRN_S_N_sec00087_ani00037_00720.tif,Analyze the image in a comprehensive and detai...,"After a thorough examination, we have identifi..."
5120,Tp_S_NNN_M_N_ani00074_ani00074_10445.tif,Analyze the image in a comprehensive and detai...,"We have identified the following clues, where ..."
5121,Tp_S_NRD_S_N_ind00011_ind00011_01309.tif,Analyze the image in a comprehensive and detai...,"We have identified the following clues, where ..."


In [8]:
from sklearn.model_selection import train_test_split

# 假设ds_train是包含所有数据的原始数据集
# 打乱数据并划分为训练集、验证集和测试集
train_data, temp_data = train_test_split(ds_train, test_size=0.2, random_state=42)  # 80% 训练集
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # 10% 验证集，10% 测试集

# 将划分后的数据集保存为新的变量
ds_train_train = train_data
ds_train_validation = validation_data
ds_train_test = test_data

# 打印每个数据集的大小
print(f"Training set size: {len(ds_train_train)}")
print(f"Validation set size: {len(ds_train_validation)}")
print(f"Test set size: {len(ds_train_test)}")

Training set size: 4098
Validation set size: 512
Test set size: 513


### 生成train

In [9]:
from PIL import Image

# 构建训练集
json_data = []

for idx, example in tqdm(ds_train_train.iterrows()):
    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}]
    }
    
    # 打开图像文件
    image_path = os.path.join(image_pth, example['image'])  # 拼接图像路径
    with Image.open(image_path) as img:
        # 编码图像
        encoded_image = encode_image(img, quality=50)

    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": f"Question [{idx}]: {example['question']}"},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}
        ]
    }
    
    assistant_message = {
        "role": "assistant",
        "content": [{"type": "text", "text": example["answer"]}]
    }

    all_messages = [system_message] + [user_message, assistant_message]
    
    json_data.append({"messages": all_messages})

4098it [00:12, 317.85it/s]


In [10]:
# save the JSON data to a file
with open("forgery-detection-CASIA2-train.jsonl", "w") as f:
    for message in json_data:
        json.dump(message, f)
        f.write("\n")

### 生成test， validation

In [11]:
# 构建验证集的JSON数据
json_data = []

for idx, example in tqdm(ds_train_validation.iterrows()):
    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}]
    }
    
    # 打开图像文件
    image_path = os.path.join(image_pth, example['image'])  # 拼接图像路径
    with Image.open(image_path) as img:
        # 编码图像
        encoded_image = encode_image(img, quality=50)

    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": f"Question [{idx}]: {example['question']}"},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}
        ]
    }

    assistant_message = {
        "role": "assistant",
        "content": [{"type": "text", "text": example["answer"]}]
    }

    all_messages = [system_message] + [user_message, assistant_message]
    
    json_data.append({"messages": all_messages})

# 保存验证集的JSON数据到文件
with open("forgery-detection-CASIA2-validation.jsonl", "w") as f:
    for message in json_data:
        json.dump(message, f)
        f.write("\n")

512it [00:01, 324.24it/s]


In [12]:
# 构建测试集的JSON数据
json_data = []

for idx, example in tqdm(ds_train_test.iterrows()):
    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}]
    }
    
    # 打开图像文件
    image_path = os.path.join(image_pth, example['image'])  # 拼接图像路径
    with Image.open(image_path) as img:
        # 编码图像
        encoded_image = encode_image(img, quality=50)

    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": f"Question [{idx}]: {example['question']}"},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}
        ]
    }

    assistant_message = {
        "role": "assistant",
        "content": [{"type": "text", "text": example["answer"]}]
    }

    all_messages = [system_message] + [user_message, assistant_message]
    
    json_data.append({"messages": all_messages})

# 保存测试集的JSON数据到文件
with open("forgery-detection-CASIA2-test.jsonl", "w") as f:
    for message in json_data:
        json.dump(message, f)
        f.write("\n")

513it [00:01, 328.80it/s]
