pip install Pillow imagehash pandas "openai<1"

### 完整流程 ###

1. 複製子目錄下所有圖片(jpg png jpeg)集中到一個資料夾，以原資料夾位置做檔名前綴用底線隔開\
e.g. reddit_antimeme_Abort_0eohs2f78se21.jpg

In [None]:
import os
import shutil

def copy_images(source_directory, target_directory):
    """Flatten every image under ``source_directory`` into ``target_directory``.

    The source tree is walked recursively and every ``.jpg`` / ``.png`` /
    ``.jpeg`` file (extension matched case-insensitively) is copied into the
    single target folder.  Each copy is named after the sub-directory it came
    from, relative to ``source_directory``, with path separators replaced by
    underscores, followed by the original file name
    (``a/b/x.jpg`` -> ``a_b_x.jpg``).  Files located directly in
    ``source_directory`` keep their plain name.

    Note that two different sources can map to the same flattened name
    (``a_b/x.jpg`` and ``a/b_x.jpg``); the later copy overwrites the earlier.

    Args:
        source_directory: Root of the tree to scan.
        target_directory: Folder that receives the copies; created if missing.
    """
    image_extensions = ('.jpg', '.png', '.jpeg')
    os.makedirs(target_directory, exist_ok=True)
    target_root = os.path.abspath(target_directory)

    for dirpath, dirnames, filenames in os.walk(source_directory):
        # Prune the target folder when it lives inside the source tree,
        # otherwise copies made by an earlier run would be copied again.
        dirnames[:] = [
            name for name in dirnames
            if os.path.abspath(os.path.join(dirpath, name)) != target_root
        ]

        relative_path = os.path.relpath(dirpath, source_directory)
        # relpath() returns "." for the root itself; using it as a prefix
        # would produce hidden "._name" files, so root files get no prefix.
        if relative_path == os.curdir:
            prefix = ''
        else:
            prefix = relative_path.replace(os.sep, '_') + '_'

        for filename in filenames:
            if not filename.lower().endswith(image_extensions):
                continue
            file_path = os.path.join(dirpath, filename)
            target_path = os.path.join(target_directory, prefix + filename)
            if os.path.abspath(file_path) == os.path.abspath(target_path):
                # Source and destination are the same file; nothing to copy.
                continue
            shutil.copy(file_path, target_path)
            print(f'Copied "{file_path}" to "{target_path}"')

# Example usage
source_directory = 'reddit'  # root of the source tree that is scanned for images
target_directory = 'redditall/reddit'  # flat folder that receives the copies

copy_images(source_directory, target_directory)


2. 把資料夾裡所有子目錄的 csv 合併成一個，並在最後新增一欄，記錄每筆資料來自哪個 csv 檔（該 csv 檔的路徑）\
注意：執行前請確認根目錄與輸出 csv 的儲存目錄是否正確

In [None]:
import pandas as pd
import os

def merge_csv_files(root_directory, output_path='redditall/combined_data.csv'):
    """Merge every CSV found below ``root_directory`` into a single CSV file.

    Files named ``error.csv``, zero-byte files, files with nothing to parse
    and files that contain no data rows are skipped.  Every remaining row
    gets an extra ``Source CSV Path`` column holding the path of the CSV it
    was read from.

    Args:
        root_directory: Root of the tree that is searched recursively.
        output_path: Where the merged CSV is written.  Its parent directory
            is created when missing.

    Returns:
        The merged DataFrame.  When no usable CSV is found an empty DataFrame
        is returned and no file is written.
    """
    output_abspath = os.path.abspath(output_path)
    frames = []
    for subdir, _dirs, files in os.walk(root_directory):
        for file in files:
            if not file.endswith('.csv') or file == 'error.csv':
                continue
            file_path = os.path.join(subdir, file)
            if os.path.abspath(file_path) == output_abspath:
                # Never feed a previous run's output back into the merge.
                continue
            if os.stat(file_path).st_size == 0:
                print(f"Skipped empty file: {file_path}")
                continue
            try:
                df = pd.read_csv(file_path)
            except pd.errors.EmptyDataError:
                # Non-zero size but nothing to parse (e.g. only blank lines).
                print(f"Skipped empty file: {file_path}")
                continue
            if not df.empty:
                df['Source CSV Path'] = file_path
                frames.append(df)

    if not frames:
        # pd.concat() raises ValueError on an empty list, so stop here.
        print(f"No CSV data found under: {root_directory}")
        return pd.DataFrame()

    combined_df = pd.concat(frames, ignore_index=True)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    combined_df.to_csv(output_path, index=False)
    return combined_df

# Run the merge; the argument is the root directory that is searched for CSVs.
merge_csv_files('reddit')


3. 整理csv的亂資料\
csv檔只留下URL欄位以.png .jpg .jpeg .gif結尾的資料

In [None]:
import pandas as pd

# Load the merged CSV.
df = pd.read_csv('redditall/combined_data.csv')

# Keep only the rows whose 'URL' value ends with an image extension.
# The comparison is case-insensitive, and na=False counts a missing URL as
# "no match": without it the mask contains NaN and boolean indexing raises.
image_suffixes = ('.png', '.jpg', '.jpeg', '.gif')
filtered_df = df[df['URL'].str.lower().str.endswith(image_suffixes, na=False)]

# Overwrite the merged CSV in place with the filtered rows.
filtered_df.to_csv('redditall/combined_data.csv', index=False)


4. 用hash刪一些看起來幾乎一樣的，並用txt紀錄刪除的圖片

In [None]:
from PIL import Image
import imagehash
import os

def find_duplicates_with_phash(directory, hash_size=8):
    """Group the images in ``directory`` that share a perceptual hash.

    Only files directly inside ``directory`` whose name ends in ``.png``,
    ``.jpg`` or ``.jpeg`` (case-insensitive) are examined.

    Args:
        directory: Folder containing the images to compare.
        hash_size: Passed through to ``imagehash.phash``.

    Returns:
        A dict mapping each hash that occurs more than once to the list of
        file paths sharing it.  The first entry of every list is the file
        that was encountered first.
    """
    first_seen = {}
    duplicates = {}
    # os.listdir() returns names in arbitrary order; sorting makes the
    # "first" file of every group the same on each run.
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue
        filepath = os.path.join(directory, filename)
        try:
            # The context manager releases the file handle as soon as the
            # hash has been computed.
            with Image.open(filepath) as img:
                phash = imagehash.phash(img, hash_size)
        except OSError as error:
            # One unreadable or corrupt file must not abort the whole scan.
            print(f"Skipped unreadable image: {filepath} ({error})")
            continue
        if phash in first_seen:
            duplicates.setdefault(phash, [first_seen[phash]]).append(filepath)
        else:
            first_seen[phash] = filepath
    return duplicates


def delete_duplicates(duplicates):
    """Delete all but the first file of every duplicate group after confirmation.

    The files that would be removed are printed, then the user is asked to
    type ``ok``.  Nothing is deleted without that confirmation, and no prompt
    is shown when there is nothing to delete.

    Args:
        duplicates: Mapping of group key -> list of file paths.  The first
            path of each list is kept.

    Returns:
        The list of paths that were actually removed (empty when there was
        nothing to delete or the user declined).
    """
    # Everything after the first path of a group is a deletion candidate.
    candidates = [path for paths in duplicates.values() for path in paths[1:]]
    if not candidates:
        return []

    for path in candidates:
        print(f"Ready to delete: {path}")

    confirmation = input("Type 'ok' to confirm deletion: ")
    if confirmation.strip().lower() != 'ok':
        return []

    deleted_files = []
    for path in candidates:
        try:
            os.remove(path)
        except OSError as error:
            # Keep going so the returned list still reflects what was removed.
            print(f"Could not delete {path}: {error}")
        else:
            deleted_files.append(path)
    return deleted_files

# Folder that holds the images to de-duplicate.
directory = 'redditall/reddit'
duplicates = find_duplicates_with_phash(directory)

# Print every group of images that share a perceptual hash.
for key, paths in duplicates.items():
    if len(paths) > 1:
        print(f"Duplicate group based on perceptual hash {key}:")
        for path in paths:
            print(f"  {path}")

# Ask for confirmation, delete, then report what was removed.
deleted_files = delete_duplicates(duplicates)
if deleted_files:
    print(f"Deleted {len(deleted_files)} duplicate files:")
    for deleted_file in deleted_files:
        print(deleted_file)
    # Record the removed paths.  UTF-8 is set explicitly so that non-ASCII
    # file names do not depend on the platform's default encoding.
    with open('redditall/delet.txt', 'w', encoding='utf-8') as log_file:
        log_file.writelines(f"{deleted_file}\n" for deleted_file in deleted_files)
else:
    print("No files were deleted.")

第四步結果：剩餘 1293 張圖片

5. 資料夾的所有圖片用GPT摘要兩句話，個別存成txt

In [None]:
#資料夾的所有圖片用GPT摘要兩句話，個別存成txt
import os
import base64
import openai

def image_to_base64(image_path):
    """Return the content of the file at ``image_path`` as a base64 string."""
    with open(image_path, "rb") as stream:
        raw_bytes = stream.read()
    return base64.b64encode(raw_bytes).decode("utf-8")

def describe_image(image_path):
    """Ask the chat model to explain the meme at ``image_path`` in two sentences.

    The image is embedded in the request as a base64 data URL whose MIME type
    is derived from the file extension, so a ``.jpg`` file is no longer
    announced as ``image/png``.

    Args:
        image_path: Path of the image file to describe.

    Returns:
        The text content of the first choice in the model's response.
    """
    # NOTE(review): openai.ChatCompletion is the pre-1.0 interface of the
    # openai package -- confirm the installed version still provides it.
    import mimetypes  # local import keeps this notebook cell self-contained

    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        # Unknown extension: fall back to the type that was used previously.
        mime_type = "image/png"

    base64_image = image_to_base64(image_path)
    response = openai.ChatCompletion.create(
        model="gpt-4-turbo",
        temperature=0,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text",
                     "text": "這是一張可能與墮胎相關的迷因，用兩句話解釋這張迷因"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}
                    },
                ],
            }
        ],
        max_tokens=300,
    )
    return response.choices[0].message.content

# Folder that holds the images to describe.
folder_path = "redditall/reddit"
# Describe every png/jpg/jpeg image in the folder and store each description
# in a .txt file next to the image.
for filename in os.listdir(folder_path):
    # Case-insensitive, so files such as "X.JPG" are not silently skipped.
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue
    image_path = os.path.join(folder_path, filename)
    try:
        description = describe_image(image_path)
        print(description)

        # splitext() removes only the final extension; splitting on the first
        # "." would truncate any path that contains another dot.
        output_path = os.path.splitext(image_path)[0] + '.txt'
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(description)
    except Exception as e:
        # Best effort: one failed image must not stop the batch, but the
        # message now says which file failed.
        print(f"Failed to describe {image_path}: {e}")
        


6. 手動分類\
顯示 圖片ID、文章標題、GPT API寫的兩句摘要、圖片，輸入分類編號點下ok會自動換下一張圖片，每分類完一張圖片就會存成csv檔，檔案裡有三個欄位：圖片ID、圖片路徑位置、分類編號，最後分類完所有圖片，得到一個有所有圖片的csv檔

In [None]:
import pandas as pd
import tkinter as tk
from tkinter import filedialog, simpledialog, Label, Button
from PIL import Image, ImageTk
import os

class ImageReviewer(tk.Tk):
    """Tk window for labelling the images of a folder one at a time.

    For every image the window shows the image ID (the part of the file name
    after the last underscore), the title of the row in
    ``redditall/combined_data.csv`` whose ``URL`` contains that ID, the text
    of the summary file stored next to the image, and the image itself.  The
    user's input is appended to ``self.data``, which is rewritten to
    ``redditall/classification.csv`` after every image.
    """

    # Extensions (lower-case) treated as images when a folder is opened.
    IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')

    def __init__(self):
        """Build the widgets and the empty result table."""
        super().__init__()
        self.title('图像评论工具')
        self.label = Label(self, text='点击“打开”开始评论图像。')
        self.label.pack(pady=20)

        self.open_button = Button(self, text='打开文件夹', command=self.open_directory)
        self.open_button.pack(pady=10)

        self.image_name_label = Label(self, text='', font=('Arial', 14))
        self.image_name_label.pack(pady=5)

        self.title_label = Label(self, text='', wraplength=400)
        self.title_label.pack(pady=5)

        self.article_label = Label(self, text='', wraplength=400)
        self.article_label.pack(pady=5)

        self.image_label = Label(self)
        self.image_label.pack(pady=10)

        self.data = pd.DataFrame(columns=['Image ID', 'File Path', 'User Input'])

        # Filled in by open_directory(); defined here so the attributes
        # always exist.
        self.images = []
        self.image_index = 0
        self.articles = None

    @staticmethod
    def _split_image_name(img_path):
        """Return ``(stem, image_id)`` for ``img_path``.

        ``stem`` is the file name without its extension; ``image_id`` is the
        part of the stem after the last underscore.
        """
        stem, _ = os.path.splitext(os.path.basename(img_path))
        return stem, stem.split('_')[-1]

    def load_articles(self):
        """Load the merged article CSV into ``self.articles``."""
        self.articles = pd.read_csv('redditall/combined_data.csv')

    def open_directory(self):
        """Let the user pick a folder and start reviewing its images."""
        directory = filedialog.askdirectory()
        if not directory:
            # Dialog cancelled: askdirectory() gave back an empty value and
            # os.listdir('') would raise.
            return
        # Sorted by file name; extension matched case-insensitively.
        self.images = sorted(
            os.path.join(directory, f)
            for f in os.listdir(directory)
            if f.lower().endswith(self.IMAGE_EXTENSIONS)
        )
        self.load_articles()
        self.image_index = 0
        self.show_image()

    def show_image(self):
        """Show the current image, ask for input, save, and schedule the next."""
        if self.image_index >= len(self.images):
            return

        img_path = self.images[self.image_index]
        stem, img_name = self._split_image_name(img_path)

        self.image_name_label.config(text=f"图片名称: {img_name}")

        # resize() returns a new image, so the file can be closed right away.
        with Image.open(img_path) as img:
            resized = img.resize((400, 400), Image.Resampling.LANCZOS)
        photo_img = ImageTk.PhotoImage(resized)
        self.image_label.config(image=photo_img)
        self.image_label.image = photo_img  # keep a reference so Tk can draw it

        # regex=False: the ID is matched as plain text, not as a pattern.
        mask = self.articles['URL'].str.contains(img_name, case=False, na=False, regex=False)
        if mask.any():
            article_title = self.articles.loc[mask, 'Title'].values[0]
        else:
            article_title = "无相关文章标题"

        # The summary step names its output after the full image stem
        # ("<prefix>_<id>.txt"); "<id>.txt" is still tried as a fallback.
        article_content = "未找到相关文本文件"
        folder = os.path.dirname(img_path)
        for candidate in (stem + '.txt', img_name + '.txt'):
            try:
                with open(os.path.join(folder, candidate), 'r', encoding='utf-8') as file:
                    article_content = file.read()
                break
            except FileNotFoundError:
                continue
        else:
            print(f"未找到 {img_name}.txt 文件。")

        self.title_label.config(text=article_title)
        self.article_label.config(text=article_content)

        user_input = simpledialog.askstring("输入", "请输入您的评论:")
        new_row = pd.DataFrame([[img_name, img_path, user_input]], columns=['Image ID', 'File Path', 'User Input'])
        self.data = pd.concat([self.data, new_row], ignore_index=True)
        # Saved after every image so an interrupted session loses nothing.
        self.data.to_csv('redditall/classification.csv', index=False)

        print(f"Image ID: {img_name}, Comment: {user_input}")

        self.image_index += 1
        if self.image_index < len(self.images):
            self.after(1000, self.show_image)  # show the next image after one second
        else:
            self.label.config(text='所有图像评论完成。数据已保存。')

# Create the review window and run the Tk main loop.
app = ImageReviewer()
app.mainloop()