In [8]:
import sys
# Put the local ./fake_flash_attn directory at the front of the module search
# path, so a module found there is imported in preference to an installed one.
# NOTE(review): presumably this directory holds a stub `flash_attn` package so
# that the Florence-2 remote code can be imported without the real dependency
# -- confirm against the directory contents.
sys.path.insert(0, "./fake_flash_attn")

In [None]:
# Install the pinned library versions with the explicit %pip magic; the bare
# `pip ...` form only works while IPython's automagic is enabled and no
# variable named `pip` exists.
%pip install -U "transformers==4.40.2" "peft==0.10.0"

In [None]:
# Explicit %pip magic (does not rely on automagic).
# NOTE(review): these two packages are unpinned -- consider pinning versions.
%pip install einops timm

In [2]:
import transformers, peft

# Show the versions that are actually importable in this kernel, one per line
# (transformers first, then peft).
print(transformers.__version__, peft.__version__, sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


4.40.2
0.10.0


In [3]:
import os, csv, json
from tqdm import tqdm
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from peft import PeftModel


In [8]:
# ==== Directories of the best checkpoints to evaluate ====
# Commented-out entries are other runs that are currently not evaluated.
best_model_dirs = [
    #"D:/ucl-dissertation/florence2basetrain/florence2-lora-bs4_lr5e-06_ep15/best",
    #"D:/ucl-dissertation/florence2basetrain/florence2-lora-bs4_lr2e-05_ep15/best",
    #"D:/ucl-dissertation/florence2basetrain/florence2-lora-bs8_lr5e-06_ep15/best",
    #"D:/ucl-dissertation/florence2basetrain/florence2-lora-bs2_lr1e-05_ep15/best",
    "D:/ucl-dissertation/florence2basetrain/florence2-lora-bs4_lr1e-05_ep30_sp/best",
]

# ==== Image directory ====
img_dir = r"D:\ucl-dissertation\mapillary\processed"

# ==== Validation-set records ====
val_jsonl = r"D:\ucl-dissertation\mapillary\captions_val_3.jsonl"
# The file holds one JSON object per line. Blank lines (for example an extra
# newline at the end of the file) are skipped instead of being handed to
# json.loads, which would raise on them.
with open(val_jsonl, "r", encoding="utf-8") as f:
    val_ds = [json.loads(line) for line in f if line.strip()]

In [9]:
# Florence-2 base checkpoint id and the device used for inference.
# Only constants are defined here; the model is loaded where they are used.

CKPT = "microsoft/Florence-2-base"
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


In [10]:
# ==== Inference helper ====
def infer_caption(model, processor, img_path, device):
    """Generate a caption for one image with the ``<CAPTION>`` task prompt.

    Args:
        model: Object whose ``generate`` accepts ``input_ids``,
            ``pixel_values``, ``max_new_tokens`` and ``num_beams``.
        processor: Callable that turns ``text``/``images`` into model inputs
            (the result must support ``.to(device)`` and expose ``input_ids``
            and ``pixel_values``) and that provides ``batch_decode``.
        img_path: Path of the image file to caption.
        device: Device the processed inputs are moved to before generation.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    # Open the file in a ``with`` block so its handle is closed as soon as the
    # RGB copy exists, including when decoding raises.
    with Image.open(img_path) as raw:
        img = raw.convert("RGB")

    prompt = "<CAPTION>"
    inputs = processor(text=prompt, images=img, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only: no autograd bookkeeping
        generated_ids = model.generate(
            input_ids=inputs.input_ids,
            pixel_values=inputs.pixel_values,
            max_new_tokens=80,  # upper bound on the number of generated tokens
            num_beams=3,
        )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# The processor is loaded from the base checkpoint and does not depend on the
# adapter directory, so load it once instead of once per evaluated run.
processor = AutoProcessor.from_pretrained(CKPT, trust_remote_code=True)

for model_dir in best_model_dirs:
    print(f"Processing {model_dir} ...")
    # === Base model first, then the LoRA adapter on top of it ===
    model = AutoModelForCausalLM.from_pretrained(CKPT, trust_remote_code=True).to(DEVICE)
    model = PeftModel.from_pretrained(model, model_dir)

    # ".../<run-name>/best" -> "<run-name>"; the run name labels the output CSV.
    param_name = os.path.basename(os.path.dirname(model_dir))
    out_csv = f"{param_name}_val_predict.csv"
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["image", "description"])
        # One row per validation record; each record must carry an "image"
        # key naming a file inside img_dir.
        for item in tqdm(val_ds, desc=f"{param_name} val infer"):
            img_name = item["image"]
            img_path = os.path.join(img_dir, img_name)
            desc = infer_caption(model, processor, img_path, DEVICE)
            writer.writerow([img_name, desc])
    print(f"Saved {out_csv}")

Processing D:/ucl-dissertation/florence2basetrain/florence2-lora-bs4_lr1e-05_ep30_sp/best ...


florence2-lora-bs4_lr1e-05_ep30_sp val infer: 100%|██████████| 40/40 [03:06<00:00,  4.66s/it]

Saved florence2-lora-bs4_lr1e-05_ep30_sp_val_predict.csv





In [None]:
# Explicit %pip magic (does not rely on automagic).
# NOTE(review): pandas is unpinned -- consider pinning a version.
%pip install pandas

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import csv
import pandas as pd
from tqdm import tqdm

In [3]:
# ======= Step 1: Load FLAN-T5 Model =======
# Model id; the tokenizer and the seq2seq model are both loaded from it.
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Rebinds the notebook-level name `model`. No device is selected here, so the
# model stays wherever from_pretrained places it.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# ======= Step 2: Prompt Template =======
def build_prompt(description: str) -> str:
    """Build the five-way storefront classification prompt for one description.

    The prompt has four sections separated by blank lines: the task statement,
    the numbered category list, the answer-format instruction, and the quoted
    description. The description is inserted verbatim between double quotes.
    """
    category_lines = (
        "1. Retail – includes clothing stores, electronics shops, supermarkets, convenience stores, and home goods stores.",
        "2. Food & Beverage – includes restaurants, cafés, bakeries, takeaway shops, fast food vendors, and bars.",
        "3. Service-Oriented – includes pharmacies, optical stores, hair salons, internet cafés, photo studios, and financial offices.",
        "4. Industrial & Repair – includes hardware stores, car repair shops, electrical suppliers, refrigeration services, and workshops.",
        "5. Other – used for storefronts that cannot be identified due to blurry photos, unclear signage, or vague descriptions.",
    )
    sections = (
        "You are given a description of a storefront. Classify it into one of the following five categories:",
        "\n".join(category_lines),
        "Answer with only the category number (1 to 5).",
        f'Store description: "{description}"',
    )
    return "\n\n".join(sections)

# ======= Step 3: Load CSV File =======
import re  # cell-local import: used only by _parse_category below

csv_path = r"D:\ucl-dissertation\mapillary\florence2-lora-bs4_lr1e-05_ep30_sp_val_predict.csv"
df = pd.read_csv(csv_path)

# Ensure columns exist
assert "image" in df.columns and "description" in df.columns, "CSV must contain 'image' and 'description' columns."

FALLBACK_CLASS = 5  # category 5 ("Other") doubles as the fallback for unusable replies


def _parse_category(reply: str) -> int:
    """Turn the model's text reply into a category number between 1 and 5.

    The reply must start with an integer; anything after it is ignored, so
    "4", "4." and "4. Industrial" all give 4. A reply without a leading
    integer, or with one outside 1-5, gives ``FALLBACK_CLASS``.
    """
    leading_int = re.match(r"\s*([-+]?\d+)", reply)
    if leading_int is None:
        return FALLBACK_CLASS
    value = int(leading_int.group(1))
    return value if value in range(1, 6) else FALLBACK_CLASS


results = []

# ======= Step 4: Predict Loop =======
for image, description in tqdm(zip(df["image"], df["description"]), total=len(df), desc="Classifying"):
    # pandas reads an empty CSV field back as NaN; send an empty description
    # to the model rather than the literal text "nan".
    if pd.isna(description):
        description = ""
    prompt = build_prompt(description)

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(
        **inputs,
        max_length=10,  # the expected answer is a single category number
        num_beams=5,
        do_sample=False,
        early_stopping=True,
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    pred = _parse_category(decoded)

    results.append({
        "image": image,
        "predicted_class": pred
    })

# ======= Step 5: Save Results =======
output_path = "flan_t5_bs4_lr1e-05_ep30_sp.csv"
with open(output_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["image", "predicted_class"])
    writer.writeheader()
    writer.writerows(results)

print(f"Saved predictions to {output_path}")

Classifying: 100%|██████████| 40/40 [01:41<00:00,  2.54s/it]

Saved predictions to flan_t5_bs4_lr1e-05_ep30_sp.csv





### 合并模型权重

In [6]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoProcessor

In [9]:
# Base checkpoint, LoRA adapter directory and output directory of the merged model
CKPT = "microsoft/Florence-2-base"
adapter_path = "D:/ucl-dissertation/florence2basetrain/florence2-lora-bs4_lr1e-5_ep30_plus15/best"
save_path = "D:/ucl-dissertation/florence_ft_result/florence2-lora-plus"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# === Load the base model, attach the LoRA adapter, merge the LoRA parameters ===
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(CKPT, trust_remote_code=True),
    adapter_path,
).merge_and_unload()

# === Save the full model ===
model.save_pretrained(save_path)

# Save the processor next to it (optional)
processor = AutoProcessor.from_pretrained(CKPT, trust_remote_code=True)
processor.save_pretrained(save_path)

print("✅ 完整模型已保存")




✅ 完整模型已保存


In [10]:
# Base checkpoint, LoRA adapter directory and output directory of the merged model
CKPT = "microsoft/Florence-2-base"
adapter_path = "D:/ucl-dissertation/florence2basetrain/florence2-lora-bs4_lr1e-05_ep30_sp/best"
save_path = "D:/ucl-dissertation/florence_ft_result/florence2-lora-spcaption"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# === Load the base model, attach the LoRA adapter, merge the LoRA parameters ===
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(CKPT, trust_remote_code=True),
    adapter_path,
).merge_and_unload()

# === Save the full model ===
model.save_pretrained(save_path)

# Save the processor next to it (optional)
processor = AutoProcessor.from_pretrained(CKPT, trust_remote_code=True)
processor.save_pretrained(save_path)

print("✅ 完整模型已保存")



✅ 完整模型已保存


## 测试

In [11]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image

In [12]:
# Run on the GPU when CUDA is available, otherwise on the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Directory of the full model (the path the merged weights were saved to)
model_path = r"D:/ucl-dissertation/florence_ft_result/florence2-lora-plus"

# Load the processor and the model from that directory, then move the model to DEVICE
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model = model.to(DEVICE)

def infer_caption(model, processor, img_path, device):
    """Generate a caption for one image with the ``<DETAILED_CAPTION>`` prompt.

    Args:
        model: Object whose ``generate`` accepts ``input_ids``,
            ``pixel_values``, ``max_new_tokens`` and ``num_beams``.
        processor: Callable that turns ``text``/``images`` into model inputs
            (the result must support ``.to(device)`` and expose ``input_ids``
            and ``pixel_values``) and that provides ``batch_decode``.
        img_path: Path of the image file to caption.
        device: Device the processed inputs are moved to before generation.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    # Open the file in a ``with`` block so its handle is closed as soon as the
    # RGB copy exists, including when decoding raises.
    with Image.open(img_path) as raw:
        img = raw.convert("RGB")

    prompt = "<DETAILED_CAPTION>"
    inputs = processor(text=prompt, images=img, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only: no autograd bookkeeping
        generated_ids = model.generate(
            input_ids=inputs.input_ids,
            pixel_values=inputs.pixel_values,
            max_new_tokens=80,  # upper bound on the number of generated tokens
            num_beams=3,
        )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Path of the image used for this spot check
img_path = r"D:\ucl-dissertation\mapillary\processed\image_001.jpg"

# Run inference on that one image and print the caption
caption = infer_caption(model, processor, img_path, DEVICE)
print("生成的描述：", caption)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


生成的描述： A small shop with a sign reading "Ferrelectricos" and a large electrical wire inside. The shop appears to be a hardware store, likely used for electrical equipment.


### 进行正式推理

In [None]:
import sys
# Put the local ./fake_flash_attn directory at the front of the module search
# path, so a module found there is imported in preference to an installed one.
# NOTE(review): presumably this directory holds a stub `flash_attn` package so
# that the Florence-2 remote code can be imported without the real dependency
# -- confirm against the directory contents.
sys.path.insert(0, "./fake_flash_attn")

In [None]:
import os
import csv
import torch
import threading
from queue import Queue
from tqdm import tqdm
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

In [None]:
# ==== Configuration ====
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model_path = r"D:/ucl-dissertation/florence_ft_result/florence2-lora-plus"
base_img_dir = r"D:\ucl-dissertation\commercial images\set1"  # directory that holds the imageoutputX folders
output_prefix = "set1_imageoutput"  # filename prefix of the output CSV files
num_threads = 8  # number of worker threads used per folder

In [None]:
# ==== Load the model and processor (done once, then reused for every image) ====
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model = model.to(DEVICE)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# ==== Inference helper ====
def infer_caption(img_path):
    """Caption one image with the notebook-level ``model`` and ``processor``.

    Uses the ``<DETAILED_CAPTION>`` task prompt and moves the processed inputs
    to ``DEVICE`` before generation.

    Args:
        img_path: Path of the image file to caption.

    Returns:
        A ``(file name, caption)`` tuple, where the file name is the base name
        of ``img_path`` and the caption is the first decoded sequence with
        special tokens removed.
    """
    # Open the file in a ``with`` block so its handle is closed as soon as the
    # RGB copy exists, including when decoding raises.
    with Image.open(img_path) as raw:
        img = raw.convert("RGB")

    prompt = "<DETAILED_CAPTION>"
    inputs = processor(text=prompt, images=img, return_tensors="pt").to(DEVICE)
    with torch.no_grad():  # inference only: no autograd bookkeeping
        generated_ids = model.generate(
            input_ids=inputs.input_ids,
            pixel_values=inputs.pixel_values,
            max_new_tokens=80,  # upper bound on the number of generated tokens
            num_beams=3,
        )
    output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return os.path.basename(img_path), output

# ==== Process a single folder ====
def process_folder(folder_name):
    """Caption every image in one sub-folder of ``base_img_dir`` and save a CSV.

    Images (``.jpg``, ``.jpeg``, ``.png``, ``.bmp``, ``.webp``, matched
    case-insensitively) are captioned by ``num_threads`` worker threads through
    ``infer_caption``. The CSV has an ``image``/``description`` header and one
    row per image, ordered by file path. An image whose captioning raises
    still gets a row, with ``"Error: <message>"`` as its description.

    Args:
        folder_name: Name of the sub-folder inside ``base_img_dir``. Its
            trailing digits select the output file
            ``<base_img_dir>/<output_prefix><digits>.csv``.
    """
    # Function-scope import: only this function needs the executor.
    from concurrent.futures import ThreadPoolExecutor

    img_dir = os.path.join(base_img_dir, folder_name)

    # Use the whole trailing number of the folder name ("imageoutput12" -> "12").
    # Taking only the last character would send folders 2 and 12 to the same
    # CSV. Names without trailing digits still fall back to the last character.
    stem = folder_name.rstrip("0123456789")
    folder_suffix = folder_name[len(stem):] or folder_name[-1:]
    out_csv = os.path.join(base_img_dir, f"{output_prefix}{folder_suffix}.csv")

    # Sorted so the row order of the CSV is reproducible; os.listdir returns
    # entries in arbitrary order.
    img_files = sorted(
        os.path.join(img_dir, f) for f in os.listdir(img_dir)
        if f.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".webp"))
    )

    def caption_or_error(img_path):
        """Return ``(file name, caption)``, or ``(file name, "Error: ...")`` on failure."""
        try:
            return infer_caption(img_path)
        except Exception as e:
            return (os.path.basename(img_path), f"Error: {e}")

    results = []
    pbar = tqdm(total=len(img_files), desc=f"{folder_name} 推理进度")
    try:
        with ThreadPoolExecutor(max_workers=num_threads) as pool:
            # map() yields results in input order regardless of which worker
            # finishes first, so rows never depend on thread timing.
            for result in pool.map(caption_or_error, img_files):
                results.append(result)
                pbar.update(1)
    finally:
        pbar.close()

    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["image", "description"])
        writer.writerows(results)

    print(f"✅ {folder_name} 处理完成，结果保存到 {out_csv}")

# ==== Process the folders imageoutput1 through imageoutput7 in turn ====
for i in range(1, 8):
    process_folder(f"imageoutput{i}")

print("🎯 所有文件夹处理完成！")

推理进度: 100%|██████████| 639/639 [55:43<00:00,  5.23s/it]  

✅ CPU多线程推理完成，结果保存到 D:\ucl-dissertation\commercial images\set1\set1_imageoutput7.csv





## flan-t5

In [31]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import csv
import pandas as pd
from tqdm import tqdm

In [33]:
# ======= Step 1: Load FLAN-T5 Model =======
# Model id; the tokenizer and the seq2seq model are both loaded from it.
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Rebinds the notebook-level name `model`. No device is selected here, so the
# model stays wherever from_pretrained places it.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# ======= Step 2: Prompt Template =======
def build_prompt(description: str) -> str:
    """Return the storefront classification prompt for ``description``.

    The text lists the five categories, asks for the category number only,
    and ends with the description placed verbatim inside double quotes.
    """
    template = (
        "You are given a description of a storefront. Classify it into one of the following five categories:\n"
        "\n"
        "1. Retail – includes clothing stores, electronics shops, supermarkets, convenience stores, and home goods stores.\n"
        "2. Food & Beverage – includes restaurants, cafés, bakeries, takeaway shops, fast food vendors, and bars.\n"
        "3. Service-Oriented – includes pharmacies, optical stores, hair salons, internet cafés, photo studios, and financial offices.\n"
        "4. Industrial & Repair – includes hardware stores, car repair shops, electrical suppliers, refrigeration services, and workshops.\n"
        "5. Other – used for storefronts that cannot be identified due to blurry photos, unclear signage, or vague descriptions.\n"
        "\n"
        "Answer with only the category number (1 to 5).\n"
        "\n"
        'Store description: "{description}"'
    )
    return template.format(description=description)

# ======= Step 3: Load CSV File =======
import re  # cell-local import: used only by _parse_category below

# NOTE: this file is written by the CSV-merge cell of this notebook
# (set1_imageoutput_merged.csv in the same folder); run that cell first.
csv_path = r"D:\ucl-dissertation\commercial images\set1\set1_imageoutput_merged.csv"
df = pd.read_csv(csv_path)

# Ensure columns exist
assert "image" in df.columns and "description" in df.columns, "CSV must contain 'image' and 'description' columns."

FALLBACK_CLASS = 5  # category 5 ("Other") doubles as the fallback for unusable replies


def _parse_category(reply: str) -> int:
    """Turn the model's text reply into a category number between 1 and 5.

    The reply must start with an integer; anything after it is ignored, so
    "4", "4." and "4. Industrial" all give 4. A reply without a leading
    integer, or with one outside 1-5, gives ``FALLBACK_CLASS``.
    """
    leading_int = re.match(r"\s*([-+]?\d+)", reply)
    if leading_int is None:
        return FALLBACK_CLASS
    value = int(leading_int.group(1))
    return value if value in range(1, 6) else FALLBACK_CLASS


results = []

# ======= Step 4: Predict Loop =======
for image, description in tqdm(zip(df["image"], df["description"]), total=len(df), desc="Classifying"):
    # pandas reads an empty CSV field back as NaN; send an empty description
    # to the model rather than the literal text "nan".
    if pd.isna(description):
        description = ""
    prompt = build_prompt(description)

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(
        **inputs,
        max_length=10,  # the expected answer is a single category number
        num_beams=5,
        do_sample=False,
        early_stopping=True,
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    pred = _parse_category(decoded)

    results.append({
        "image": image,
        "predicted_class": pred
    })

# ======= Step 5: Save Results =======
output_path = r"D:\ucl-dissertation\commercial images\set1\set1_flan.csv"
with open(output_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["image", "predicted_class"])
    writer.writeheader()
    writer.writerows(results)

print(f"Saved predictions to {output_path}")

Classifying: 100%|██████████| 4219/4219 [3:10:28<00:00,  2.71s/it]  

Saved predictions to D:\ucl-dissertation\commercial images\set1\set1_flan.csv





In [30]:
# Merge the per-folder caption CSV files into one file

import pandas as pd
import os

# Folder that holds the CSV files
csv_folder = r"D:\ucl-dissertation\commercial images\set1"

# Names of the CSV files to merge: set1_imageoutput1.csv ... set1_imageoutput7.csv
csv_files = [f"set1_imageoutput{n}.csv" for n in range(1, 8)]

# Read the files in list order; each one keeps its own header row as column names
dfs = []
for file in csv_files:
    path = os.path.join(csv_folder, file)
    df = pd.read_csv(path)  # default header=0: the first row becomes the column names
    dfs.append(df)

# Stack all frames, discarding the per-file indexes in favour of a fresh one
merged_df = pd.concat(dfs, ignore_index=True)

# Write the merged table next to its inputs
output_path = os.path.join(csv_folder, "set1_imageoutput_merged.csv")
merged_df.to_csv(output_path, index=False, encoding='utf-8-sig')

print(f"✅ 合并完成，保存到 {output_path}")


✅ 合并完成，保存到 D:\ucl-dissertation\commercial images\set1\set1_imageoutput_merged.csv
