### Import Libraries

In [None]:
import importlib
import os
import json
import sys
import pandas as pd

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))

from models import qwen

importlib.reload(qwen)


In [None]:
# Notebook-wide configuration. All paths are relative, so they resolve against
# the current working directory of the kernel.

# Number of URLs that call_qwen slices off for each process_videos call.
BATCH_SIZE = 4  # Small number to avoid RPM, TPM limits
# Results file that call_qwen passes to process_videos as output_json; it is
# read back later in the cleanup cell.
QWEN_OUTPUT_JSON_PATH = "../../data/results/qwen_dashscope_results.json"
# Passed as metadata_json_path to parse_and_save_transcripts.
QWEN_METADATA_OUTPUT_JSON_PATH = "../../data/results/qwen_dashscope_metadata.json"
# Text files holding the system and user prompts.
SYSTEM_PROMPT_PATH = "../../data/prompts/video_understanding_qwen_system.txt"
USER_PROMPT_PATH = "../../data/prompts/video_understanding_qwen_user.txt"

### Get list of video URLs

In [3]:
# CSV with the video ids, download URLs and labels (see the preview below).
video_urls_path = "../../data/metadata/video_links_labels.csv"
video_urls = pd.read_csv(video_urls_path)
# Preview the first three rows to check that the columns loaded as expected.
video_urls.head(3)

Unnamed: 0,id,download_url,binary_label,multilabel
0,A.Beautiful.Mind.2001__#00-01-45_00-02-50_label_A,https://huggingface.co/datasets/jherng/xd-viol...,0,0
1,A.Beautiful.Mind.2001__#00-03-00_00-04-05_label_A,https://huggingface.co/datasets/jherng/xd-viol...,0,0
2,A.Beautiful.Mind.2001__#00-04-20_00-05-35_label_A,https://huggingface.co/datasets/jherng/xd-viol...,0,0


In [4]:
# Pull the download_url column out as a plain Python list for batching.
video_urls_list = video_urls["download_url"].to_list()
# Spot-check one complete URL (the DataFrame preview truncates them).
video_urls_list[2]

'https://huggingface.co/datasets/jherng/xd-violence/resolve/main/data/video/1-1004/A.Beautiful.Mind.2001__%2300-04-20_00-05-35_label_A.mp4'

### Run Qwen

In [None]:
# Read the system and user prompt texts; both are passed to
# qwen_model.process_videos for every batch in call_qwen.
with open(SYSTEM_PROMPT_PATH, "r", encoding="utf-8") as f:
    system_prompt = f.read()
with open(USER_PROMPT_PATH, "r", encoding="utf-8") as f:
    user_prompt = f.read()

In [6]:
# Create the Qwen requester used by the rest of the notebook.
# API keys come from the environment; os.getenv returns None when a variable
# is unset, so a missing key is not detected at this point.
# NOTE(review): the keyword is spelled `dashcope_api_key` (not "dashscope") —
# presumably this matches QwenAsyncRequester's signature; confirm in models/qwen.
qwen_model = qwen.QwenAsyncRequester(
    dashcope_api_key=os.getenv("DASHSCOPE_API_KEY"),
    together_api_key=os.getenv("TOGETHER_API_KEY"),
    model_name="qwen3-vl-235b-a22b-instruct",
)

In [9]:
async def call_qwen(video_urls_list, batch_size, chunk_size=None):
    """Run Qwen over ``video_urls_list`` in sequential chunks, saving as it goes.

    Each chunk is passed to ``qwen_model.process_videos`` with ``append=True``
    and ``overwrite=False``, targeting ``QWEN_OUTPUT_JSON_PATH``. Chunks are
    awaited one after another.

    Args:
        video_urls_list: Video URLs to process, in order.
        batch_size: Forwarded unchanged as ``process_videos(batch_size=...)``
            (the concurrency within each chunk, per the inline note below).
            It does NOT control how many URLs go into a chunk.
        chunk_size: Number of URLs handed to each ``process_videos`` call.
            Defaults to the notebook-level ``BATCH_SIZE`` when omitted.

    Raises:
        ValueError: If the effective chunk size is smaller than 1.
    """
    urls_per_call = BATCH_SIZE if chunk_size is None else chunk_size
    if urls_per_call < 1:
        # A negative step would make range() yield nothing and the function
        # would report success without doing any work; fail loudly instead.
        raise ValueError(
            f"chunk size must be a positive integer, got {urls_per_call!r}"
        )

    chunk_starts = range(0, len(video_urls_list), urls_per_call)
    for batch_number, start in enumerate(chunk_starts, start=1):
        batch = video_urls_list[start : start + urls_per_call]
        print(f"Processing batch {batch_number} ({len(batch)} videos)...")
        results = await qwen_model.process_videos(
            video_urls=batch,
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            batch_size=batch_size,  # concurrency within each batch
            output_json=QWEN_OUTPUT_JSON_PATH,
            overwrite=False,
            append=True,
        )
        # In the recorded run the returned list held thousands of entries for
        # a 4-video batch and grew by the batch size each time, so it is the
        # running total rather than the number of results just appended.
        print(f"Batch {batch_number} done, {len(results)} results in total so far.")

    print("All batches processed. Results saved in:", QWEN_OUTPUT_JSON_PATH)

In [10]:
# Process every URL; batch_size=2 is forwarded to process_videos as the
# concurrency within each batch, while BATCH_SIZE sets how many URLs each
# batch contains.
await call_qwen(video_urls_list, batch_size=2)

Processing batch 1 (4 videos)...
Batch 1 done, 3494 results appended.
Processing batch 2 (4 videos)...
Batch 2 done, 3498 results appended.
Processing batch 3 (4 videos)...
Batch 3 done, 3502 results appended.
Processing batch 4 (4 videos)...
Batch 4 done, 3506 results appended.
Processing batch 5 (4 videos)...
Batch 5 done, 3510 results appended.
Processing batch 6 (4 videos)...
Batch 6 done, 3514 results appended.
Processing batch 7 (4 videos)...
Batch 7 done, 3518 results appended.
Processing batch 8 (4 videos)...
Batch 8 done, 3522 results appended.
Processing batch 9 (4 videos)...
Batch 9 done, 3526 results appended.
Processing batch 10 (4 videos)...
Batch 10 done, 3530 results appended.
Processing batch 11 (4 videos)...
Batch 11 done, 3534 results appended.
Processing batch 12 (4 videos)...
Batch 12 done, 3538 results appended.
Processing batch 13 (4 videos)...
Batch 13 done, 3542 results appended.
Processing batch 14 (4 videos)...
Batch 14 done, 3546 results appended.
Processing

In [None]:
# Load the original results JSON (the file call_qwen passes to process_videos
# as output_json), including any entries that recorded an error.
with open(QWEN_OUTPUT_JSON_PATH, "r", encoding="utf-8") as f:
    all_results = json.load(f)


# Filter: an entry survives when it has no error, or when its error is one of
# the tolerated kinds ("data_inspection_failed" / "inappropriate content")
def should_keep_entry(entry):
    """Return True when a result entry should stay in the cleaned results.

    Entries with a missing or empty ``"error"`` value are always kept.
    Entries with an error are kept only if the error text contains
    ``"data_inspection_failed"`` or ``"inappropriate content"``; any other
    error causes the entry to be dropped.
    """
    failure = entry.get("error")

    # No error recorded (key absent, None, or empty) -> keep
    if not failure:
        return True

    # Otherwise keep only the tolerated error kinds
    tolerated_markers = ("data_inspection_failed", "inappropriate content")
    return any(marker in failure for marker in tolerated_markers)


# Run every loaded entry through the filter
clean_results = list(filter(should_keep_entry, all_results))

# Write the surviving entries to their own JSON file
CLEAN_JSON_PATH = "../../data/results/qwen_dashscope_results_clean.json"
with open(CLEAN_JSON_PATH, mode="w", encoding="utf-8") as f:
    json.dump(clean_results, fp=f, ensure_ascii=False, indent=2)

print(f"Clean results saved to {CLEAN_JSON_PATH}. {len(clean_results)} entries remain.")

# Count entries by type.
# should_keep_entry keeps an errored entry when its text contains EITHER
# "data_inspection_failed" or "inappropriate content", so the error count must
# accept both markers; otherwise entries kept through the second marker would
# be missing from the breakdown and the two counts would not add up to
# len(clean_results).
# NOTE(review): both markers are reported under the 'data_inspection_failed'
# label on the assumption that they describe the same failure — confirm.
no_error_count = sum(1 for r in clean_results if not r.get("error"))
data_inspection_count = sum(
    1
    for r in clean_results
    if r.get("error")
    and any(
        marker in r.get("error", "")
        for marker in ("data_inspection_failed", "inappropriate content")
    )
)

print(f"  - {no_error_count} successful entries")
print(f"  - {data_inspection_count} entries with 'data_inspection_failed' error")

# Collect the URL of every cleaned entry that carries one
processed_urls = {
    result["video_url"] for result in clean_results if "video_url" in result
}

# Drop the URLs that already have a cleaned result, leaving only those that
# still need to be sent
video_urls_list = [
    candidate for candidate in video_urls_list if candidate not in processed_urls
]

print(f"{len(video_urls_list)} URLs remain to process.")

Clean results saved to ../../data/results/qwen_dashscope_results_clean.json. 3894 entries remain.
  - 3654 successful entries
  - 240 entries with 'data_inspection_failed' error
56 URLs remain to process.


In [12]:
# Final step: hand the cleaned results file to parse_and_save_transcripts.
# The input path is the same file the cleanup cell writes as CLEAN_JSON_PATH.
# NOTE(review): presumably this extracts transcripts from the cleaned results
# and writes them plus metadata to the two output paths — confirm in models/qwen.
qwen_model.parse_and_save_transcripts(
    input_json_path="../../data/results/qwen_dashscope_results_clean.json",
    output_json_path="../../data/results/qwen_dashscope_transcripts_final.json",
    metadata_json_path=QWEN_METADATA_OUTPUT_JSON_PATH,
)