# VISTA Dataset Generation

In [None]:
import torch
import os
import glob
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

def find_image_in_subfolders(image_dir, filename):
    """Locate ``filename`` inside ``image_dir`` or any of its subdirectories.

    Both arguments are matched literally: glob metacharacters (``*``, ``?``,
    ``[``) in the directory or file name are escaped, so a file such as
    ``img[1].png`` is found instead of being read as a character class.

    Args:
        image_dir: Root directory that is searched recursively.
        filename: Exact file name to look for.

    Returns:
        The path of a matching file, or ``None`` when ``filename`` is empty or
        nothing matches. When the name exists in several subfolders the
        lexicographically smallest path is returned, so repeated runs pick the
        same file.
    """
    if not filename:
        # An empty name would turn the pattern into "<dir>/**/", which matches
        # directories rather than an image file.
        return None

    pattern = os.path.join(glob.escape(image_dir), "**", glob.escape(filename))
    matches = glob.glob(pattern, recursive=True)
    # glob's result order depends on the filesystem; min() makes it repeatable.
    return min(matches) if matches else None

# Loaded (model, processor) pairs keyed by (model_name, device string).
# Loading the checkpoint is the slowest step of a call, and the function below
# is invoked repeatedly with the same model, so the pair is kept and reused.
_QWEN_CACHE = {}


def _load_qwen(model_name, device):
    """Return the (model, processor) pair for ``model_name`` on ``device``, loading it at most once."""
    cache_key = (model_name, str(device))
    if cache_key not in _QWEN_CACHE:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float16
        ).eval()

        # Inference only: no parameter ever needs a gradient.
        for param in model.parameters():
            param.requires_grad = False

        processor = AutoProcessor.from_pretrained(model_name)
        _QWEN_CACHE[cache_key] = (model.to(device), processor)
    return _QWEN_CACHE[cache_key]


def qwen_test_few_shots(
    images_list_of_lists,
    prompts,
    model_name="Qwen/Qwen2-VL-7B-Instruct",
    image_dir="path_to_images_directory",
    resize_to=(224, 224),
    device_index=0
):
    """Run one multi-turn, multi-image chat through a Qwen2-VL model.

    Turn ``i`` of the conversation is built from ``images_list_of_lists[i]``
    (image file names, looked up recursively under ``image_dir``) and
    ``prompts[i]``. Images that cannot be found or opened are reported with
    ``print`` and skipped.

    Args:
        images_list_of_lists: One list of image file names per turn.
        prompts: One dict per turn with optional keys ``"role"`` (defaults to
            ``"user"``) and ``"text"`` (defaults to ``""``).
        model_name: Checkpoint name passed to ``from_pretrained``.
        image_dir: Root directory searched for the image files.
        resize_to: ``(width, height)`` every image is resized to.
        device_index: CUDA device index; the CPU is used when CUDA is not
            available.

    Returns:
        ``(images, text)``: the list of loaded PIL images in conversation
        order and the decoded model output. When no image could be loaded the
        model is not run and ``([], "No valid images were processed.")`` is
        returned.

    Raises:
        ValueError: If ``images_list_of_lists`` and ``prompts`` differ in
            length.
    """
    # Validate before any expensive work so a bad call fails immediately.
    if len(images_list_of_lists) != len(prompts):
        raise ValueError(
            f"Mismatched lengths: got {len(images_list_of_lists)} image-turns "
            f"and {len(prompts)} prompt-turns."
        )

    device = torch.device(f"cuda:{device_index}" if torch.cuda.is_available() else "cpu")

    conversation = []
    all_processed_images = []

    for img_filenames, prompt_dict in zip(images_list_of_lists, prompts):
        role = prompt_dict.get("role", "user")  # default to "user"
        text = prompt_dict.get("text", "")

        turn_images = []
        for fn in img_filenames:
            path = find_image_in_subfolders(image_dir, fn)
            if path is None:
                print(f"Could not find file '{fn}' in '{image_dir}' or any subfolders.")
                continue

            try:
                # convert() returns an independent image, so the file handle
                # can be released as soon as the conversion is done.
                with Image.open(path) as source_image:
                    raw_image = source_image.convert("RGB")
                raw_image = raw_image.resize(resize_to, Image.LANCZOS)
                turn_images.append(raw_image)
            except Exception as e:
                # Best effort: one unreadable image must not abort the turn.
                print(f"Could not load image {path} due to error: {e}")
                continue

        print("Number of images used in this turn:", len(turn_images))

        all_processed_images.extend(turn_images)

        # One image placeholder per loaded image, followed by the turn's text.
        turn_content = [{"type": "image"} for _ in turn_images]
        if text:
            turn_content.append({"type": "text", "text": text})

        conversation.append({
            "role": role,
            "content": turn_content
        })

    if not all_processed_images:
        return [], "No valid images were processed."

    # The model is only needed from here on, so the early exits above never
    # pay for loading it.
    model, processor = _load_qwen(model_name, device)

    text_prompt = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True
    )

    # NOTE(review): max_length is forwarded without a truncation flag; confirm
    # it has the intended effect on the tokenized prompt.
    inputs = processor(
        text=[text_prompt],
        images=all_processed_images,
        return_tensors="pt",
        padding=True,
        max_length=256
    )
    inputs = {k: v.to(device, non_blocking=True) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            num_return_sequences=1,
            do_sample=False
        )

    # generate() returns prompt + continuation; keep only the continuation.
    generated_ids = [
        output_id[len(input_id):]
        for input_id, output_id in zip(inputs['input_ids'], output_ids)
    ]

    story_description = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )[0]

    return all_processed_images, story_description


In [None]:
import os
import csv
import uuid
import matplotlib.pyplot as plt
from datetime import datetime

# Narrative aspects used to steer story generation. Each entry has the form
# "<Name>: <definition>"; the full string is interpolated into the generator
# and evaluator prompts and is also written to the "aspect" column of the
# output CSV, so the text doubles as the dataset label.
# NOTE(review): the "Consistency" entry contains what looks like a typo
# ("traits aisnd motivations") and a missing space after "readers." — left
# untouched here because changing it would change the prompt and the label.
aspects = [
    "Structure: Generate the story that intrinsically have the classic five act structure of exposition, rising action, climax, falling action, and resolution. Do not literally include these 5 keywords in the generated story",
    "Setting: Focus on the location and setting of the image. Enhance how these locations and settings contribute to deeper narrative meanings in the story.",
    "Clarity: Clarity in writing means expressing ideas in a straightforward, precise, and unambiguous manner. It allows readers to grasp concepts quickly without needing to reread or decode complex structures.",
    "Cause-and-Effect: Discusses unity of action, emphasising cause-and-effect in storytelling.",
    "Consistency: Ensure a consistent traits aisnd motivations in the story to maintain the believability of the readers.Ensure that the characters' actions align with their established traits and motivations",
    "Relatability: Explores emotional drives that relates to the reader. Ensure the audience empathise with or understand the characters' experiences and emotions",
    "Development: Focus on a specific set of characters in the image and craft well-developed with distinct traits and motivations between them",
    "Comedy: Introduce a funny twist in the story with the intention of creating a laugh in the reader"
]

csv_input_filename = "combined_unique_id.csv"

# (story_id, [image file names]) pairs, one per row of the input CSV. The
# "image_sequence" column holds a comma-separated list of file names.
story_data = []
# newline='' lets the csv module handle line endings itself, which is required
# for quoted fields that contain embedded newlines.
with open(csv_input_filename, 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        story_id = row["story_id"].strip()
        # Blank entries (e.g. from a trailing comma or an empty cell) are
        # dropped: an empty string is not an image file name.
        images_list = [
            img.strip()
            for img in row["image_sequence"].split(",")
            if img.strip()
        ]
        story_data.append((story_id, images_list))

# Results go to "<YYYYMMDD>_multi_aspect_generate/<same name>.csv".
today_date = datetime.now().strftime('%Y%m%d')
folder_name = f"{today_date}_multi_aspect_generate"
os.makedirs(folder_name, exist_ok=True)

csv_output_filename = os.path.join(folder_name, f"{folder_name}.csv")

def build_general_instruction(aspect_text):
    """Return the story-generation instruction for one aspect.

    The instruction asks for a 5-sentence story over a 5-image sequence and
    embeds ``aspect_text`` (aspect name plus definition) as the focused
    aspect. The text ends with a trailing newline.
    """
    instruction_lines = (
        "You are an advanced assistant designed to create a 5-sentence story based on a 5-image sequence input.",
        "Your task is to generate the appropriate text for each image input.",
        "You are given a 5-sequence image and also an aspect to enhance.",
        "This aspect will consist of the aspect name and definition, explaining how to express that particular aspect.",
        "Your objective is to generate a 5-sentence story that expresses this aspect while still accurately visualizing the related image.",
        "Vary between first-person and third-person viewpoints.",
        "You can generate a named entity for the entities detected in the image.",
        "",
        f"Focused aspect: {aspect_text}",
        "",
        "Generate a story based on the input image",
        "",  # produces the final newline
    )
    return "\n".join(instruction_lines)

def build_evaluator_instruction(aspect_text, generated_story):
    """Return the instruction that asks the model to grade a story.

    The instruction lists three possible verdicts (fully agree, partially
    agree, disagree), names the aspect via ``aspect_text`` and quotes
    ``generated_story`` in double quotes. The text ends with a trailing
    newline.
    """
    rubric = (
        "You are an advanced assistant tasked to evaluate a generated story.\n"
        "Your task is to evaluate whether the given aspect has been successfully expressed in the story.\n"
        "You will be given an aspect which will consist of the aspect name and definition, explaining how to express that particular aspect.\n"
        "Your output will be one of these 3 options:\n"
        "1) Fully agree, if you think that the story successfully expresses the aspect perfectly\n"
        "2) Partially agree, if you think that the story contains the aspect but has flaws\n"
        "3) Disagree, if you think that the story does not express the related aspect\n"
    )
    aspect_line = f"Enhanced Aspect: {aspect_text}\n"
    story_line = f'Evaluate this story: "{generated_story}"\n'
    # Blank lines separate the rubric, the aspect and the story.
    return rubric + "\n" + aspect_line + "\n" + story_line

def _format_prompt_turns(turns):
    """Render prompt turns as numbered 'Role: ..., Text: ...' lines for the CSV."""
    return "\n".join(
        f"{position}) Role: {turn['role'].capitalize()}, Text: {turn['text']}"
        for position, turn in enumerate(turns, start=1)
    )


# For every (story, aspect) pair: generate a story, ask the model to grade it
# against the same aspect, and append one row with both exchanges to the CSV.
with open(csv_output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([
        "story_id",
        "aspect",
        "prompts",
        "generated_story",
        "enhanced_prompts",
        "enhanced_generated_story"
    ])

    for story_id, final_story_images in story_data:
        for aspect in aspects:
            # --- generation pass: a single system turn carrying all images ---
            prompts = [
                {"role": "system", "text": build_general_instruction(aspect)}
            ]
            _, story_response = qwen_test_few_shots(
                images_list_of_lists=[final_story_images],
                prompts=prompts,
                image_dir="my_images"
            )
            print(f"Generated Story (story_id={story_id}, aspect='{aspect}'):\n{story_response}\n")

            # --- evaluation pass: same images, prompt quotes the story ---
            evaluation_prompts = [
                {
                    "role": "system",
                    "text": build_evaluator_instruction(aspect, story_response),
                }
            ]
            _, evaluation_response = qwen_test_few_shots(
                images_list_of_lists=[final_story_images],
                prompts=evaluation_prompts,
                image_dir="my_images"
            )
            print(f"Evaluated Story:\n{evaluation_response}\n")

            writer.writerow([
                story_id,
                aspect,
                _format_prompt_turns(prompts),
                story_response,
                _format_prompt_turns(evaluation_prompts),
                evaluation_response
            ])

print(f"Exported data to CSV: {csv_output_filename}")


# VISTAScore

1. N-gram Matching Scorer: measures lexical overlap to capture common word patterns linked to each aspect.

In [None]:
from collections import Counter
from math import exp, log
from typing import List, Tuple

def _ngrams(tokens: List[str], n: int) -> Counter:
    """Count every contiguous ``n``-token window of ``tokens``.

    The result is empty when ``tokens`` has fewer than ``n`` items.
    """
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))


def _clipped_counts(
    ref_ngrams: List[Counter], cand_ngram_counts: Counter
) -> Tuple[int, int]:
    """Return ``(matched, total)`` for BLEU's modified n-gram precision.

    Each candidate n-gram is credited at most as often as it occurs in the
    reference where it is most frequent. ``total`` is the number of candidate
    n-grams. An empty ``ref_ngrams`` list yields ``matched == 0``.
    """
    matched = 0
    for ngram, cand_count in cand_ngram_counts.items():
        # default=0 keeps max() from raising when there are no references.
        max_ref_count = max(
            (ref.get(ngram, 0) for ref in ref_ngrams), default=0
        )
        matched += min(cand_count, max_ref_count)
    return matched, sum(cand_ngram_counts.values())


def corpus_bleu_like(
    train_dataset: List[str],
    target_sentence: str,
    max_n: int = 4,
    smoothing: float = 1e-9,
) -> Tuple[List[float], float]:
    """Score ``target_sentence`` against a pool of references, BLEU-style.

    Texts are tokenized on whitespace. For each order ``1..max_n`` the clipped
    n-gram precision is computed against all references at once, and the
    score is the brevity penalty times the geometric mean of the precisions.

    Args:
        train_dataset: Reference texts; must not be empty.
        target_sentence: Candidate text to score.
        max_n: Highest n-gram order; must be at least 1.
        smoothing: Constant added to numerator and denominator of every
            precision so that zero matches do not produce ``log(0)``.

    Returns:
        ``(precisions, bleu)`` where ``precisions[k]`` is the smoothed
        precision for order ``k + 1``.

        An order for which the candidate has no n-grams (candidate shorter
        than ``n`` tokens) is scored like an order with zero matches rather
        than as a perfect match. An empty candidate scores ``0.0``. If any
        precision is zero (only possible with ``smoothing=0``) the score is
        ``0.0``.

    Raises:
        ValueError: If ``train_dataset`` is empty or ``max_n < 1``.
    """
    if not train_dataset:
        raise ValueError("train_dataset must contain at least one reference")
    if max_n < 1:
        raise ValueError("max_n must be at least 1")

    ref_tokens = [ref.split() for ref in train_dataset]
    cand_tokens = target_sentence.split()
    cand_len = len(cand_tokens)

    precisions = []
    for n in range(1, max_n + 1):
        # Counter union keeps, per n-gram, the maximum count over all
        # references — exactly the clipping bound — so each candidate n-gram
        # needs one lookup instead of a scan over every reference.
        max_ref_counts: Counter = Counter()
        for ref in ref_tokens:
            max_ref_counts |= _ngrams(ref, n)

        matched, total = _clipped_counts(
            [max_ref_counts], _ngrams(cand_tokens, n)
        )
        # With total == 0 the smoothed ratio would be smoothing / smoothing,
        # i.e. a perfect 1.0 for an order the candidate cannot even form.
        # Using a denominator of 1 scores it like "no matches" instead.
        denominator = max(total, 1)
        precisions.append((matched + smoothing) / (denominator + smoothing))

    # Reference length closest to the candidate; ties go to the shorter one.
    closest_ref_len = min(
        (len(r) for r in ref_tokens),
        key=lambda rl: (abs(rl - cand_len), rl),
    )
    if cand_len == 0:
        # An empty candidate carries no content; without this branch an empty
        # reference would give a penalty of e (> 1).
        bp = 0.0
    elif cand_len > closest_ref_len:
        bp = 1.0
    else:
        bp = exp(1 - closest_ref_len / (cand_len + 1e-9))

    if min(precisions) <= 0.0:
        # Geometric mean with a zero factor is zero; avoids log(0).
        return precisions, 0.0

    log_prec_sum = sum(log(p) for p in precisions) / max_n
    bleu = bp * exp(log_prec_sum)

    return precisions, bleu

In [None]:
from statistics import mean
import pandas as pd

csv_path = "path_to_vista_dataset.csv"
train_df = pd.read_csv(csv_path)

cand_path = "path_to_machine_generated_stories.csv"
cand_df = pd.read_csv(cand_path)

# Single definition of the output path so the file written and the file named
# in the final message cannot drift apart.
out_csv = "qwen_bleu_scores_all_aspects_vist.csv"

# Reference stories grouped by aspect label: {aspect: [story, ...]}.
refs_by_aspect = (
    train_df
    .dropna(subset=["generated_story", "aspect"])
    .groupby("aspect")["generated_story"]
    .apply(lambda col: col.astype(str).tolist())
    .to_dict()
)

num_aspects = len(refs_by_aspect)
# Rows without a story are dropped first; otherwise astype(str) below would
# score the literal text "nan" as if it were a story.
cand_df = cand_df.dropna(subset=["generated_story"]).iloc[:500]  # limit to first 500 rows
num_candidates = len(cand_df)

aspect_avg_scores = {}
all_records = []

# Every candidate story is scored against the reference pool of every aspect.
for idx, (aspect, refs) in enumerate(refs_by_aspect.items(), 1):
    print(f"[{idx}/{num_aspects}] scoring {num_candidates} stories "
          f"against aspect: {aspect!r}")

    scores = []
    for j, story in enumerate(cand_df["generated_story"].astype(str), 1):
        if j % 100 == 0 or j == num_candidates:
            # "\r" rewrites the same console line as progress advances.
            print(f"   …{j}/{num_candidates} done", end="\r")

        _, bleu = corpus_bleu_like(refs, story)
        scores.append(bleu)
        all_records.append({"aspect": aspect,
                            "generated_story": story,
                            "bleu_score": bleu})

    aspect_avg_scores[aspect] = mean(scores)
    print(f"   → aspect avg BLEU: {aspect_avg_scores[aspect]:.4f}")

print("\nAverage BLEU-like score per aspect:")
for asp, sc in sorted(aspect_avg_scores.items()):
    print(f"  {asp}: {sc:.4f}")

overall_avg = mean(aspect_avg_scores.values())
print(f"\nMean of aspect averages: {overall_avg:.4f}")

pd.DataFrame(all_records).to_csv(out_csv, index=False)
print(f"\nDetailed file saved to {out_csv}")


2. Sentence-level Semantics Scorer: assesses similarity at the sentence level, accommodating different keywords or phrasing.

In [None]:
from statistics import mean
import pandas as pd
from bert_score import score

# --- configuration ---------------------------------------------------------
CSV_TRAIN = "path_to_vista_dataset.csv"
CSV_CAND = "path_to_machine_generated_stories.csv"
MODEL_TYPE = "microsoft/deberta-large-mnli"
OUT_CSV = "bertscore_F1_all_aspects_vista_qwen.csv"

train_df = pd.read_csv(CSV_TRAIN)
cand_df = pd.read_csv(CSV_CAND)

# Reference stories grouped by aspect label: {aspect: [story, ...]}.
labelled_rows = train_df.dropna(subset=["generated_story", "aspect"])
refs_by_aspect = (
    labelled_rows.groupby("aspect")["generated_story"].apply(list).to_dict()
)

# First 500 candidate stories, as strings.
cands = cand_df["generated_story"].astype(str).tolist()[:500]

aspect_avg_scores = {}
all_records = []

for idx, aspect in enumerate(refs_by_aspect, 1):
    refs = refs_by_aspect[aspect]
    print(f"[{idx}/{len(refs_by_aspect)}] scoring {len(cands)} stories "
          f"against aspect '{aspect}' ({len(refs)} refs)")

    # Every candidate gets the aspect's complete reference list.
    refs_multi = [refs for _ in cands]

    # Only the third returned tensor is used; it is reported as F1.
    _, _, f1 = score(
        cands,
        refs_multi,
        lang="en",
        model_type=MODEL_TYPE,
        verbose=True
    )

    all_records.extend(
        {
            "aspect": aspect,
            "generated_story": story,
            "bertscore_f1": value,
        }
        for story, value in zip(cands, f1.tolist())
    )

    aspect_avg_scores[aspect] = f1.mean().item()
    print(f"   → aspect-avg BERTScore-F1: {aspect_avg_scores[aspect]:.4f}")

print("\nAverage BERTScore-F1 per aspect:")
for asp in sorted(aspect_avg_scores):
    print(f"  {asp}: {aspect_avg_scores[asp]:.4f}")

overall_avg = mean(aspect_avg_scores.values())
print(f"\nMean of aspect averages: {overall_avg:.4f}")

pd.DataFrame(all_records).to_csv(OUT_CSV, index=False)
print(f"\nDetailed file saved to {OUT_CSV}")


3. Keyword-level Semantics Scorer: measures alignment with dominant aspect-specific keywords that strongly influence narrative structure.

In [None]:
from sklearn.preprocessing import normalize

import pandas as pd
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_distances
from sentence_transformers import SentenceTransformer

# --- configuration ---------------------------------------------------------
CSV_TRAIN = "path_to_vista_dataset.csv"
CSV_CAND = "path_to_machine_generated_stories.csv"
EMBEDDING_MODEL = "clip-ViT-B-32"
TOP_N = 10  # number of keywords per aspect

# --- models ----------------------------------------------------------------
# spaCy pipeline with the "ner" and "parser" components switched off.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
embedder = SentenceTransformer(EMBEDDING_MODEL)

# --- training data ---------------------------------------------------------
# Keep only rows that have both a story and an aspect label, then renumber.
df_train = pd.read_csv(CSV_TRAIN)
df_train = df_train.dropna(subset=["generated_story", "aspect"])
df_train = df_train.reset_index(drop=True)

def preprocess(text: str) -> str:
    """Reduce ``text`` to a space-joined string of content-word lemmas.

    The text is lower-cased and run through the module-level ``nlp``
    pipeline. A token's lemma is kept only if the token is tagged NOUN or
    VERB, is purely alphabetic and is not a stop word.
    """
    kept_pos = {"NOUN", "VERB"}
    lemmas = []
    for tok in nlp(text.lower()):
        if tok.pos_ not in kept_pos:
            continue
        if not tok.is_alpha or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)

def _aspect_label(aspect):
    """Return the first whitespace-separated word of an aspect string."""
    return aspect.split()[0]


# --- 1. TF-IDF features over the cleaned training stories -------------------
print(f"Loaded {len(df_train)} training stories.")
df_train["cleaned"] = df_train["generated_story"].map(preprocess)

vectorizer = TfidfVectorizer(
    ngram_range=(1, 2), min_df=5, max_df=0.8, stop_words="english"
)
X = vectorizer.fit_transform(df_train["cleaned"])
terms = vectorizer.get_feature_names_out()
print("Computed TF–IDF matrix.")

# --- 2. Classifier whose coefficients rank terms per aspect -----------------
clf = LogisticRegression(max_iter=1000)
clf.fit(X, df_train["aspect"])
classes = clf.classes_
coefs = clf.coef_

# For each class, the TOP_N terms with the largest coefficients
# (ascending order, so the strongest term comes last).
distinctive_terms = {
    aspect: terms[np.argsort(coefs[row])[-TOP_N:]].tolist()
    for row, aspect in enumerate(classes)
}
print(f"Extracted top {TOP_N} distinctive keywords for {len(distinctive_terms)} aspects.")

# --- 3. One embedding per aspect: the mean of its keyword embeddings --------
aspect_embeddings = {}
print("Embedding distinctive keywords for each aspect...")
for position, aspect in enumerate(distinctive_terms, start=1):
    keywords = distinctive_terms[aspect]
    print(f"[{position}/{len(distinctive_terms)}] Aspect '{_aspect_label(aspect)}' => {', '.join(keywords)}")
    keyword_vectors = embedder.encode(keywords)
    aspect_embeddings[aspect] = np.mean(keyword_vectors, axis=0)

# --- 4. Candidate stories: same preprocessing, then embed -------------------
df_cand = pd.read_csv(CSV_CAND)
df_cand = df_cand.dropna(subset=["generated_story"]).reset_index(drop=True)
print(f"Loaded {len(df_cand)} candidate stories to evaluate.")
df_cand["cleaned"] = df_cand["generated_story"].map(preprocess)

stories = df_cand["cleaned"].tolist()
print("Embedding candidate stories...")
story_embs = embedder.encode(stories, show_progress_bar=True)

# --- 5. Mean cosine distance between each aspect and all candidates ---------
mean_distances = {}
print("Computing mean distances per aspect...")
for aspect in aspect_embeddings:
    per_story = cosine_distances([aspect_embeddings[aspect]], story_embs)[0]
    mean_distances[aspect] = np.mean(per_story)
    print(f"Aspect '{_aspect_label(aspect)}': mean distance = {mean_distances[aspect]:.4f}")

# --- 6. Report ---------------------------------------------------------------
print("\nDistinctive keywords per aspect:")
for aspect in distinctive_terms:
    print(f"{_aspect_label(aspect)}: {', '.join(distinctive_terms[aspect])}")

print("\nFinal mean cosine distances to candidate stories per aspect:")
for aspect, dist in sorted(mean_distances.items(), key=lambda pair: pair[1]):
    print(f"{_aspect_label(aspect)}: {dist:.4f}")

overall_mean = np.mean(list(mean_distances.values()))
print(f"\nOverall mean distance across all aspects: {overall_mean:.4f}")

summary_rows = [
    (_aspect_label(aspect), dist) for aspect, dist in mean_distances.items()
]
pd.DataFrame(summary_rows, columns=["aspect", "mean_distance"]).to_csv(
    "aspect_mean_distances.csv", index=False
)
print("Saved mean distances to aspect_mean_distances.csv")
print("Saved mean distances to aspect_mean_distances.csv")