In [64]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm 
import pickle

# Location of the image folders and of the concept table (tab-separated).
image_path = "/projects/archiv/DataStore_Boyanova/ExpAtt_EEG/Image_dataset/Images"
path = "./Animacy/things_concepts.tsv"
df = pd.read_csv(path, sep="\t")

# Everything from the third column onwards is treated as an extra concept column.
extra_concepts = df.columns[2:].to_numpy()

# Boolean membership matrix (True where a cell equals 1); the last row is dropped.
concept_mat = np.isin(df.iloc[:-1, 2:].to_numpy(), 1)

# Per-row definition text, again without the last row.
defenitions = df["Definition (from WordNet, Google, or Wikipedia)"].iloc[:-1].values




In [59]:
# For each group, the order its members must appear in after reordering.
REORDER_RULES = {
    "camera": ["camera_lens", "camera1", "camera2"],
    "chicken": ["chicken_wire", "chicken1", "chicken2"],
    "crystal": ["crystal_ball", "crystal1", "crystal2"],
    "hot": ["hot_chocolate", "hot_tub", "hot-air_balloon", "hot-water_bottle"],
    "ice": ["ice_cream", "ice_cube", "ice_pack", "ice-cream_cone"],
    "pepper": ["pepper_mill", "pepper1", "pepper2"],
}


def reorder_categories_inplace(categories_os):
    """Return a copy of ``categories_os`` with each REORDER_RULES group re-sequenced.

    The positions occupied by a group's members stay the same; only which
    member sits in which of those positions changes, following the order
    given in REORDER_RULES. Despite the name, the input is not modified.

    Args:
        categories_os: Iterable of category names.

    Returns:
        list: New list with the grouped names rearranged.
    """
    result = list(categories_os)

    for wanted_order in REORDER_RULES.values():
        # Positions (ascending) currently held by any member of this group.
        slots = [pos for pos, name in enumerate(result) if name in wanted_order]
        if len(slots) == 0:
            continue

        # Group members that actually occur, listed in the wanted order.
        present = [name for name in wanted_order if name in result]

        # Refill the same positions with the members in their wanted order.
        for pos, name in zip(slots, present):
            result[pos] = name

    return result


In [65]:
# Sorted directory entries of image_path, with the REORDER_RULES groups re-sequenced.
categories_os = reorder_categories_inplace(sorted(os.listdir(image_path)))

In [84]:
# -----------------------------
# Prompt template for animacy
# -----------------------------
def make_prompt(concept_name, definition=None):
    """Build the animate/inanimate classification prompt for one concept.

    Args:
        concept_name: Name of the concept to classify.
        definition: Optional definition text for the concept. When it is
            None, NaN, or blank, the "Definition:" line is left out of the
            prompt instead of rendering a meaningless value such as "nan".

    Returns:
        str: The prompt text; it starts and ends with a newline.
    """
    definition_missing = (
        definition is None
        # NaN is the only float that is not equal to itself.
        or (isinstance(definition, float) and definition != definition)
        or not str(definition).strip()
    )

    task_lines = [f"Concept: {concept_name}"]
    if not definition_missing:
        task_lines.append(f"Definition: {definition}")

    # One entry per prompt line; the empty first/last entries give the
    # leading and trailing newline once everything is joined.
    lines = [
        "",
        "Classify the following concept as either animate or inanimate based on the specific rules below.",
        "",
        "### RULES:",
        "- Animacy is a term which describes if a given concept is capable of self-movement. ",
        "- Animate: Only includes humans (including human body parts) and animals (mammals, birds, fish, insects). These entities have self-directed movement and agency.",
        "- Inanimate: Includes all physical objects, man-made tools, materials, and ALL vegetation.",
        "- CRITICAL: Plants, trees, and flowers are biologically alive but must be classified as INANIMATE for this task.",
        "",
        "### EXAMPLES:",
        "Concept: Calf is always animate",
        "Concept: Hammer -> inanimate",
        "Concept: Oak tree -> inanimate",
        "Concept: Grass -> inanimate",
        "Concept: Human -> animate",
        "Concept: Mosquito -> animate",
        "Concept: Apple tree -> inanimate",
        "",
        "### TASK:",
        *task_lines,
        "",
        "Answer with exactly one word (animate or inanimate):",
        "",
    ]
    return "\n".join(lines)

# -----------------------------
# Classification function
# -----------------------------
def parse_animacy_label(text):
    """Map generated text to "animate", "inanimate" or "unknown".

    The first label occurring in the text wins. "inanimate" contains
    "animate" as a substring, so a plain ``"animate" in text`` check can
    never tell the two apart; the match position is used instead.

    Args:
        text (str): Decoded model output.

    Returns:
        str: "animate", "inanimate", or "unknown" when neither word occurs.
    """
    lowered = text.strip().lower()
    animate_at = lowered.find("animate")
    if animate_at == -1:
        return "unknown"
    inanimate_at = lowered.find("inanimate")
    # The first "animate" hit is the tail of "inanimate" exactly when it
    # starts two characters after the first "inanimate" hit.
    if inanimate_at != -1 and animate_at == inanimate_at + 2:
        return "inanimate"
    return "animate"


def classify_animacy(concepts_batch, definitions=None):
    """Classify a batch of concepts as animate/inanimate in one generate call.

    Uses the notebook-level ``tokenizer`` and ``model`` objects and the
    ``make_prompt`` helper.

    Args:
        concepts_batch: Iterable of concept names.
        definitions: Optional iterable of definitions, one per concept.
            When omitted, an empty definition is used for every concept.

    Returns:
        list[str]: One of "animate", "inanimate", "unknown" per concept,
        in input order.

    Raises:
        ValueError: If ``definitions`` is given with a different length
            than ``concepts_batch``.
    """
    concepts = list(concepts_batch)
    if not concepts:
        return []

    if definitions is None:
        definitions = [""] * len(concepts)
    else:
        definitions = list(definitions)
        if len(definitions) != len(concepts):
            raise ValueError(
                f"Got {len(concepts)} concepts but {len(definitions)} definitions."
            )

    prompts = [make_prompt(c, d) for c, d in zip(concepts, definitions)]

    # Decoder-only models continue from the last position of each row, so
    # padding has to sit on the left. The previous setting is restored so
    # other cells are unaffected.
    previous_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(
            prompts, return_tensors="pt", padding=True, truncation=True
        ).to(model.device)
    finally:
        tokenizer.padding_side = previous_side

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,    # a single word; small margin for sub-word splits
            do_sample=False,     # greedy decoding keeps the labels deterministic
            temperature=None,    # sampling knobs are unset because sampling is off
            top_p=None,
            pad_token_id=tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation; with left padding every row
    # shares the same prompt length, so slicing leaves only the new tokens.
    # Decoding the full sequence would always match the words in the prompt.
    prompt_len = inputs["input_ids"].shape[-1]
    return [
        parse_animacy_label(tokenizer.decode(seq[prompt_len:], skip_special_tokens=True))
        for seq in outputs
    ]


def dump_data(data, filename):
    """
    Pickle ``data`` and write it to ``filename``.
    ------
    Args:
        data (any): Object to serialize.
        filename (str): Destination path; the file is opened in binary
            write mode, so existing content is replaced.

    Returns:
        None
    """
    with open(filename, mode="wb") as handle:
        # Same serialization as pickle.dump(), using the highest protocol available.
        writer = pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(data)
    

In [67]:
# LLama-3
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Checkpoint to load and the directory used as its download cache.
model_id = "meta-llama/Llama-3.1-8B-Instruct"
cache_pt ="/projects/crunchie/boyanova/EEG_Things/Grouping-Embeddings/00_stim_prep/models"

# Tokenizer for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_pt)

# 4-bit NF4 quantization with bfloat16 compute, to save memory.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# device_map="auto" leaves the placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=cache_pt,
    device_map="auto",
    low_cpu_mem_usage=True,
    quantization_config=bnb_config
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.61s/it]


In [68]:
# Record the padding id on the model config so it matches the tokenizer below.
model.config.pad_token_id = tokenizer.eos_token_id

# Reuse the end-of-sequence token as the tokenizer's padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [69]:
import warnings
import transformers
# Restrict Transformers logging to errors only.
transformers.utils.logging.set_verbosity_error()
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")

In [85]:
# zip() stops at the shorter input without complaint, which would leave
# categories unlabelled and break the pairing with definitions unnoticed.
if len(categories_os) != len(defenitions):
    raise ValueError(
        f"{len(categories_os)} categories but {len(defenitions)} definitions; "
        "they must line up one-to-one."
    )

# Stop tokens do not change between iterations, so build the list once.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

animate_llama = []
for c, d in tqdm(zip(categories_os, defenitions), total=len(categories_os)):
    # 1. Wrap the prompt in the chat format of the instruct model
    messages = [
        {"role": "user", "content": make_prompt(c, d)},
    ]
    input_prompt = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # 2. Generate the one-word answer.
    # Greedy decoding: this is a classification, so the same prompt must
    # always yield the same label; the sampling knobs are unset accordingly.
    outputs = model.generate(
        input_prompt,
        max_new_tokens=16,       # one word expected; small margin only
        eos_token_id=terminators,
        do_sample=False,
        temperature=None,
        top_p=None,
    )

    # 3. Keep only the newly generated tokens (drop the echoed prompt)
    response = outputs[0][input_prompt.shape[-1]:]
    answer = tokenizer.decode(response, skip_special_tokens=True)

    # Later cells compare against the exact strings "animate"/"inanimate",
    # so strip whitespace, surrounding punctuation and capitalisation here.
    animate_llama.append(answer.strip().strip(".!\"'").lower())

100%|██████████| 1854/1854 [07:12<00:00,  4.29it/s]


In [93]:
animate_llama = np.array(animate_llama)

# Manual override: the "calf2" category is forced to "animate".
animate_llama[np.isin(categories_os, "calf2")] = "animate"

# Category names split by the label they received.
llama_animate, llama_inanimate = (
    np.array(categories_os)[animate_llama == label]
    for label in ("animate", "inanimate")
)

In [96]:
# One entry per category: its name, its label, and its folder under image_path.
data_dict = dict(
    categories=categories_os,
    animate_llama=animate_llama,
    path=[os.path.join(image_path, name) for name in categories_os],
)
df = pd.DataFrame(data=data_dict)

In [98]:
# Persist the category / label / path dictionary as a pickle file.
dump_data(data=data_dict, filename="./Animacy/llama_animacy.pkl")

In [99]:
# Number of files per label, summed over all category folders.
# Counters start at 0: starting at 1 overstated both totals by one.
an = 0
inan = 0
unlabelled = 0  # files in categories whose label is neither "animate" nor "inanimate"
for row in tqdm(df.itertuples(index=False), total=len(df)):
    files = len(os.listdir(row.path))
    if row.animate_llama == "animate":
        an += files
    elif row.animate_llama == "inanimate":
        inan += files
    else:
        # Kept separate so an unexpected model answer is not silently
        # counted as inanimate.
        unlabelled += files
        

100%|██████████| 1854/1854 [00:00<00:00, 24281.40it/s]
