In [1]:
import pandas as pd
import json

from os.path import join
import os

In [2]:
# Lower-cased concept -> lower-cased English category, taken from the
# "concept" and "category" columns of the human-exemplar spreadsheet.
concept2cat = {
    concept.lower(): category.lower()
    for concept, category in pd.read_excel("../data/best_human_exemplars.xlsx")[
        ["concept", "category"]
    ].itertuples(index=False, name=None)
}

# English category label -> Italian category label.
cat_en2it = dict([
    ("animals", "animali"),
    ("body parts", "parti del corpo"),
    ("clothes", "vestiti"),
    ("foods", "cibi"),
    ("furnishings/fittings", "arredamenti/accessori"),
    ("furniture", "mobili"),
    ("hobbies", "passatempi"),
    ("housing buildings", "edifici residenziali"),
    ("kitchenware", "utensili da cucina"),
    ("plants", "piante"),
    ("stationery", "cancelleria"),
    ("vehicles", "veicoli"),
])

In [3]:
def load_generations(base_dir, model, modality, language):
    """Load the JSON generation files of one run directory.

    The temperature tag used in the file names is the last ``_``-separated
    token of the final path component of ``base_dir``
    (``"italian/temp_05"`` -> ``"05"``). A trailing ``/`` is ignored.

    Args:
        base_dir: Directory holding the generation files; its name is
            expected to end in ``_<temperature tag>``.
        model: Model name used as the first file-name component.
        modality: Modality used as the third file-name component.
        language: Language code used as the second file-name component.

    Returns:
        A list with one parsed JSON object per file: a single element
        (``iter1``) when ``"temp_00"`` occurs in ``base_dir``, otherwise five
        elements (``iter1`` .. ``iter5``) in iteration order.

    Raises:
        FileNotFoundError: If an expected file is missing.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    # rstrip("/") keeps "italian/temp_05/" from yielding an empty tag.
    temp = base_dir.rstrip("/").split("/")[-1].split("_")[-1]

    if "temp_00" in base_dir:
        # Only the first iteration is read for temperature-0 runs.
        fnames = [f"{model}-{language}-{modality}-t00_iter1.json"]
    else:
        fnames = [f"{model}-{language}-{modality}-t{temp}_iter{i}.json" for i in range(1, 6)]

    data = []
    for fname in fnames:
        # Context manager closes the handle; explicit UTF-8 avoids depending
        # on the platform's default encoding for accented characters.
        with open(join(base_dir, fname), encoding="utf-8") as handle:
            data.append(json.load(handle))

    return data

In [None]:
def create_dataframe(generations, model_name):
    """Flatten per-iteration generations into one long-format DataFrame.

    Each element of ``generations`` is treated as a mapping
    ``concept -> {rank: exemplar}``. A top-level ``"_exp_config"`` entry is
    skipped; the input dictionaries are not modified.

    Args:
        generations: Sequence of generation dicts, one per iteration.
        model_name: Name used to build the ``participant`` label
            (``"<model_name>-iter<k>"``, with ``k`` starting at 1).

    Returns:
        A DataFrame with columns ``participant``, ``category``,
        ``category_it``, ``concept``, ``exemplar``, ``clean_exemplar`` and
        ``rank``, sorted by ``concept`` then ``participant``. An empty input
        yields an empty frame with these columns.

    Raises:
        KeyError: If a concept is missing from ``concept2cat`` or its
            category is missing from ``cat_en2it``.
    """
    columns = ["participant", "category", "category_it", "concept", "exemplar", "clean_exemplar", "rank"]

    records = []
    for iteration, generation in enumerate(generations, start=1):
        participant = f"{model_name}-iter{iteration}"
        for concept, ranked_exemplars in generation.items():
            # Skip the metadata entry instead of popping it, so the caller's
            # dictionaries are left untouched.
            if concept == "_exp_config":
                continue
            for generation_rank, exemplar in ranked_exemplars.items():
                records.append({
                    "participant": participant,
                    "category": concept2cat[concept],
                    "category_it": cat_en2it[concept2cat[concept]],
                    "concept": concept,
                    "exemplar": clean_exemplar(exemplar),
                    "clean_exemplar": remove_concept(exemplar, concept),
                    "rank": generation_rank,
                })

    # Passing the column list keeps the schema stable even with no records.
    concat_df = pd.DataFrame(records, columns=columns)
    concat_df = concat_df.sort_values(by=["concept", "participant"], ascending=[True, True])
    return concat_df

def clean_exemplar(exemplar):
    """Return ``exemplar`` with every trailing ``.`` character removed."""
    return exemplar.rstrip(".")

def remove_concept(exemplar, concept):
    """Strip the concept word from a generated exemplar.

    Trailing dots are removed first (via ``clean_exemplar``). Then the first
    occurrence of ``concept`` is deleted; if ``concept`` does not occur, the
    first occurrence of ``pluralize(concept)`` is deleted instead. Matching is
    a plain, case-sensitive substring search. Leading whitespace left behind
    is stripped.

    Returns:
        The remaining text, which is ``""`` when nothing but the concept
        (and leading whitespace) was present.
    """
    remainder = clean_exemplar(exemplar)
    # Singular form has priority; the plural is only tried when it is absent.
    for form in (concept, pluralize(concept)):
        if form in remainder:
            remainder = remainder.replace(form, "", 1)
            break
    return remainder.lstrip()

def pluralize(word):
    """Return a naive plural of ``word`` based on its final letter.

    ``-o`` -> ``-i``, ``-a`` -> ``-e``, ``-e`` -> ``-i``; any other ending
    (including the empty string) is returned unchanged. Presumably meant to
    approximate regular Italian plurals; irregular forms are not handled.
    """
    plural_ending = {"o": "i", "a": "e", "e": "i"}
    last_letter = word[-1:]  # "" for an empty word, so the lookup simply misses
    if last_letter in plural_ending:
        return word[:-1] + plural_ending[last_letter]
    return word

In [9]:
def remove_invalid_exemplars(dataframe):
    """Drop rows with an empty ``clean_exemplar`` and renumber ``rank``.

    Within every (concept, participant) pair the surviving rows keep their
    relative order and receive ranks ``1..k``. The result lists concepts in
    order of first appearance in ``dataframe`` and, inside each concept,
    participants in order of first appearance in ``dataframe``.

    Args:
        dataframe: Frame with at least the columns ``concept``,
            ``participant`` and ``clean_exemplar``. It is not modified.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index and an integer ``rank``
        column. An input without any valid row yields an empty frame with the
        same columns.
    """
    valid = dataframe[dataframe["clean_exemplar"] != ""]

    # Group once instead of re-filtering the whole frame for every pair.
    # Row order inside each group is the original row order.
    groups = {
        key: group
        for key, group in valid.groupby(["concept", "participant"], sort=False)
    }

    pieces = []
    for concept in dataframe["concept"].unique():
        for participant in dataframe["participant"].unique():
            group = groups.get((concept, participant))
            if group is None:
                continue
            # assign() returns a new frame, so nothing is written into a view
            # of the caller's data (no SettingWithCopyWarning).
            pieces.append(group.assign(rank=list(range(1, len(group) + 1))))

    if pieces:
        df = pd.concat(pieces, ignore_index=True)
    else:
        # Nothing survived: keep the schema instead of failing in pd.concat.
        df = valid.iloc[0:0].reset_index(drop=True).assign(rank=[])
    df["rank"] = df["rank"].astype("int")

    return df

# main

In [10]:
# Run configuration: selects which generation run gets formatted.
# `temp` is the run-directory name; load_generations takes the token after the
# last "_" ("05") as the temperature tag in the JSON file names.
temp = "temp_05"
base_dir = f"italian/{temp}"  # directory the raw generation JSON files are read from
out_dir = f"italian/formatted/{temp}"  # directory the formatted CSV is written to

# Components of the generation file names (<model>-<language>-<modality>-t<tag>_iter<k>.json).
model = "llava"
modality = "visual"
language = "it"

In [11]:
# Load the raw generations, flatten them into one long table, then drop rows
# whose cleaned exemplar is empty and renumber the ranks.
generations = load_generations(base_dir, model, modality, language)
gen_df = create_dataframe(generations, model_name=model)
clean_df = remove_invalid_exemplars(gen_df)

# Shapes before and after filtering; the row difference is the number of
# exemplars that were removed.
print(gen_df.shape, clean_df.shape)

# Note: the file name embeds the full `temp` value (e.g. "temp_05"), not just the tag.
out_fname = f"{model}-{language}-{modality}-{temp}_alliters.csv"

print(f"- storing results in: {out_dir}/{out_fname}")

# Create the output directory if needed and write the table without the index column.
os.makedirs(out_dir, exist_ok=True)
clean_df.to_csv(join(out_dir, out_fname), index=False)

(10719, 7) (10669, 7)
- storing results in: italian/formatted/temp_05/llava-it-visual-temp_05_alliters.csv


# Merge all frequency files

In [1]:
import pandas as pd

In [4]:
# Main frequency table: read first, then overwritten with the merged result.
all_freqs_p = "freqs/all/all-freqs-sketchengine.csv"
# Additional frequency tables merged into the main one. Each file name carries
# a person's name -- presumably one file per contributor; confirm.
andrea_freqs = "freqs/all/all-freqs-sketchengine-andrea.csv"
giulia_freqs = "freqs/all/all-freqs-sketchengine-giulia.csv"
cate_freqs = "freqs/all/all-freqs-sketchengine-caterina.csv"

In [None]:
import os  # imported here because this section of the notebook only imports pandas

all_df = pd.read_csv(all_freqs_p)

# Save a backup copy of the original freqs, but only once: this cell overwrites
# `all_freqs_p` below, so on a re-run an unconditional backup would replace the
# original table with the already-merged one.
backup_p = f"{all_freqs_p}.bak"
if not os.path.exists(backup_p):
    all_df.to_csv(backup_p, index=False)

andrea_df = pd.read_csv(andrea_freqs)
giulia_df = pd.read_csv(giulia_freqs)
cate_df = pd.read_csv(cate_freqs)

# Rows of `all_df` come first, so for a duplicated exemplar the entry already
# present in the main table is the one that is kept (drop_duplicates keeps the
# first occurrence).
new_df = pd.concat([all_df, andrea_df, giulia_df, cate_df])
new_df = new_df.drop_duplicates(subset="exemplar")  # drop duplicated exemplars

new_df.to_csv(all_freqs_p, index=False)
