In [3]:
!pip install fasttext



In [1]:
import os

import fasttext
import fasttext.util
import numpy as np
from scipy.spatial.distance import cdist

In [2]:
# Language code -> language name. The code selects the fastText model
# (cc.<code>.300.bin); the name is the suffix of the label/embedding files.
langs = {
    "en": "English",
    "af": "Afrikaans",
    "nl": "Dutch",
    "fr": "French",
    "pt": "Portuguese",
}

# Fetch the pretrained fastText model of every language listed above.
# if_exists='ignore' is passed so that (per the fastText docs) a model that is
# already present is not downloaded again.
for lang_code in langs:
    fasttext.util.download_model(lang_code, if_exists='ignore')

In [7]:
# Generate and store fastText embeddings for all labels of the three datasets.
#
# Reads : data/<dataset>/words/<prefix>-words-<Language>.txt   (one label per line)
# Writes: data/<dataset>/fasttext/fasttext-<prefix>-<Language>.npy
#         (row k is the embedding of the label on line k)

for lang_short, lang in langs.items():
    # Load the model once per language and reuse it for every dataset; the
    # pretrained .bin files are several GB, so reloading per dataset is costly.
    ft = fasttext.load_model(f'cc.{lang_short}.300.bin')

    for ds_name in ["imagenet", "places-365", "ucf-101"]:
        ds_folder = f"data/{ds_name}/"
        ds_wd_folder = ds_folder + "words/"
        ds_ft_folder = ds_folder + "fasttext/"

        # Portable replacement for `!mkdir -p`; a no-op when the folder exists.
        os.makedirs(ds_ft_folder, exist_ok=True)

        # Prefix used inside file names (differs from the folder name).
        file_prefix = "imagenet12988" if ds_name == "imagenet" else ds_name.replace("-", "")

        # The label files contain non-ASCII characters (e.g. French, Portuguese),
        # so decode explicitly as UTF-8 instead of relying on the locale default.
        with open(ds_wd_folder + f"{file_prefix}-words-{lang}.txt", 'r', encoding="utf-8") as f:
            labels = [line.strip().replace("_", " ") for line in f]

        # TODO: should .get_sentence_vector be used here or should we average
        # out all the word embeddings ourselves?
        # NOTE(review): per the fastText implementation, get_sentence_vector
        # already averages the L2-normalised vectors of the whitespace-split
        # words -- confirm that this is the intended pooling.
        embeddings = np.array([ft.get_sentence_vector(label) for label in labels])
        np.save(ds_ft_folder + f"fasttext-{file_prefix}-{lang}.npy", embeddings)



In [8]:
# Generate and store embeddings for every (ImageNet object, Places-365 scene)
# label pair, one file per language.
#
# Row order is object-major: row i * len(scelabels) + j holds the embedding of
# the text "<object label i> <scene label j>".

pairs_out_folder = "data/imagenet/fasttext/"
# Create the output folder here so this cell does not depend on an earlier
# cell having created it.
os.makedirs(pairs_out_folder, exist_ok=True)

for lang_short, lang in langs.items():
    # The label files contain non-ASCII characters, so decode explicitly as
    # UTF-8 instead of relying on the locale default.
    with open(f"data/imagenet/words/imagenet12988-words-{lang}.txt", 'r', encoding="utf-8") as f:
        objlabels = [label.strip().replace("_", " ") for label in f]
    with open(f"data/places-365/words/places365-words-{lang}.txt", 'r', encoding="utf-8") as f:
        scelabels = [label.strip().replace("_", " ") for label in f]

    ft = fasttext.load_model(f'cc.{lang_short}.300.bin')

    # Write each vector straight into a preallocated array. Building a Python
    # list of all pair vectors first and converting it with np.array keeps two
    # full copies alive at once, which matters with millions of pairs.
    # fastText vectors are float32, so the saved values are unchanged.
    embeddings = np.empty((len(objlabels) * len(scelabels), ft.get_dimension()), dtype=np.float32)
    row = 0
    for objlabel in objlabels:
        for scelabel in scelabels:
            # TODO: should .get_sentence_vector be used here or should we
            # average out all the word embeddings ourselves?
            embeddings[row] = ft.get_sentence_vector(objlabel + " " + scelabel)
            row += 1

    np.save(pairs_out_folder + f"fasttext-imagenet12988places365pairs-{lang}.npy", embeddings)



In [3]:
#
# Pair-wise similarity between (action and scene), (action and object), (object and object), (scene and scene), (scene and object) word embeddings.
#
def wtv_mapping(wtv1, wtv2):
    """Return the pairwise cosine similarity between two sets of row vectors.

    Args:
        wtv1: 2-D array-like, one embedding per row.
        wtv2: 2-D array-like, one embedding per row (same width as ``wtv1``).

    Returns:
        Array of shape ``(len(wtv1), len(wtv2))`` whose entry ``[i, j]`` is
        ``1 - cosine_distance(wtv1[i], wtv2[j])``.
    """
    cosine_distance = cdist(wtv1, wtv2, metric="cosine")
    similarity = 1 - cosine_distance
    return similarity


# File-name prefix of a dataset -> single-letter tag used in the output names
# (o: ImageNet objects, s: Places-365 scenes, a: UCF-101 actions).
corr = {"imagenet12988":"o", "places365":"s", "ucf101":"a"}

# (row dataset, column dataset) folder names for each similarity matrix.
dataset_pairs = [
    ("imagenet", "places-365"),
    ("ucf-101", "places-365"),
    ("ucf-101", "imagenet"),
    ("imagenet", "imagenet"),
    ("places-365", "places-365"),
]

for ds1_dir, ds2_dir in dataset_pairs:
    ds1_ft_folder = f"data/{ds1_dir}/fasttext/"
    ds2_ft_folder = f"data/{ds2_dir}/fasttext/"

    # Folder names differ from the prefixes used inside the file names.
    ds1 = ds1_dir.replace("-", "") if ds1_dir != "imagenet" else "imagenet12988"
    ds2 = ds2_dir.replace("-", "") if ds2_dir != "imagenet" else "imagenet12988"

    for lang in langs.values():
        ds1_emb = np.load(ds1_ft_folder+f"fasttext-{ds1}-{lang}.npy")
        ds2_emb = np.load(ds2_ft_folder+f"fasttext-{ds2}-{lang}.npy")

        # Rows follow the labels of the first dataset, columns the second.
        emb2emb = wtv_mapping(ds1_emb, ds2_emb)

        # Stored in the first dataset's folder; the name carries both tags but
        # only the second dataset's prefix.
        out_path = ds1_ft_folder+f"{corr[ds1]}2{corr[ds2]}_ft_{ds2}_{lang}.npy"
        np.save(out_path, emb2emb)

In [11]:
# Cosine similarity between every UCF-101 action label and every
# (ImageNet object, Places-365 scene) label pair, one file per language.
for lang in langs.values():
    action_emb = np.load(f"data/ucf-101/fasttext/fasttext-ucf101-{lang}.npy")
    pair_emb = np.load(f"data/imagenet/fasttext/fasttext-imagenet12988places365pairs-{lang}.npy")

    # Computed with the pairs as rows, then transposed on save so the stored
    # matrix has one row per action and one column per object-scene pair.
    pair2action = wtv_mapping(pair_emb, action_emb)

    np.save(f"data/ucf-101/fasttext/a2ospairs_ft_imagenet12988places365pairs_{lang}.npy", pair2action.T)