In [1]:
import torch
import time
import re
import os
from experiment import *
from PIL import Image

In [None]:
 from europarl_preprocessing import write_new_files

In [None]:
# Sizes handed to write_new_files: the second and third arguments are each a quarter of tsz.
# NOTE(review): presumably these are the validation and test split sizes — confirm against write_new_files.
tsz = 500000
vate_size = tsz // 4
write_new_files(tsz, vate_size, vate_size, max_sentence_length=50)

In [5]:
def get_number_lines(path):
    """Return the number of lines in the text file at ``path``.

    The file is streamed line by line rather than loaded with ``readlines()``,
    so memory use stays flat even for multi-million-line corpora.

    Args:
        path: Path of the file to read (text mode, default encoding).

    Returns:
        int: Number of lines; 0 for an empty file.
    """
    with open(path, 'r') as file:
        return sum(1 for _ in file)

In [6]:
# Count the lines of train_sentences.txt inside the 'gcc' directory.
# NOTE(review): `coco_path` is built but not used in this cell, and the count read from the
# 'gcc' directory is stored as `europarl_train` — confirm which corpus is actually intended.
path = os.path.join('gcc','train_sentences.txt')
coco_path = os.path.join('coco','train_sentences.txt')
europarl_train = get_number_lines(path)

In [7]:
# Count and print the number of lines in gcc/big_train_file.txt.
path = os.path.join('gcc','big_train_file.txt')
train_size = get_number_lines(path)
print(train_size)

2654581


In [8]:
# Count and print the number of lines in gcc/big_test_file.txt.
# The result goes into its own variable so this cell does not overwrite `train_size`
# with the size of a different split.
path = os.path.join('gcc','big_test_file.txt')
test_size = get_number_lines(path)
print(test_size)

331799


In [9]:
# Count and print the number of lines in gcc/big_valid_file.txt.
# The result goes into its own variable so this cell does not overwrite `train_size`
# with the size of a different split.
path = os.path.join('gcc','big_valid_file.txt')
valid_size = get_number_lines(path)
print(valid_size)

331768


In [2]:
# Image files to be encoded, plus the CLIP "ViT-B/32" model and its preprocessing callable.
# NOTE(review): `import clip` and the `device = ...` assignment only appear in a later cell of this
# notebook; unless `from experiment import *` provides both names, this cell fails on a fresh kernel — confirm.
images = ['Margarita-Bugueno.jpg', 'tree.jpeg', 'poodle.jpg','flower.jpg']
clip_model, preprocess = clip.load("ViT-B/32", device=device)

In [3]:
def get_images(images, clip_model, preprocess):
    """Encode image files with ``clip_model`` and collect the features in one tensor.

    Uses the notebook-level ``device`` for the preprocessed image tensors.

    Args:
        images: Sequence of image file paths accepted by ``PIL.Image.open``.
        clip_model: Model exposing ``encode_image``.
        preprocess: Callable turning a PIL image into a tensor for ``clip_model``.

    Returns:
        torch.Tensor of shape ``(len(images), 512)``; row ``i`` holds the
        features of ``images[i]``.
    """
    image_features = torch.empty(size=(len(images), 512))
    with torch.no_grad():
        for i, image_id in enumerate(images):
            # The context manager closes the underlying file once the pixels
            # have been read by `preprocess`, so handles do not pile up.
            with Image.open(image_id) as im:
                image = preprocess(im).unsqueeze(0).to(device)
            image_features[i] = clip_model.encode_image(image)
    return image_features

In [4]:
images_features = get_images(images,clip_model,preprocess)
images_features.shape

torch.Size([4, 512])

In [36]:
# One dog caption per language: English, Spanish, Polish, French, German, Italian.
# NOTE(review): the following cell assigns `captions` again, so this list is discarded
# when the cells are run in order.
captions = ["A cute little dog", "Un perrito bonito", "Uroczy, mały piesek", 
            "Un petit chien mignon", "Ein süßer kleiner Hund", "Un piccolo cane carino"]

In [5]:
# One "yellow flower" caption per language: English, Spanish, Polish, French, German, Italian.
# NOTE(review): the Polish caption is written without diacritics ("Żółty kwiatek" would be the
# usual spelling); this may affect its scores — confirm whether that is intentional.
captions = ["A yellow flower","Una flor amarilla","Zolty Kwiatek","Une fleur jaune", "Eine gelbe Blume","un fiore giallo"]

In [14]:
# Embed the captions and move the result to `device`; the last line displays the tensor shape.
# NOTE(review): `clip_model` is passed as the second positional argument, but the
# `get_clip_embeddings(sentences, batch_size=32)` defined further down in this notebook expects a
# batch size there. Presumably this call relies on a version from `experiment` — confirm which
# definition is in scope when this cell runs.
clip_features = get_clip_embeddings(captions,clip_model).to(device)
clip_features.shape

torch.Size([6, 512])

In [15]:
# For every caption: print the caption, then the softmax (last dimension) of the first value
# returned by get_logits for all image features against that caption's CLIP feature.
# The second return value (`logits_text_clip`) is not used here.
for i, clip_feature in enumerate(clip_features):
    print(captions[i])
    logits_image_clip, logits_text_clip = get_logits(images_features, clip_feature)
    # Move to CPU and convert to NumPy for printing
    probs_clip = logits_image_clip.softmax(dim=-1).to('cpu').detach().numpy()
    print(probs_clip)

A yellow flower
[0.11420055 0.10807043 0.07017227 0.70755666]
Una flor amarilla
[0.19993378 0.0851327  0.12104443 0.5938891 ]
Zolty Kwiatek
[0.31335264 0.18450633 0.19757947 0.30456156]
Une fleur jaune
[0.17498663 0.12924773 0.08789355 0.60787207]
Eine gelbe Blume
[0.15191233 0.15184061 0.11434212 0.58190495]
un fiore giallo
[0.1557688  0.13171895 0.09323486 0.6192774 ]


In [6]:
from experiment import sbert_to_clip
# Load the multilingual SBERT model and embed the captions with it; the last line displays the shape.
# NOTE(review): `SentenceTransformer` is imported explicitly only in a later cell of this notebook;
# presumably `from experiment import *` makes it available here — confirm.
sbert_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
torch_features = get_sbert_embeddings(captions,sbert_model) 
torch_features.shape

torch.Size([6, 512])

In [12]:
sbert_features = sbert_to_clip(torch_features,'gcc_NN_750_e300_s2654581.pt')
sbert_features.shape

torch.Size([6, 512])

In [8]:
images_features.shape

torch.Size([4, 512])

In [13]:
# For every caption: print the caption, then the softmax (last dimension) of the first value
# returned by get_logits for all image features against that caption's `sbert_features` row.
# The second return value (`logits_text_sbert`) is not used here.
for i, sbert_feature in enumerate(sbert_features):
    print(captions[i])
    logits_image_sbert, logits_text_sbert = get_logits(images_features, sbert_feature)
    # Move to CPU and convert to NumPy for printing
    probs_sbert = logits_image_sbert.softmax(dim=-1).to('cpu').detach().numpy()
    print(probs_sbert)

A yellow flower
[0.25561073 0.17334817 0.11221261 0.45882848]
Una flor amarilla
[0.26155874 0.17659798 0.11643499 0.4454083 ]
Zolty Kwiatek
[0.24662964 0.17836365 0.12518138 0.44982538]
Une fleur jaune
[0.2529453  0.17863216 0.11457665 0.4538458 ]
Eine gelbe Blume
[0.27591568 0.16273409 0.12072165 0.44062856]
un fiore giallo
[0.2499876  0.18935394 0.12197234 0.4386862 ]


In [32]:
# For every image: print its file name, then the softmax (last dimension) of the first value
# returned by get_logits for that image's feature against all caption features.
# Row i of `images_features` was built from `images[i]`, so the label comes from `images`;
# indexing `captions` here would print a caption that has nothing to do with the row.
for i, im_feature in enumerate(images_features):
    print(images[i])
    logits_image_clip, logits_text_clip = get_logits(im_feature, clip_features)
    # Move to CPU and convert to NumPy for printing
    probs_clip = logits_image_clip.softmax(dim=-1).to('cpu').detach().numpy()
    print(probs_clip)

Margarita-Bugueno.jpg
[0.6583897  0.1578371  0.18377325]
tree.jpeg
[0.14538383 0.6917188  0.16289736]
poodle.jpg
[0.24891849 0.14688429 0.60419714]


In [8]:
import clip

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Directory passed to get_files_paths to locate the train/valid/test sentence files.
directory = 'europarl'

def regexification(sentences):
    """Clean sentences by applying four regex substitutions in a fixed order.

    Each pattern is applied to the whole collection before the next one, and
    every match is replaced by a single space.

    Args:
        sentences: Iterable of strings.

    Returns:
        list[str]: The cleaned sentences, in input order.
    """
    patterns = (
        r"[^A-Za-z0-9]+|[a-zA-Z][0-9]",  # runs of non [A-Za-z0-9] chars, or a letter followed by a digit
        r"(?<!\d)[0]\d*(?!\d)",          # whole digit runs that start with 0
        r"\s+",                          # whitespace runs
        r"[0-9]+",                       # any remaining digit runs
    )
    cleaned = sentences
    for pattern in patterns:
        compiled = re.compile(pattern)
        cleaned = [compiled.sub(" ", sentence) for sentence in cleaned]
    return cleaned

def get_files_paths(directory):
    """Build the paths of the train, valid and test sentence files inside ``directory``.

    Args:
        directory: Folder that contains the three ``*_sentences.txt`` files.

    Returns:
        tuple[str, str, str]: ``(train_path, valid_path, test_path)``.
    """
    filenames = ('train_sentences.txt', 'valid_sentences.txt', 'test_sentences.txt')
    train_path, valid_path, test_path = (
        os.path.join(directory, name) for name in filenames
    )
    return train_path, valid_path, test_path

def get_sentences_from_file(filename):
    """Read a UTF-8 text file and return its lines as a list.

    Lines are returned exactly as the file iterator yields them, i.e. each
    one keeps its trailing newline.

    Args:
        filename: Path of the file to read.

    Returns:
        list[str]: One entry per line, in file order.
    """
    with open(filename, mode='rt', encoding='utf-8') as file_object:
        return list(file_object)

def get_clip_embeddings(sentences, batch_size=32):
    """Encode sentences with the CLIP text encoder in mini-batches.

    Relies on the notebook-level ``clip_model`` and ``device``.

    Args:
        sentences: Text (or list of texts) accepted by ``clip.tokenize``.
        batch_size: Number of tokenized sentences sent to the encoder per call.

    Returns:
        torch.Tensor stacking one embedding per sentence, in input order.
    """
    tokens = clip.tokenize(sentences).to(device)
    total = tokens.size()[0]
    rows = []
    with torch.no_grad():
        for start in range(0, total, batch_size):
            encoded = clip_model.encode_text(tokens[start:start + batch_size]).to(device)
            # Collect the batch row by row so the final stack sees one tensor per sentence
            rows.extend(encoded.unbind(0))
    return torch.stack(rows)

In [None]:
# Resolve the three split files under `directory` and print their line counts as a list.
train_file, valid_file, test_file = get_files_paths(directory)
train_len, valid_len, test_len = map(get_number_lines, (train_file, valid_file, test_file))
print([train_len, valid_len, test_len])

In [9]:
# Load CLIP "ViT-B/32" onto `device` and switch the model to evaluation mode.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
clip_model.eval()
# Marker printed once loading has finished
print("done")

done


In [None]:
# Read the train and valid sentence files, clean them with regexification,
# and embed the cleaned sentences with get_clip_embeddings.
# `test_file` is not processed in this cell.
train_sentences = regexification(get_sentences_from_file(train_file))
valid_sentences = regexification(get_sentences_from_file(valid_file))
train_clip_embeddings = get_clip_embeddings(train_sentences)
valid_clip_embeddings = get_clip_embeddings(valid_sentences)

In [13]:
def truncate_sentence(sentence, length=5):
    """Split a sentence into consecutive chunks of at most ``length`` words.

    Words are whitespace-separated tokens. A sentence that already fits in a
    single chunk is returned unchanged (original spacing preserved); otherwise
    each chunk is rebuilt with single spaces between its words.

    Args:
        sentence: Text to split.
        length: Maximum number of words per chunk; must be at least 1.

    Returns:
        list[str]: The chunks in order; always at least one element.

    Raises:
        ValueError: If ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError("length must be at least 1")
    words = sentence.split()
    # `<=` rather than `<`: a sentence of exactly `length` words is one chunk,
    # so no empty trailing chunk is produced.
    if len(words) <= length:
        return [sentence]
    # Every chunk uses the caller's `length`, not only the first one.
    return [
        " ".join(words[start:start + length])
        for start in range(0, len(words), length)
    ]

In [14]:
# Demo: split one long sentence into chunks using the default chunk length and print the result.
sentence = "Esta es una frase muy larga, puede estar en cualquier idioma y no necesita realmente tener algún sentido"
ns = truncate_sentence(sentence)
print(ns)

['Esta es una frase muy', 'larga, puede estar en cualquier', 'idioma y no necesita realmente', 'tener algún sentido']


In [26]:
def get_clip_long_embeddings(sentences, batch_size=32):
    """Embed each sentence as the mean of the CLIP text embeddings of its chunks.

    Each sentence is split with ``truncate_sentence`` (default chunk length),
    its chunks are encoded in one call to the notebook-level ``clip_model``,
    and the chunk embeddings are averaged along dimension 0.

    Args:
        sentences: Iterable of strings.
        batch_size: Accepted but not used by this function.

    Returns:
        torch.Tensor stacking one averaged embedding per sentence, in input order.
    """
    def embed_one(sentence):
        # Tokenize all chunks of one sentence together and average their embeddings
        chunks = truncate_sentence(sentence)
        tokens = clip.tokenize(chunks).to(device)
        with torch.no_grad():
            chunk_embeddings = clip_model.encode_text(tokens).to(device)
            return torch.mean(chunk_embeddings, 0)

    return torch.stack([embed_one(sentence) for sentence in sentences])

In [27]:
# Four sample sentences, embedded with the chunk-and-average variant; prints the resulting tensor size.
sentences = ["Estas todas son frases sin sentido que necesitamos para alimentar al mounstruo de CLIP", "Maybe I should speak english, so the CLIP embeddings can understand", 
             "After all, this is his preferred language", "And I shouldnt exceed the 77 tokens. LOL."]
embs_long = get_clip_long_embeddings(sentences)
print("Size: {}".format(embs_long.size()))

Size: torch.Size([4, 512])


In [28]:
# Embed the same `sentences` without chunking (presumably to compare with `embs_long`) and print the size.
embs = get_clip_embeddings(sentences)
print("Size: {}".format(embs.size()))

Size: torch.Size([4, 512])


In [35]:
import json
import os

In [53]:
def show_interior(file, directory='coco/annotations'):
    """Load an annotation JSON file and print a short summary of its contents.

    Prints every top-level key of the JSON object, then the number of entries
    stored under ``'images'`` and under ``'annotations'``.

    Args:
        file: Name of the JSON file inside ``directory``.
        directory: Folder that contains the file; defaults to ``'coco/annotations'``.

    Returns:
        tuple: ``(images, annotations)`` exactly as stored in the JSON.

    Raises:
        KeyError: If the JSON has no ``'images'`` or ``'annotations'`` key.
    """
    # The context manager guarantees the file handle is closed after parsing.
    with open(os.path.join(directory, file)) as f_val:
        val_data = json.load(f_val)
    for e in val_data:
        print(e)
    images = val_data['images']
    annotations = val_data['annotations']
    print(len(images))
    print(len(annotations))
    return images, annotations

In [54]:
# Summarise captions_val2017.json and keep its 'images' and 'annotations' entries.
cap_img, cap_ann = show_interior('captions_val2017.json')

info
licenses
images
annotations
5000
25014


In [57]:
# Summarise instances_val2017.json and keep its 'images' and 'annotations' entries.
ins_img, ins_ann =show_interior('instances_val2017.json')

info
licenses
images
annotations
categories
5000
36781


In [64]:
# Summarise person_keypoints_val2017.json and keep its 'images' and 'annotations' entries.
per_img, per_ann = show_interior('person_keypoints_val2017.json')

info
licenses
images
annotations
categories
5000
11004


In [72]:
import yaml

# Values iterated over by the loop cell below (config sections and training sizes).
source_dirs = ['europarl','coco']
train_sizes = [1000,2000]
# NOTE(review): `number_epochs` is not referenced by any cell visible in this notebook excerpt.
number_epochs = [50]
# Load the preprocessing configuration; safe_load avoids constructing arbitrary Python objects.
with open("preprocessing/config.yml", "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

In [73]:
# Print each top-level entry of the loaded configuration, one per line.
for element in cfg:
    print(element)

europarl
spanish_europarl
coco
test
languages


In [None]:
# NOTE(review): this cell is unfinished — it ends in a `try:` with no body or handler, so it
# cannot run as written (and was never executed). Complete it or delete it.
# For every training size and every source directory, look up that directory's section in `cfg`.
for tsz in train_sizes:
    # A quarter of the training size
    val_test_size = int(tsz/4)
    for i, train_dir in enumerate(source_dirs):
        params = cfg[train_dir]
        data_dir = params['data_dir']
        out_dir = params['out_dir']        
        try:

In [7]:
from experiment import get_sbert_embeddings, get_images_and_captions, get_sbert_embeddings, sbert_to_clip
from sentence_transformers import SentenceTransformer
import torch

In [2]:
sbert_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [3]:
languages = {'english':'en'}

In [4]:
images, captions = get_images_and_captions("crossmodal",languages)

In [5]:
torch_features = get_sbert_embeddings(captions['english'],sbert_model) 

In [8]:
# Pass the SBERT embeddings through sbert_to_clip with the given checkpoint file name and cast the result to float16.
# NOTE(review): presumably float16 is used to match the dtype of the CLIP features on GPU — confirm.
sbert_features = sbert_to_clip(torch_features,'gco_NN3_1000_e300_s3127982.pt').type(torch.float16)

In [16]:
from multilingual_clip import pt_multilingual_clip
import transformers

# Load the multilingual SBERT model, then the Multilingual-CLIP text model and its tokenizer.
# NOTE(review): `sbert_model` is created with the same model name in earlier cells as well;
# this reload is redundant if one of those has already run.
sbert_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
print("SBERT model loaded")
mclip_model = pt_multilingual_clip.MultilingualCLIP.from_pretrained('M-CLIP/XLM-Roberta-Large-Vit-B-32')
mclip_tokenizer = transformers.AutoTokenizer.from_pretrained('M-CLIP/XLM-Roberta-Large-Vit-B-32')

SBERT model loaded


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
sbert_model.parameters

<bound method Module.parameters of SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)>

In [18]:
mclip_model.parameters

<bound method Module.parameters of MultilingualCLIP(
  (transformer): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0): XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=