In [1]:
import torch
import time
import re
import os

In [None]:
 from europarl_preprocessing import write_new_files

In [None]:
# Base size handed to write_new_files; vate_size is a quarter of it.
tsz = 500000
vate_size = int(tsz/4)
# vate_size is passed twice — presumably the validation and test sizes; confirm against
# europarl_preprocessing.write_new_files, whose signature is not visible here.
write_new_files(tsz, vate_size, vate_size, max_sentence_length=50)

In [2]:
def get_number_lines(path):
    """Return the number of lines in the text file at ``path``.

    The file is streamed line by line instead of being loaded with
    ``readlines()``, so counting a corpus with hundreds of thousands of
    sentences does not hold the whole file in memory.

    Args:
        path: Path of the text file to inspect.

    Returns:
        int: Number of lines; a final line without a trailing newline still
        counts as one line, and an empty file yields 0.
    """
    # Decode as UTF-8 to match get_sentences_from_file, which reads the same files.
    with open(path, 'r', encoding='utf-8') as file:
        return sum(1 for _ in file)

# Compare how many training sentences each corpus directory provides.
path = os.path.join('europarl','train_sentences.txt')
coco_path = os.path.join('coco','train_sentences.txt')
europarl_train = get_number_lines(path)
coco_train = get_number_lines(coco_path)
# One count per line: europarl first, then coco.
print(europarl_train)
print(coco_train)

800000
473402


In [8]:
import clip

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Corpus directory later handed to get_files_paths to locate the split files.
directory = 'europarl'

def regexification(sentences):
    """Clean every sentence with a fixed sequence of regex substitutions.

    The patterns are applied in order over the whole collection, and each
    match is replaced by a single space:

    1. runs of non-alphanumeric characters, or a letter directly followed by a digit;
    2. digit runs that start with ``0`` and are not adjacent to other digits;
    3. runs of whitespace;
    4. any remaining runs of digits.

    Returns a new list with one cleaned string per input sentence.
    """
    patterns = (
        r"[^A-Za-z0-9]+|[a-zA-Z][0-9]",
        r"(?<!\d)[0]\d*(?!\d)",
        r"\s+",
        r"[0-9]+",
    )
    cleaned = list(sentences)
    for pattern in patterns:
        cleaned = [re.sub(pattern, " ", text) for text in cleaned]
    return cleaned

def get_files_paths(directory):
    """Build the paths of the train, validation and test sentence files.

    The three file names are fixed; each one is joined onto ``directory``.
    Nothing is checked on disk.

    Returns:
        tuple: ``(train_path, valid_path, test_path)``.
    """
    split_files = ('train_sentences.txt', 'valid_sentences.txt', 'test_sentences.txt')
    return tuple(os.path.join(directory, name) for name in split_files)

def get_sentences_from_file(filename):
    """Read ``filename`` as UTF-8 text and return its lines as a list.

    Lines are returned exactly as read, so each one keeps its trailing
    newline (when present in the file).
    """
    with open(filename, mode='rt', encoding='utf-8') as file_object:
        return [line for line in file_object]

def get_clip_embeddings(sentences, batch_size=32):
    """Encode ``sentences`` with the CLIP text encoder, ``batch_size`` rows at a time.

    All sentences are tokenized in a single call and moved to ``device``; the
    token tensor is then sliced into batches that go through
    ``clip_model.encode_text`` with gradient tracking disabled.

    Returns:
        A tensor stacking one embedding per sentence, in input order.
    """
    tokens = clip.tokenize(sentences).to(device)
    total = tokens.size(0)
    per_sentence = []
    with torch.no_grad():
        for start in range(0, total, batch_size):
            encoded = clip_model.encode_text(tokens[start:start + batch_size]).to(device)
            # Iterating a tensor yields its rows, so this collects one embedding per sentence.
            per_sentence.extend(encoded)
    return torch.stack(per_sentence)

In [None]:
# Resolve the three split files inside `directory` and report how many lines each contains.
train_file, valid_file, test_file = get_files_paths(directory)
train_len = get_number_lines(train_file)
valid_len = get_number_lines(valid_file)
test_len = get_number_lines(test_file)
# Printed as [train, valid, test].
print([train_len, valid_len,test_len])

In [9]:
# Load the "ViT-B/32" CLIP model on `device`. The second returned value (`preprocess`)
# is not used by any cell visible in this part of the notebook.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# Switch the model to evaluation mode before computing embeddings.
clip_model.eval()
print("done")

done


In [None]:
# Read the train and validation sentences and clean them with regexification,
# then encode both sets with CLIP. The test split is not processed in this cell.
train_sentences = regexification(get_sentences_from_file(train_file))
valid_sentences = regexification(get_sentences_from_file(valid_file))
train_clip_embeddings = get_clip_embeddings(train_sentences)
valid_clip_embeddings = get_clip_embeddings(valid_sentences)

In [13]:
def truncate_sentence(sentence, length=5):
    """Split ``sentence`` into consecutive chunks of at most ``length`` words.

    Words are the whitespace-separated tokens of ``sentence``.

    Args:
        sentence: Text to split.
        length: Maximum number of words per chunk; must be at least 1.

    Returns:
        list[str]: If the sentence has fewer than ``length`` words, a
        one-element list holding ``sentence`` unchanged (original whitespace
        included). Otherwise the chunks in order, each made of up to
        ``length`` words joined by single spaces; the last chunk holds the
        remainder. No empty chunk is produced when the word count is an
        exact multiple of ``length``.

    Raises:
        ValueError: If ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError("length must be at least 1, got {}".format(length))

    words = sentence.split()
    if len(words) < length:
        return [sentence]

    # Slicing in fixed strides applies the same `length` to every chunk (the
    # recursive version fell back to the default after the first chunk) and
    # never emits a trailing empty string.
    return [" ".join(words[start:start + length])
            for start in range(0, len(words), length)]

In [14]:
# Demo: split a long sentence with the default chunk length and print the chunks.
sentence = "Esta es una frase muy larga, puede estar en cualquier idioma y no necesita realmente tener algún sentido"
ns = truncate_sentence(sentence)
print(ns)

['Esta es una frase muy', 'larga, puede estar en cualquier', 'idioma y no necesita realmente', 'tener algún sentido']


In [26]:
def get_clip_long_embeddings(sentences, batch_size=32):
    """Embed each sentence as the mean of the CLIP embeddings of its chunks.

    Every sentence is split with ``truncate_sentence`` (default chunk length),
    the chunks are tokenized and encoded together by
    ``clip_model.encode_text`` with gradients disabled, and the resulting
    embeddings are averaged over the chunk dimension.

    ``batch_size`` is accepted but not used by this function.

    Returns:
        A tensor stacking one averaged embedding per input sentence.
    """
    averaged = []
    for text in sentences:
        chunk_tokens = clip.tokenize(truncate_sentence(text)).to(device)
        with torch.no_grad():
            chunk_embeddings = clip_model.encode_text(chunk_tokens).to(device)
            averaged.append(chunk_embeddings.mean(dim=0))
    return torch.stack(averaged)

In [27]:
# Small mixed-language sample used to compare the chunk-averaged embeddings with the
# direct ones computed by get_clip_embeddings.
sentences = ["Estas todas son frases sin sentido que necesitamos para alimentar al mounstruo de CLIP", "Maybe I should speak english, so the CLIP embeddings can understand", 
             "After all, this is his preferred language", "And I shouldnt exceed the 77 tokens. LOL."]
embs_long = get_clip_long_embeddings(sentences)
print("Size: {}".format(embs_long.size()))

Size: torch.Size([4, 512])


In [28]:
# Same sample sentences encoded directly (no chunking); print the resulting tensor size.
embs = get_clip_embeddings(sentences)
print("Size: {}".format(embs.size()))

Size: torch.Size([4, 512])


In [35]:
import json
import os

In [53]:
def show_interior(file, directory='coco/annotations'):
    """Load an annotation JSON file and print a short summary of its contents.

    Prints every top-level key of the JSON object, then the number of entries
    under ``'images'`` and under ``'annotations'``.

    Args:
        file: Name of the JSON file, resolved relative to ``directory``.
        directory: Folder that holds the annotation files. Defaults to
            ``'coco/annotations'``, the location previously hard-coded.

    Returns:
        tuple: ``(images, annotations)`` — the values stored under those two
        top-level keys.

    Raises:
        KeyError: If the file has no ``'images'`` or ``'annotations'`` key.
    """
    # The context manager closes the handle even if parsing fails; the file
    # was previously opened and never closed.
    with open(os.path.join(directory, file), encoding='utf-8') as f_val:
        val_data = json.load(f_val)

    for key in val_data:
        print(key)

    images = val_data['images']
    annotations = val_data['annotations']
    print(len(images))
    print(len(annotations))
    return images, annotations

In [54]:
# Summarise the captions annotation file and keep its images / annotations lists.
cap_img, cap_ann = show_interior('captions_val2017.json')

info
licenses
images
annotations
5000
25014


In [57]:
# Summarise the instances annotation file and keep its images / annotations lists.
ins_img, ins_ann =show_interior('instances_val2017.json')

info
licenses
images
annotations
categories
5000
36781


In [64]:
# Summarise the person-keypoints annotation file and keep its images / annotations lists.
per_img, per_ann = show_interior('person_keypoints_val2017.json')

info
licenses
images
annotations
categories
5000
11004


In [72]:
import yaml

# Experiment grid: the corpora and training-set sizes iterated over by the loop further down.
source_dirs = ['europarl','coco']
train_sizes = [1000,2000]
# NOTE(review): number_epochs is not referenced in the cells visible here — confirm where it is used.
number_epochs = [50]
# Parse the preprocessing configuration; its top-level entries are looked up by corpus name later.
with open("preprocessing/config.yml", "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

In [73]:
# List the top-level entries of the loaded configuration, one per line.
for element in cfg:
    print(element)

europarl
spanish_europarl
coco
test
languages


In [None]:
for tsz in train_sizes:
    val_test_size = int(tsz/4)
    for i, train_dir in enumerate(source_dirs):
        params = cfg[train_dir]
        data_dir = params['data_dir']
        out_dir = params['out_dir']        
        try: