In [1]:
import pandas as pd
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import glob
import spacy
import plotly.express as px

#load core spanish library
nlp = spacy.load("es_core_news_sm")

## Listo los documentos que conforman mi test set

In [2]:
# cargo test_set_files
mypath = '/Users/data/Documents/actas/'

with open(mypath + 'test_set_files.txt') as json_file:
    test_set_files = json.load(json_file)

In [57]:
# veo cuáles son los documentos únicos
test_files_list = list(test_set_files.values())

In [58]:
test_files_list = [y for x in test_files_list for y in x]

# me quedo sólo con el doc id
test_files_list = [t.split('.', 1)[0] for t in test_files_list]
test_files_list = [t.split('-', 1)[0] for t in test_files_list]

# me quedo con los únicos
test_files_list = list(set(test_files_list))

test_files_list.sort()
test_files_list

['12264', '12863', '13874', '13881', '16560', '17174', '17513', '22264']

## Listo los documentos que conforman el train set

In [28]:
# cargo train_set_files
mypath = '/Users/data/Documents/actas/'

with open(mypath + 'train_set_files.txt') as json_file:
    train_set_files = json.load(json_file)
    
train_set_files = list(train_set_files.values())

train_set_files = [y for x in train_set_files for y in x]

# me quedo sólo con el doc id
train_set_files = [t.split('.', 1)[0] for t in train_set_files]
train_set_files = [t.split('-', 1)[0] for t in train_set_files]

# me quedo con los únicos
train_set_files = list(set(train_set_files))

#train_set_files

['20831',
 '12863',
 '14443',
 '26120',
 '25207',
 '12192',
 '17403',
 '26228',
 '12264',
 '17513',
 '20516',
 '22307',
 '14025',
 '13073',
 '23865',
 '20733',
 '10227',
 '10900',
 '22217',
 '20878',
 '22264',
 '12853',
 '25301',
 '17200',
 '13061',
 '17202',
 '25424',
 '23897',
 '24781',
 '22010',
 '17174',
 '26093',
 '13697',
 '10329',
 '23820',
 '16560',
 '12809']

## Veo si por casualidad hubo overlap

In [29]:
intersection_set = set(train_set_files).intersection(set(test_set_files))
intersection_set

set()

## Los documentos del test set los corro enteros
Empiezo con un documento solo

In [49]:
# esto lo tengo que dejar en un módulo!
class NumpyEncoder(json.JSONEncoder):
    """ Special json encoder for numpy types """
    def default(self, obj):
        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
                            np.int16, np.int32, np.int64, np.uint8,
                            np.uint16, np.uint32, np.uint64)):
            return int(obj)
        elif isinstance(obj, (np.float_, np.float16, np.float32,
                              np.float64)):
            return float(obj)
        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

In [53]:
model_list = ['distiluse-base-multilingual-cased-v1',
            'stsb-xlm-r-multilingual',
            'quora-distilbert-multilingual',
            'paraphrase-xlm-r-multilingual-v1',
            'paraphrase-multilingual-MiniLM-L12-v2',
            'paraphrase-multilingual-mpnet-base-v2']

In [63]:
j=0
for m in model_list:
    print('model', j, 'of', len(model_list), ' (', m, ')')
   # m = 'distiluse-base-multilingual-cased-v1'
    model = SentenceTransformer(m)
    model.max_seq_length = 512
    
    i=0
    for doc_id in test_files_list:
        #doc_id = test_files_list[0]
        print('\tfile', i, 'of', len(test_files_list))

        # cargo todas las páginas de ese documento
        file_list = glob.glob(mypath + "ocr/" + doc_id + '/*')
        file_list.sort()

        # a cada una le calculo el embedding

        # este diccionario tiene, para cada página, su embedding
        embedding_dict = {}

        for f in file_list:

            # me quedo con el identificador de la página
            file_id = f.split('/')[-1].split('.')[0]
            #print(file_id)

            # estaría bueno encapsular esto en una función
            with open(f) as json_file:
                pagina = json.load(json_file)

            # armo la pagina
            full_page = []
            for b in pagina['Blocks']:
                if b['BlockType'] == 'LINE':
                    full_page.append(b['Text'])
                else:
                    continue

            full_page = ' '.join(full_page)

            doc = nlp(full_page)

            # me quedo con el primer 0.5 de las oraciones
            sentences_list = list(doc.sents)
            n_keep = int(0.5*len(sentences_list))
            text = ''.join(str(s) for s in sentences_list[:n_keep])

            # calculo el embedding
            embedding = model.encode(str(text))

            embedding_dict[file_id] = embedding

        # exporto
        dumped = json.dumps(embedding_dict, cls=NumpyEncoder)
        with open(mypath + 'embedding_results/test_full_doc/embeddings_' + doc_id + '_' + m + '.txt', 'w') as file:
            file.write(dumped)
        i=i+1
    j=j+1

model 0 of 6  ( distiluse-base-multilingual-cased-v1 )
	file 0 of 8
	file 1 of 8
	file 2 of 8
	file 3 of 8
	file 4 of 8
	file 5 of 8
	file 6 of 8
	file 7 of 8
model 1 of 6  ( stsb-xlm-r-multilingual )
	file 0 of 8
	file 1 of 8
	file 2 of 8
	file 3 of 8
	file 4 of 8
	file 5 of 8
	file 6 of 8
	file 7 of 8
model 2 of 6  ( quora-distilbert-multilingual )
	file 0 of 8
	file 1 of 8
	file 2 of 8
	file 3 of 8
	file 4 of 8
	file 5 of 8
	file 6 of 8
	file 7 of 8
model 3 of 6  ( paraphrase-xlm-r-multilingual-v1 )
	file 0 of 8
	file 1 of 8
	file 2 of 8
	file 3 of 8
	file 4 of 8
	file 5 of 8
	file 6 of 8
	file 7 of 8
model 4 of 6  ( paraphrase-multilingual-MiniLM-L12-v2 )
	file 0 of 8
	file 1 of 8
	file 2 of 8
	file 3 of 8
	file 4 of 8
	file 5 of 8
	file 6 of 8
	file 7 of 8
model 5 of 6  ( paraphrase-multilingual-mpnet-base-v2 )
	file 0 of 8
	file 1 of 8
	file 2 of 8
	file 3 of 8
	file 4 of 8
	file 5 of 8
	file 6 of 8
	file 7 of 8


## Evaluación
Empiezo con un solo documento y un modelo.