In [1]:
import json
import os
import shutil
from itertools import combinations

import numpy as np
import pandas as pd

from copy_annotations.sheet import Sheet
from src.predict_labels import CellEmbeddingModelWrapper

### Declare configs

In [2]:
# --- Input data -------------------------------------------------------------
# Workbook whose sheets are embedded, and the annotation JSON loaded alongside it.
FILENAME = '../data/election-polls-single-sheet-comparison.xlsx'
ANNOTATION_FILE = '../data/election-polls_colombia.json'

# --- Output -----------------------------------------------------------------
# Sub-folder name used below '../output/block_detection/' when results are saved.
OUTPUT_DIR = 'election-polls'

# --- Model artefacts ----------------------------------------------------------
# All six values are handed, in this order, to CellEmbeddingModelWrapper:
# (CE_MODEL, FE_MODEL, CL_MODEL, W2V, VOCAB_SIZE, INFERSENT_MODEL).
CE_MODEL = '../models/ce.model'
FE_MODEL = '../models/fe.model'
CL_MODEL = '../models/cl.model'
W2V = '../models/glove.840B.300d.txt'  # word-vector text file (GloVe, judging by the name)
VOCAB_SIZE = 60_000
INFERSENT_MODEL = '../models/infersent1.pkl'

### Preprocessing

In [3]:
# Annotations are looked up by sheet name. The single annotation file is
# registered under the sheet name 'colombia'; every other sheet is loaded
# without annotations.
annotations = {}
# JSON is UTF-8 by specification, so do not depend on the platform default encoding.
with open(ANNOTATION_FILE, encoding='utf-8') as f:
    annotations['colombia'] = json.load(f)

# Open the workbook once, parse every sheet from that single handle, and let the
# context manager close it. header=None / index_col=None keep the raw cell grid
# (no row is promoted to column names, no column to an index).
sheets = {}
with pd.ExcelFile(FILENAME, engine='openpyxl') as workbook:
    for sheet_name in workbook.sheet_names:
        sheet_df = workbook.parse(sheet_name, index_col=None, header=None)

        if sheet_name in annotations:
            sheets[sheet_name] = Sheet(sheet_df, annotations[sheet_name])
        else:
            sheets[sheet_name] = Sheet(sheet_df)

BadZipFile: File is not a zip file

### Load model

In [4]:
# Build the embedding/label wrapper from the artefact paths declared in the
# config cell. Arguments are passed positionally, so their order matters.
model = CellEmbeddingModelWrapper(
    CE_MODEL,
    FE_MODEL,
    CL_MODEL,
    W2V,
    VOCAB_SIZE,
    INFERSENT_MODEL,
)

loading word vectors...
loading word embeddings...
creating dict...
embeddings loaded!


### Create embeddings

In [5]:
# Run the wrapper's label prediction on the workbook at FILENAME.
# The cell that saves sheet-pair data indexes this result by sheet name and reads
# the 'table_arrays', 'embeddings' and 'labels' entries of each sheet, addressing
# individual cells as [row][column].
embeddings = model.predict_labels(FILENAME)

Generating embedding for ../data/Book1.xls and s


XLRDError: No sheet named <'s'>

### Save sheet-pair embeddings and metadata (empty cells removed)

In [6]:
print('Started saving sheet pair data')
block_detection_output = f'../output/block_detection/{OUTPUT_DIR}'

# Start from a clean output tree so files from an earlier run cannot linger.
if os.path.exists(block_detection_output):
    shutil.rmtree(block_detection_output)

# Annotation columns (id / role / type) are currently switched off. To turn them
# back on, derive this flag per pair, e.g.
# `any(sheets[s].annotations for s in sheet_pair)`; the header and the rows
# below both widen themselves when the flag is True.
has_annotations = False

# One folder per unordered pair of sheets, each holding two row-aligned files:
#   embeddings.tsv - one embedding vector per kept cell
#   metadata.tsv   - a header row followed by one descriptive row per kept cell
for sheet_pair in combinations(embeddings.keys(), 2):
    dir_name = os.path.join(block_detection_output, " & ".join(sheet_pair))
    os.makedirs(dir_name)

    # Independent list objects for every pair. (A chained `a = b = []` would bind
    # all names to one shared list, so it is deliberately avoided here.)
    cell_embeddings = []
    header = ['sheet', 'text', 'label', 'coordinates']
    if has_annotations:
        header += ['id', 'role', 'type']
    metadata = [header]

    for s in sheet_pair:
        sheet_data = embeddings[s]

        if has_annotations:
            sheet = sheets[s]
            ids = sheet.represent_transformed_annotations('id').to_numpy()
            types = sheet.represent_transformed_annotations('type').to_numpy()
            roles = sheet.represent_transformed_annotations('role').to_numpy()

        for i, row in enumerate(sheet_data['table_arrays']):
            for j, value in enumerate(row):
                # Empty cells carry no text to compare, so they are left out of both files.
                if value == 'None' or value == '':
                    continue

                # Line breaks are dropped; tabs become spaces because a literal tab
                # would add a column to this row of the tab-delimited metadata file.
                text = value.replace('\r', '').replace('\n', '').replace('\t', ' ')

                cell_embeddings.append(list(sheet_data['embeddings'][i][j]))

                # Coordinates are written 1-based as |row_column|.
                record = [s, text, sheet_data['labels'][i][j], f'|{i + 1}_{j + 1}|']
                if has_annotations:
                    record += [ids[i][j], roles[i][j], types[i][j]]
                metadata.append(record)

    # metadata has exactly one extra row (the header); otherwise the two files
    # would no longer line up row by row.
    assert len(cell_embeddings) + 1 == len(metadata), 'embeddings and metadata rows are misaligned'

    print('Saving:', sheet_pair)
    np.savetxt(os.path.join(dir_name, 'embeddings.tsv'), cell_embeddings, delimiter='\t')
    # Explicit UTF-8 so non-ASCII cell text is written identically on every platform.
    np.savetxt(os.path.join(dir_name, 'metadata.tsv'), metadata, delimiter='\t', fmt='%s', encoding='utf-8')

Started saving sheet pair data


NameError: name 'embeddings' is not defined