In [1]:
import json

import numpy as np
import pandas as pd

from extract_features import main
from copy_annotations.sheet import Sheet

### Declare configs

In [2]:
# --- Input: workbook, sheet to read, and the JSON annotations for that sheet ---
FILENAME = "../data/election-polls.xlsx"
SHEET_NAME = "colombia"
ANNOTATION_FILE = "../data/election-polls_colombia.json"

# Alternative input set; swap it with the three assignments above to use it.
# FILENAME = '../data/data.xlsx'
# SHEET_NAME = 'india_wheat'
# ANNOTATION_FILE = '../data/source_india_wheat.json'

# --- Intermediate workbook: written by the preprocessing cell, then handed to `main` ---
TRANSFORMED = "../data/temp.xlsx"

# --- Model and word-vector paths, all forwarded positionally to `main` ---
CE_MODEL = "../models/ce.model"
FE_MODEL = "../models/fe.model"
CL_MODEL = "../models/cl.model"
W2V = "../models/glove.840B.300d.txt"
INFERSENT_MODEL = "../models/infersent1.pkl"
VOCAB_SIZE = 60000

# --- Output path (not referenced by the cells visible in this notebook section) ---
OUT = "../output/output.json"

### Preprocessing

In [3]:
# Load the annotations. The encoding is given explicitly because JSON files are
# UTF-8, while a bare open() falls back to the platform's locale encoding and can
# mis-decode (or fail on) non-ASCII text on some systems.
with open(ANNOTATION_FILE, encoding='utf-8') as f:
    annotations = json.load(f)

# Read the raw sheet with no header row and no index column, so every cell of
# the sheet ends up as a data cell of the frame.
sheet_df = pd.read_excel(
    FILENAME,
    sheet_name=SHEET_NAME,
    engine='openpyxl',
    index_col=None,
    header=None,
)

# `Sheet` pairs the raw cells with their annotations and exposes the
# transformed frame / annotations used by the following cells.
sheet = Sheet(sheet_df, annotations)
transformed_annotations = sheet.transformed_annotations

# Persist the transformed frame (again without header/index) so that the
# feature extractor can read it back from TRANSFORMED.
sheet.transformed_df.to_excel(TRANSFORMED, sheet_name=SHEET_NAME, header=False, index=False)

### Create embeddings

In [4]:
# Run the feature extractor on the transformed workbook written by the
# preprocessing cell; all model paths come from the config cell.
embeddings = main(
    TRANSFORMED,
    SHEET_NAME,
    CE_MODEL,
    FE_MODEL,
    CL_MODEL,
    W2V,
    VOCAB_SIZE,
    INFERSENT_MODEL,
)

# Collapse the first two axes into one so that each row of `re_emb` is a single
# vector of length shape[2].
shape = embeddings.shape
flat_rows = shape[0] * shape[1]
re_emb = embeddings.reshape(flat_rows, shape[2])

loading word vectors...
loading word embeddings...
creating dict...
embeddings loaded!
initialize 40 text sentences...


100%|██████████| 40/40 [00:01<00:00, 23.38it/s]


initialize 0 numeric sentences...


0it [00:00, ?it/s]


torch.Size([16, 9, 552])


### Remove unlabelled data

In [5]:
# Flatten the transformed sheet and its per-cell annotation views into 1-D
# arrays of rows * cols entries each, so one index addresses the same cell in all of them.
shape = sheet.transformed_df.shape
n_cells = shape[0] * shape[1]

# Same evaluation order as writing the three lookups out by hand: id, type, role.
ids, types, roles = (
    sheet.represent_transformed_annotations(field).to_numpy().reshape(n_cells)
    for field in ('id', 'type', 'role')
)
values = sheet.transformed_df.to_numpy().reshape(n_cells)

In [6]:
# Keep only the labelled cells and export them as two row-aligned TSV files:
#   embeddings.tsv - one embedding vector per line
#   metadata.tsv   - a header line followed by one (id, role, type, value) record per line
cell_embeddings = []
metadata = [['id', 'role', 'type', 'value']]

for i, cell_id in enumerate(ids):
    if cell_id == 'UNLABELED':
        continue
    # .tolist() yields plain Python floats. list(row) on a tensor row would
    # instead produce one 0-d tensor object per element, which is slow to
    # convert and fails for tensors NumPy cannot convert implicitly.
    cell_embeddings.append(re_emb[i].tolist())
    metadata.append([cell_id, roles[i], types[i], values[i]])

# np.savetxt returns None, so its result is deliberately not bound to a name.
np.savetxt('embeddings.tsv', cell_embeddings, delimiter='\t')
np.savetxt('metadata.tsv', metadata, delimiter='\t', fmt='%s')
