### Setup

In [1]:
import os

# Root of the XC workspace. Defaults to the original cluster location, but can
# be redirected through the XC_WORK_DIR environment variable so the notebook
# runs on other machines without editing this cell.
work_dir = os.environ.get("XC_WORK_DIR", "/scratch/cse/phd/anz198717/XC")
# Name of the dataset folder under <work_dir>/Corpus.
corpus_dataset = "AmazonTitles-1.3M-dummy"
# Text-encoder identifier (not referenced by the other cells visible here).
txt_model = "sentencebert"
# Directory holding the corpus files of the selected dataset.
corpus_dset = f"{work_dir}/Corpus/{corpus_dataset}"

In [2]:
import site
import sys
# Register the project checkout as a site directory; presumably this is where
# the `xc` package imported below lives - verify against the cluster layout.
site.addsitedir(f"{work_dir}/programs/ExtremeMethods")
import matplotlib
# Select the non-interactive Agg backend.
# NOTE(review): the `%matplotlib inline` magic on the next line also configures
# a backend; presumably it takes precedence inside the notebook - confirm which
# of the two is actually intended.
matplotlib.use('Agg')
%matplotlib inline
# `msr` is only referenced by the commented-out cell that follows.
import xc.tools.build_from_msr as msr

## Only for internal datasets

In [3]:
# args=f"--in_dir {corpus_dset} --ot_dir {corpus_dset} \
#     --docs_input corpus_data.txt --lbls_input corpus_x_y.txt"
# sys.argv = f"TOKEN {args}".split()
# print(args)
# args = msr.setup()
# lines = msr.build_docs(args)
# msr.build_lbls(args, lines)

In [4]:
# Locations derived from the corpus directory.
# Output folder that receives the processed train/test/label files.
data_dir = corpus_dset + "/temp"
# Tab-separated image file that read_ptrs scans for byte offsets.
img_path = corpus_dset + "/img.bin"
# "uid->text" map files for the test documents and for the labels.
tst_map = corpus_dset + "/test_map.txt"
lbl_map = corpus_dset + "/label_map.txt"

## Tokenize and Build data

In [5]:
import xc.tools.tokenize_text as token
from tqdm.notebook import tqdm
from xc.libs.utils import pbar
import scipy.sparse as sp
import numpy as np
import os
# Expose only GPUs 0 and 1 to this process; this is set before torch is
# imported further down in this cell.
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"
# Replace the kernel's argv (sys comes from an earlier cell) with a single
# program name so that parse_args() below sees no extra arguments.
# NOTE(review): the remaining xc imports come after this line - possibly they
# read sys.argv at import time; confirm before reordering anything here.
sys.argv = f"XCSAGE".split()

from xc.libs.data_img import IMGBINDataset, read_raw_img_bin
from xc.libs.custom_dtypes import FeaturesAccumulator
from xc.libs.dataparallel import DataParallel
from xc.models.models_img import Model
import argparse
import torch

# The parser declares no arguments, so parse_args() yields an empty namespace
# whose fields are then filled in by hand.
parser = argparse.ArgumentParser(description='Pretrained models')
args = parser.parse_args()
# NOTE(review): presumably -1 disables the projection layer in Model - confirm.
args.project_dim = -1
# Read back as params.corpus_dir by get_pre_trained.
args.corpus_dir = corpus_dset
# Used as the DataLoader batch size in get_pre_trained.
args.batch_size = 256


def tokens(text_map, _tokenizer, max_len=32):
    """Tokenize the text column of a ``uid->text`` map file.

    Args:
        text_map: Path to a latin1-encoded file with one ``<uid>-><text>``
            entry per line; only the part after the first ``->`` is tokenized.
        _tokenizer: Callable tokenizer exposing ``vocab_size``. It is invoked
            with truncation and padding enabled and must return an object
            with ``input_ids`` and ``attention_mask`` attributes.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        tuple: ``(_tokens, max_vocab)`` where ``_tokens`` is an int32 array
        built by stacking the input ids and the attention mask along axis 1,
        and ``max_vocab`` is ``_tokenizer.vocab_size``.

    Raises:
        IndexError: If a line does not contain ``->``.
    """
    # Read inside a context manager so the handle is released as soon as all
    # lines have been consumed instead of staying open for the kernel's life.
    with open(text_map, "r", encoding="latin1") as handle:
        text = [line.strip().split("->", 1)[1]
                for line in pbar(handle, desc="docs")]
    encoded = _tokenizer(text, truncation=True, padding=True,
                         max_length=max_len, add_special_tokens=True)
    input_idx = np.asarray(encoded.input_ids, dtype=np.int32)
    attention = np.asarray(encoded.attention_mask, dtype=np.int32)
    max_vocab = _tokenizer.vocab_size
    _tokens = np.stack([input_idx, attention], axis=1)
    return _tokens, max_vocab


def read_ptrs(file):
    """Index the starting byte offset of every record in a tab-separated file.

    The file is read in binary mode, one line per record; the bytes before the
    first tab are decoded as UTF-8 and used as the record's uid.

    Args:
        file: Path of the file to scan.

    Returns:
        dict: uid (str) -> list of byte offsets at which lines with that uid
        start, in file order. Empty when ``file`` does not exist.

    Raises:
        ValueError: If a line contains no tab character.
    """
    offsets_by_uid = {}
    if not os.path.exists(file):
        return offsets_by_uid
    with open(file, "rb") as stream:
        line_start = 0
        for record in pbar(stream):
            raw_uid, _ = record.split(b"\t", 1)
            key = raw_uid.decode('utf-8')
            offsets_by_uid.setdefault(key, []).append(line_start)
            # Position after this line == start of the next one.
            line_start = stream.tell()
    return offsets_by_uid

def build_sparse_mat(doc_map, dict_ptrs):
    """Build a document-by-pointer sparse matrix of image offsets.

    Every line of ``doc_map`` contributes one row. The text before the first
    ``->`` is split on ``,`` into uids, each uid is looked up in ``dict_ptrs``
    and every pointer found gets a column of its own (columns are numbered
    consecutively across the whole file).

    Args:
        doc_map: Path to a latin1-encoded ``<uid[,uid...]>-><text>`` file.
        dict_ptrs: Mapping uid -> list of integer offsets (as produced by
            ``read_ptrs``).

    Returns:
        scipy.sparse CSR matrix of shape ``(n_lines, total_pointers)`` using
        the default ``lil_matrix`` dtype. Stored values are ``offset + 1``;
        a uid missing from ``dict_ptrs`` yields ``-1 + 1 == 0``, which leaves
        its cell at zero. An empty ``doc_map`` gives a ``0 x 0`` matrix.
    """
    # Context manager so the map file is closed once the uids are collected.
    with open(doc_map, "r", encoding="latin1") as handle:
        uids = [line.split("->", 1)[0] for line in pbar(handle)]
    if not uids:
        # np.concatenate cannot handle an empty list; return an empty matrix.
        return sp.csr_matrix((0, 0))

    ptrs, cols, rows, num_cols = [], [], [], 0
    for row, uid in pbar(enumerate(uids), desc="buildling"):
        # NOTE: offsets are shifted by 1 so that a genuine offset of 0 remains
        # distinguishable from "no image" (0) inside the sparse matrix.
        sub_ptrs = np.concatenate(
            [dict_ptrs.get(key, [-1]) for key in uid.split(",")]) + 1
        ptrs.append(sub_ptrs)
        cols.append(np.arange(sub_ptrs.size) + num_cols)
        # Integer row indices: float arrays are not valid sparse indices.
        rows.append(np.full(sub_ptrs.size, row, dtype=np.int64))
        num_cols += sub_ptrs.size

    image_mat = sp.lil_matrix((len(uids), num_cols))
    image_mat[np.concatenate(rows), np.concatenate(cols)] = np.concatenate(ptrs)
    return image_mat.tocsr()

def save(data_path, file_name, img, txt):
    """Persist the image-pointer matrix and the token array of one split.

    Files are written next to each other as ``<data_path>/<file_name>.*``:
    ``.img.bin.npz`` (only when ``img`` has stored entries),
    ``.txt.seq.memmap.dat`` (int32 memmap of the tokens) and
    ``.txt.seq.memmap.meta`` (one line: the three dimensions and the
    vocabulary size, comma-separated).

    Args:
        data_path: Output directory.
        file_name: Base name shared by all written files.
        img: scipy sparse matrix of image pointers.
        txt: ``(token_array, max_vocab)`` pair; ``token_array`` must be 3-D.
    """
    prefix = os.path.join(data_path, file_name)

    # A matrix without stored entries is not written at all.
    if img.nnz > 0:
        sp.save_npz(f"{prefix}.img.bin.npz", img)

    token_array, max_vocab = txt
    mapped = np.memmap(f"{prefix}.txt.seq.memmap.dat", dtype=np.int32,
                       mode="w+", shape=token_array.shape)
    mapped[:] = token_array
    mapped.flush()

    inst, channel, length = token_array.shape
    with open(f"{prefix}.txt.seq.memmap.meta", "w") as meta:
        meta.write(f"{inst},{channel},{length},{max_vocab}\n")

## Fetching raw image vectors

In [6]:
def collate_fn(batch):
    """Concatenate the raw vectors of all items in ``batch`` along dim 0.

    Args:
        batch: Sequence of objects exposing ``get_raw_vect()``.

    Returns:
        torch.Tensor: The per-item tensors joined along the first dimension.
    """
    raw_vectors = [item.get_raw_vect() for item in batch]
    return torch.cat(raw_vectors, dim=0)


def get_pre_trained(model, img_file, params):
    """Run a pre-trained image model over a dataset and collect its features.

    Args:
        model: Model identifier forwarded to ``Model`` (the notebook passes
            ``"ViT"``).
        img_file: Forwarded to ``IMGBINDataset`` as its second argument (the
            notebook passes a ``*.img.bin.npz`` path).
        params: Namespace providing ``corpus_dir`` and ``batch_size``; it is
            also handed to ``Model``.

    Returns:
        FeaturesAccumulator: The accumulator after ``compile()`` and
        ``remap(dataset.data)`` have been applied.
    """
    network = DataParallel(Model(model, params))
    dataset = IMGBINDataset(params.corpus_dir, img_file, "img.bin", random_k=-1)
    # dataset.read_func = read_raw_img_bin
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=params.batch_size, collate_fn=collate_fn,
        shuffle=False, num_workers=6, prefetch_factor=2)
    features = FeaturesAccumulator("Image features", "memmap", ".img.vect")

    # Inference only: no gradients, eval mode, mixed precision on the GPU.
    with torch.no_grad():
        network = network.cuda().eval()
        with torch.cuda.amp.autocast():
            for batch in tqdm(loader):
                embs, mask = network(batch)
                features.transform(embs, mask)

    features.compile()
    features.remap(dataset.data)
    return features

In [7]:
# Tokenizer shared by every split.
_tokenizer = token.AutoTokenizer.from_pretrained(
    "sentence-transformers/msmarco-distilbert-base-v4",
    do_lower_case=True,
)
# uid -> byte offsets of that uid's lines inside the image file.
dict_ptrs = read_ptrs(img_path)

# Token arrays for the test and label splits.
tst_txt = tokens(tst_map, _tokenizer)
lbl_txt = tokens(lbl_map, _tokenizer)

# Image-pointer matrices for the same two splits.
tst_img = build_sparse_mat(tst_map, dict_ptrs)
lbl_img = build_sparse_mat(lbl_map, dict_ptrs)

# Write both splits into the output folder.
os.makedirs(data_dir, exist_ok=True)
save(data_dir, "test", tst_img, tst_txt)
save(data_dir, "label", lbl_img, lbl_txt)

docs: 970237it [00:00, 1087165.72it/s]
docs: 1305265it [00:01, 1074220.80it/s]
970237it [00:00, 1296265.58it/s]
buildling: 970237it [00:15, 64480.39it/s]
1305265it [00:00, 1336204.46it/s]
buildling: 1305265it [00:20, 63698.07it/s]


In [8]:
# Name the train map once (like tst_map / lbl_map) instead of repeating the
# path literal in every call.
trn_map = f"{corpus_dset}/train_map.txt"
trn_txt = tokens(trn_map, _tokenizer)
trn_img = build_sparse_mat(trn_map, dict_ptrs)
save(data_dir, "train", trn_img, trn_txt)

docs: 2248619it [00:02, 1096099.53it/s]
2248619it [00:01, 1351008.50it/s]
buildling: 2248619it [00:34, 64649.83it/s]


In [None]:
# Create the ViT output folder up front so a missing directory cannot abort
# the run after the expensive feature extraction has already finished.
# NOTE(review): FeaturesAccumulator.save may create it on its own - the call
# below is idempotent either way.
os.makedirs(os.path.join(data_dir, "ViT"), exist_ok=True)

tst_vect = get_pre_trained("ViT", f"{data_dir}/test.img.bin.npz", args)
tst_vect.save(os.path.join(data_dir, "ViT/test"))

lbl_vect = get_pre_trained("ViT", f"{data_dir}/label.img.bin.npz", args)
lbl_vect.save(os.path.join(data_dir, "ViT/label"))