# Create new training `.npz` for LF-ORCAS-800K

In [None]:
import os
import scipy.sparse as sp
import numpy as np
import pickle
import json
from tqdm import tqdm, trange

import scipy.sparse as sp

In [None]:
# Directory with the step-1 SLM large-scale-inference artifacts for ORCAS.
# The cells below read the generated rewrites file, start_indices and the
# neighbour index/score matrices from here (per the original note it also
# holds the trn and rewrite embeddings).
dataset_path = "../../artifacts/step-1/slm-large-scale-inference/orcas"

# Directory of the original (biased) ORCAS dataset: trn_X_Y.npz and the raw/ text files.
original_dataset_path = "../../datasets/orcas"

In [None]:
# List the artifact directory to confirm the files loaded below are present.
!ls {dataset_path}

In [None]:
# Offsets into the rewrite rows, grouped by label: the loop further down treats
# start_indices[i] : start_indices[i + 1] as the rows of the rewrites that were
# generated for label i.
start_indices = np.load(f"{dataset_path}/start_indices.npy")
start_indices.shape

In [None]:
# Eyeball the raw offsets.
start_indices

In [None]:
# Neighbour lookup results with one row per rewrite (rows are sliced with
# start_indices further down). The indices are later used as row ids of the
# training matrix; the scores are the matching similarities.
# Columns are presumably ordered best-first, since only the first top_k are
# kept later — TODO confirm against the retrieval step.
nbr_query_indices = np.load(f"{dataset_path}/ngbr_indices_mat.npy")
nbr_query_scores = np.load(f"{dataset_path}/ngbr_scores_mat.npy")
nbr_query_indices.shape, nbr_query_scores.shape

In [None]:
# Normalize the scores from [-1, 1] to [0, 1]
# (assumes the raw scores lie in [-1, 1]; the next cell prints min/max to check).
# NOTE: not idempotent — re-running this cell rescales the already-normalized
# values again, so reload the scores before re-running it.
nbr_query_scores = (nbr_query_scores + 1) / 2

In [None]:
# Sanity check: after normalization min and max should fall inside [0, 1].
nbr_query_scores.min(), nbr_query_scores.max(), nbr_query_scores.mean()

In [None]:
# Load the original biased trn_X_Y as a scipy sparse matrix.
# Its shape is reused for the synthetic matrix built below; rows are indexed by
# neighbour query indices and columns by label ids there, so presumably
# rows = training queries and columns = labels.
trn_X_Y = sp.load_npz(os.path.join(original_dataset_path, "trn_X_Y.npz"))
trn_X_Y.shape

In [None]:
# Load the generated synthetic queries/rewrites: one entry per line of
# raw_rewrites.txt, with surrounding whitespace stripped.
# The context manager closes the file handle as soon as the list is built
# (the bare open() used previously left it open until garbage collection).
with open(os.path.join(dataset_path, "raw_rewrites.txt")) as rewrites_file:
    rewrites = [line.strip() for line in rewrites_file]

In [None]:
# Peek at the first few rewrites.
rewrites[:5]

In [None]:
# Read the raw text files of the original dataset, one stripped entry per line
# (presumably training query text and label text, judging by the file names).
# Context managers close both handles deterministically, and iterating the file
# object directly avoids the extra full-file list that .readlines() builds.
with open(os.path.join(original_dataset_path, "raw/trn_X.txt"), "r") as trn_file:
    trn_doc = [line.strip() for line in trn_file]
with open(os.path.join(original_dataset_path, "raw/Y.txt"), "r") as lbl_file:
    lbl = [line.strip() for line in lbl_file]

In [None]:
# Peek at the first few entries of each raw text file.
trn_doc[:5], lbl[:5]

In [None]:
top_k = 10 # number of neighbour columns kept per rewrite
threshold = 0.8 # minimum similarity to keep a neighbour; compared against the normalized [0, 1] scores, i.e. a raw score of 0.6
# NOTE: you may tweak the values above depending on your dataset! However, in our experiments, these values worked well :)

## Now create the npz for the new dataset

In [None]:
# Per-label accumulators filled by the loop in the next cell:
#   pos_pairs  -> arrays of (query index, label id) rows
#   pos_scores -> the matching similarity scores
pos_pairs, pos_scores = [], []

In [None]:
# For every label, gather the training queries that are close neighbours of
# that label's rewrites and keep the best similarity score per query.
for label_id in trange(len(start_indices) - 1):
    # Rows [row_lo, row_hi) of the neighbour matrices belong to this label's rewrites.
    row_lo, row_hi = start_indices[label_id], start_indices[label_id + 1]
    label_rows = slice(row_lo, row_hi)

    # Flatten the first top_k neighbours of all rewrites of this label.
    cand_idx = nbr_query_indices[label_rows, :top_k].reshape(-1)
    cand_score = nbr_query_scores[label_rows, :top_k].reshape(-1)

    # Discard neighbours below the similarity threshold; skip labels left with none.
    keep = cand_score >= threshold
    if not keep.any():
        continue
    cand_idx, cand_score = cand_idx[keep], cand_score[keep]

    # Sort by query index so duplicates become adjacent, then collapse each run
    # of equal indices into one entry carrying the maximum score of that run.
    order = np.argsort(cand_idx)
    sorted_idx = cand_idx[order]
    is_run_start = np.concatenate([[True], sorted_idx[1:] != sorted_idx[:-1]])
    uniq_idx = sorted_idx[is_run_start]
    best_score = np.maximum.reduceat(cand_score[order], np.flatnonzero(is_run_start))

    # One (query index, label id) row per surviving query.
    label_col = np.ones((uniq_idx.shape[0],), dtype=np.int32) * label_id
    pos_pairs.append(np.vstack((uniq_idx, label_col)).T)
    pos_scores.append(best_score)

In [None]:
# Stack the per-label chunks into one (num_pairs, 2) array of
# (query index, label id) rows and one aligned 1-D array of scores.
# np.concatenate fails with an opaque "need at least one array to concatenate"
# when the list is empty, so raise the same exception type with an actionable
# message instead.
# NOTE: this cell rebinds the two lists to arrays; re-run the two cells above
# before re-running this one.
if len(pos_pairs) == 0:
    raise ValueError(
        "No (query, label) pair passed the filter; "
        "lower `threshold` or increase `top_k` and rebuild the pairs."
    )
pos_pairs = np.concatenate(pos_pairs)
pos_scores = np.concatenate(pos_scores)

pos_pairs.shape, pos_scores.shape

In [None]:
# Build the synthetic label matrix: row ids come from column 0 of pos_pairs
# (neighbour query indices), column ids from column 1 (label ids), and the
# values are the normalized similarity scores. The shape is forced to match
# trn_X_Y so the two matrices can be added.
# scipy sums duplicate (row, col) entries; the loop above already keeps a
# single score per (query, label) pair, so none are expected here.
synthetic_X_Y = sp.csr_matrix((pos_scores, (pos_pairs[:, 0], pos_pairs[:, 1])), shape=trn_X_Y.shape)

In [None]:
# Element-wise sum of the two matrices: a pair present in both ends up with
# click value + synthetic score, which the next cell caps at 1.
trn_and_synthetic_X_Y = synthetic_X_Y + trn_X_Y # combine click and synthetic data

In [None]:
# Cap every stored value to the [0, 1] range (pairs present in both the click
# and the synthetic matrix can sum to more than 1), going through COO and back
# to CSR.
merged_coo = trn_and_synthetic_X_Y.tocoo()
merged_coo.data = np.clip(merged_coo.data, 0, 1)
trn_and_synthetic_X_Y = merged_coo.tocsr()
del merged_coo  # drop the temporary COO copy

In [None]:
# Save the combined synthetic + click training matrix.
# Note it is written next to the artifacts (dataset_path), not into the
# original dataset directory.
sp.save_npz(os.path.join(dataset_path, "trn_and_synthetic_X_Y.npz"), trn_and_synthetic_X_Y)

# Now train your favourite XC model using this new trn_and_synthetic_X_Y.npz training matrix