In [18]:
import argparse
import ast
import json
import time

import numpy as np
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import torch
from tqdm import tqdm
import pandas as pd
from FlagEmbedding import FlagModel
import submodlib
from submodlib.functions.facilityLocation import FacilityLocationFunction
from datasets import Dataset

In [27]:
def parse_args(argv=None):
    """Parse the command-line options of the data-selection script.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``, which is the
            behaviour of a plain ``parse_args()`` call. Passing an explicit
            list makes the function usable from a notebook kernel or a test,
            where ``sys.argv`` holds unrelated kernel arguments.

    Returns:
        argparse.Namespace with the attributes ``pt_data_path``,
        ``json_data_path``, ``json_save_path`` (all required strings),
        ``sent_type``, ``ppl_type``, ``cluster_method``, ``reduce_method``,
        ``sample_num``, ``kmeans_num_clusters``, ``low_th`` and ``up_th``.

    Raises:
        SystemExit: raised by argparse when a required option is missing or a
            value cannot be converted to the declared type.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--pt_data_path", type=str, required=True)
    parser.add_argument("--json_data_path", type=str, required=True)
    parser.add_argument("--json_save_path", type=str, required=True)
    parser.add_argument("--sent_type", type=int, default=0)
    parser.add_argument("--ppl_type", type=int, default=0)
    parser.add_argument("--cluster_method", type=str, default='kmeans')
    parser.add_argument("--reduce_method", type=str, default='tsne')
    parser.add_argument("--sample_num", type=int, default=10)
    parser.add_argument("--kmeans_num_clusters", type=int, default=100)
    # Percentile bounds; note these CLI defaults (1 / 99) differ from the
    # 25 / 75 defaults of sample_middle_confidence_data.
    parser.add_argument("--low_th", type=int, default=1)
    parser.add_argument("--up_th", type=int, default=99)

    return parser.parse_args(argv)

def do_clustering(high_dim_vectors,cluster_method='kmeans',kmeans_num_clusters=100):
    """Cluster the given vectors and return the fitted clustering estimator.

    Args:
        high_dim_vectors: Feature matrix handed to ``KMeans.fit``.
        cluster_method: Name of the clustering algorithm. Only ``'kmeans'``
            is implemented.
        kmeans_num_clusters: Number of clusters requested from k-means.

    Returns:
        The fitted ``KMeans`` instance (created with ``random_state=0``).

    Raises:
        ValueError: if ``cluster_method`` is not a supported algorithm.
    """
    if cluster_method == 'kmeans':
        return KMeans(n_clusters=kmeans_num_clusters, random_state=0).fit(high_dim_vectors)

    # Fail loudly instead of falling through to an unbound local variable.
    raise ValueError(
        f"Unsupported cluster_method {cluster_method!r}; only 'kmeans' is implemented."
    )

def do_reduce_dim(high_dim_vectors):
    """Embed the vectors into two dimensions with t-SNE.

    The projection is meant for visualization. The reducer is created with
    ``random_state=0``; no other reduction method is implemented here.

    Args:
        high_dim_vectors: Feature matrix handed to ``TSNE.fit_transform``.

    Returns:
        The two-component embedding produced by ``TSNE.fit_transform``.
    """
    reducer = TSNE(n_components=2, random_state=0)
    return reducer.fit_transform(high_dim_vectors)

def sample_middle_confidence_data(cluster_labels, confidences, n, low_th=25, up_th=75):
    """Pick up to ``n`` mid-confidence samples from every cluster.

    For each distinct cluster label, the samples whose confidence lies
    between the ``low_th`` and ``up_th`` percentiles of that cluster
    (inclusive on both ends) are kept, and at most ``n`` of them are taken
    with a constant stride.

    Args:
        cluster_labels: One cluster label per sample (array-like).
        confidences: One score per sample, aligned with ``cluster_labels``.
        n: Maximum number of samples to return per cluster; must be positive.
        low_th: Lower percentile (0-100) of the accepted confidence band.
        up_th: Upper percentile (0-100) of the accepted confidence band.

    Returns:
        dict mapping each cluster label that occurs in ``cluster_labels`` to
        a NumPy array of selected sample indices (positions in the inputs).

    Raises:
        ValueError: if ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer")

    cluster_labels = np.asarray(cluster_labels)
    confidences = np.asarray(confidences)

    middle_confidence_samples = {}

    # Iterate over the labels that actually occur. Counting the unique labels
    # and assuming they are exactly 0..k-1 would skip clusters whenever the
    # label ids are not contiguous.
    for label in np.unique(cluster_labels).tolist():
        # Positions of this cluster's members, in ascending index order.
        member_indices = np.where(cluster_labels == label)[0]

        # Fewer than n members: keep the whole cluster.
        if len(member_indices) < n:
            middle_confidence_samples[label] = member_indices
            continue

        cluster_confidences = confidences[member_indices]
        lower_threshold = np.percentile(cluster_confidences, low_th)
        upper_threshold = np.percentile(cluster_confidences, up_th)

        in_band = (cluster_confidences >= lower_threshold) & (cluster_confidences <= upper_threshold)
        middle_indices = member_indices[in_band]

        if len(middle_indices) < n:
            # Fewer than n samples fall inside the band: keep all of them.
            middle_confidence_samples[label] = middle_indices
        else:
            # Constant-stride pick over the in-band members. The members are
            # ordered by sample index, not by confidence.
            step_size = len(middle_indices) // n
            middle_confidence_samples[label] = middle_indices[::step_size][:n]

    return middle_confidence_samples

### Get Embeddings with a Sentence-Embedding Model (BGE)

### Parse LLM File

In [20]:
# Load the LLM prompt file into a DataFrame.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
All_Data = pd.read_json('/data/home/wangys/transfer-er/Pipeline/Amazon-Google/LLM_file/Amazon-Google-Train-Match-P1.json')

In [9]:
def cut_input_length(row):
    """Return the part of ``row['instruction']`` between the two prompt markers.

    The text after the first ``'at the final judgement.'`` and before the
    following ``'Take these examples as reference:'`` is returned. When the
    start marker is absent (for example because the text was already trimmed
    by an earlier run of the cell) the text is used from its beginning, so
    applying the function twice gives the same result as applying it once.

    Args:
        row: Mapping-like row exposing the prompt under ``'instruction'``.

    Returns:
        The trimmed prompt string.
    """
    start_marker = 'at the final judgement.'
    end_marker = 'Take these examples as reference:'

    prompt = row['instruction']
    pieces = prompt.split(start_marker)
    # pieces[1] is the segment right after the first start marker; fall back
    # to the whole prompt instead of raising IndexError when it is missing.
    body = pieces[1] if len(pieces) > 1 else prompt
    return body.split(end_marker)[0]
# Replace the 'instruction' column in place with the trimmed prompt, computed row by row.
All_Data['instruction'] = All_Data.apply(cut_input_length,axis=1)

In [21]:
# Split every prompt into its two entity descriptions and read the label.
left_list = []
right_list = []
label_list = []
for index,row in All_Data.iterrows():
    # Explicit positional access (first / last column). Plain row[0] / row[-1]
    # on a label-indexed Series relies on a deprecated positional fallback.
    text = row.iloc[0]
    Entity_1 = text.split('\n\nEntity 1:')[1].split('\n\nEntity 2')[0]
    Entity_2 = text.split('\n\nEntity 2:')[1].split('\n\nTake these examples as reference:')[0]
    # The last column holds a dict literal as text, e.g. "{'Output': 'match'}".
    # literal_eval parses it without executing arbitrary code from the file.
    label = ast.literal_eval(row.iloc[-1])['Output']
    left_list.append(Entity_1)
    right_list.append(Entity_2)
    label_list.append(label)

  text = row[0]
  label = eval(row[-1])['Output']


In [22]:
import os
# Expose only GPU index 7 to this process.
# NOTE(review): hard-coded device id; presumably this has to run before the
# model first touches CUDA — confirm.
os.environ['CUDA_VISIBLE_DEVICES']='7'
# Load the BGE embedding model from a local directory with use_fp16 enabled.
model = FlagModel('../sentence_transformer_model/bge-large-en-1.5/', 
                  use_fp16=True)
# Encode the left entities, the right entities and the label strings separately;
# the three results are concatenated feature-wise in a later cell.
embedding_a = model.encode(left_list)
embedding_b = model.encode(right_list)
embedding_c = model.encode(label_list)

Inference Embeddings: 100%|██████████| 15/15 [00:02<00:00,  6.33it/s]
Inference Embeddings: 100%|██████████| 15/15 [00:02<00:00,  7.11it/s]
Inference Embeddings: 100%|██████████| 15/15 [00:00<00:00, 38.28it/s]


In [48]:
# Load the per-sample score table (presumably perplexities, judging by the
# file name); the first CSV column becomes the index.
# A JSON source (ppl_qwen2.5-7B-AG-Short.json) was used here previously.
ppl = pd.read_csv('ppl/ppl_qwen2.5-0.5B-AG-short.csv',index_col=0)

In [49]:
# Preview the loaded score table.
ppl

Unnamed: 0,0
0,3.828125
1,5.406250
2,5.375000
3,4.781250
4,4.281250
...,...
3658,4.281250
3659,4.562500
3660,6.031250
3661,4.437500


In [50]:
import numpy as np
# First column of the score table as a plain Python list.
ppl_list = ppl.iloc[:,0].to_list()
# Concatenate the three embeddings along the feature axis: one row per sample.
pt_data = np.concatenate([embedding_a,embedding_b,embedding_c],axis=1)
# Scores and embeddings come from different files and are later matched purely
# by row position, so a length mismatch must not go unnoticed.
assert len(ppl_list) == pt_data.shape[0], (
    f"score/embedding row mismatch: {len(ppl_list)} scores vs {pt_data.shape[0]} embeddings"
)
pt_data.shape

(3663, 3072)

In [51]:
# Inputs come from the notebook state above rather than from the command line:
# the parse_args() / torch.load(args.pt_data_path) / json.load(args.json_data_path)
# flow is intentionally disabled in this cell.
# NOTE(review): absolute, machine-specific path. This is the "-wo-RAG" variant
# of the prompt file; its rows are assumed to line up with the embedded rows — confirm.
file_path = '/data/home/wangys/transfer-er/Pipeline/Amazon-Google/LLM_file/Amazon-Google-Train-Match-P1-wo-RAG.json'
json_data = pd.read_json(file_path)

# The concatenated embeddings are used directly as the clustering features.
high_dim_vectors = pt_data

# Scores as an array so they can be indexed with integer index arrays.
ppl_array = np.array(ppl_list)

# K-means with 100 clusters; labels_ holds one cluster id per input row.
clustering = do_clustering(high_dim_vectors,kmeans_num_clusters=100)
cluster_labels = clustering.labels_

def get_json_sample(middle_confidence_samples):
    """Flatten the per-cluster index arrays into one list of sample indices.

    Args:
        middle_confidence_samples: dict mapping a cluster id to an array-like
            of sample indices that supports ``.tolist()``.

    Returns:
        A single list with the indices of all clusters, in dict order. The
        function returns indices only; it does not look up the JSON records.
    """
    return [
        sample_id
        for cluster_ids in middle_confidence_samples.values()
        for sample_id in cluster_ids.tolist()
    ]

# Per cluster, keep up to 10 samples whose score lies between the cluster's
# 25th and 75th percentile.
middle_confidence_samples = sample_middle_confidence_data(cluster_labels, ppl_array, n = 10, low_th=25, up_th = 75)

# Flat list of the selected row positions.
new_data = get_json_sample(middle_confidence_samples)
# Label distribution of the selected rows.
json_data.iloc[new_data]['output'].value_counts()
# Writing the selection to args.json_save_path is disabled in this cell.


output
{'Output': 'mismatch'}    741
{'Output': 'match'}       250
Name: count, dtype: int64

In [52]:
# Show the selected rows (positional lookup by the sampled indices).
json_data.iloc[new_data]

Unnamed: 0,instruction,input,output
299,You are an expert in detecting if two text des...,,{'Output': 'mismatch'}
540,You are an expert in detecting if two text des...,,{'Output': 'mismatch'}
1010,You are an expert in detecting if two text des...,,{'Output': 'mismatch'}
1283,You are an expert in detecting if two text des...,,{'Output': 'mismatch'}
1707,You are an expert in detecting if two text des...,,{'Output': 'mismatch'}
...,...,...,...
2238,You are an expert in detecting if two text des...,,{'Output': 'mismatch'}
2538,You are an expert in detecting if two text des...,,{'Output': 'mismatch'}
2665,You are an expert in detecting if two text des...,,{'Output': 'mismatch'}
3258,You are an expert in detecting if two text des...,,{'Output': 'mismatch'}


In [36]:
# Persist the selected rows as a JSON array of records.
# The records are built before the file is opened, so a failure while
# converting the frame cannot leave a truncated file behind. The `with`
# block guarantees the handle is flushed and closed.
selected_records = json_data.iloc[new_data].to_dict(orient='records')
with open('train/AG-train-init.json', 'w', encoding='utf-8') as fw:
    json.dump(selected_records, fw, ensure_ascii=False, indent=4)

In [99]:
def do_fla(X, number_all, number_select):
    """Greedily select a subset of rows with a facility-location objective.

    Builds a dense, cosine-metric ``FacilityLocationFunction`` over ``X`` and
    maximizes it with the ``LazyGreedy`` optimizer. The elapsed wall-clock
    time is printed.

    Args:
        X: Data matrix passed to the facility-location function.
        number_all: Ground-set size ``n`` given to the function (callers in
            this notebook pass ``X.shape[0]``).
        number_select: Selection budget.

    Returns:
        Tuple ``(idx_list, greedyList)``: ``greedyList`` is the raw result of
        ``maximize`` and ``idx_list`` holds the first item of each of its
        entries (presumably the selected element's index — confirm against
        the submodlib documentation).
    """
    t0 = time.time()

    facility_location = FacilityLocationFunction(n=number_all, mode="dense", data=X, metric="cosine")
    greedyList = facility_location.maximize(
        budget=number_select,
        optimizer='LazyGreedy',
        stopIfZeroGain=False,
        stopIfNegativeGain=False,
        verbose=False,
    )
    idx_list = [entry[0] for entry in greedyList]

    print('FLA time used:',(time.time()-t0),'(second)')
    return idx_list,greedyList

# Facility-location selection over the full embedding matrix with a budget of 1000.
idx_list,greedyList = do_fla(high_dim_vectors,high_dim_vectors.shape[0],number_select=1000)

FLA time used: 52.16716647148132 (second)


[||||||||||||||||||||]100% [Iteration 1000 of 1000]

In [104]:
# Row positions of every cluster's members, keyed by cluster id.
# The id range is taken from the fitted model, so it follows the cluster
# count used for k-means and does not need a duplicated literal.
cluster_indices = {i: np.where(cluster_labels == i)[0] for i in range(clustering.n_clusters)}

In [109]:
# Facility-location selection inside cluster 1 only, with a budget of 32.
# NOTE(review): this rebinds idx_list / greedyList from the full-data run, and
# the returned indices presumably refer to rows of the cluster subset rather
# than rows of high_dim_vectors — confirm before mapping them back.
idx_list,greedyList = do_fla(high_dim_vectors[cluster_indices[1]],high_dim_vectors[cluster_indices[1]].shape[0],number_select=32)

FLA time used: 0.13275599479675293 (second)


[||||||||||||||||||||]100% [Iteration 32 of 32] ]71% [Iteration 23 of 32]

In [113]:
# Number of samples assigned to cluster 1.
high_dim_vectors[cluster_indices[1]].shape[0]


114

### IDF Score Calculation

In [12]:
# First column of the "Short" and "Full" score files as plain lists.
# NOTE(review): the two lists are assumed to be aligned row for row — confirm.
condition_score = pd.read_json('ppl/ppl_qwen2.5-7B-AG-Short.json').iloc[:,0].to_list()
full_score = pd.read_json('ppl/ppl_qwen2.5-7B-AG-Full.json').iloc[:,0].to_list()


In [17]:
# Element-wise ratio of the "Full" score to the "Short" score for every sample.
IDF = np.array(full_score) / np.array(condition_score)
# Number of samples whose ratio is below 1.
np.sum(IDF < 1)

39