# Similarity Generation

In [1]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from sklearn.model_selection import train_test_split
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

from datasets import load_dataset
imdb = load_dataset("imdb")

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch with truncation enabled.

    Args:
        examples: Mapping-like batch exposing a ``"text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)
 
# Shuffle the IMDB training split once with a fixed seed so the split below is reproducible.
shuffled_dataset = imdb["train"].shuffle(seed=42)
split_index = int(0.7 * len(shuffled_dataset))
# Select the first 70% as the training set
train_dataset = shuffled_dataset.select(range(split_index))
# Select the remaining 30% as the validation set
val_dataset = shuffled_dataset.select(range(split_index, len(shuffled_dataset)))
# Tokenize both the training and validation datasets
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
# NOTE: despite its name, test_dataset is the shuffled IMDB test split *after* tokenization.
test_dataset = imdb["test"].shuffle(seed=42).map(preprocess_function, batched=True)

Map:   0%|          | 0/17500 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [2]:
from transformers import DataCollatorWithPadding
# Collator built from the tokenizer; handed to the Trainer instances further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Embed the raw (untokenized) training texts with the SentenceTransformer bound to `model`.
# NOTE(review): `model` is rebound to the DistilBERT classifier in a later cell, so
# re-running this cell after that one would call `.encode` on the wrong object.
embeddings = model.encode(train_dataset['text'], convert_to_tensor=True)
# Similarity of every training text against every other one (rows follow train_dataset order).
similarity = model.similarity(embeddings,embeddings)
# Detach and move to host memory as a NumPy array for the numba routines below.
similarity =similarity.detach().cpu().numpy()

In [3]:
import numpy as np
from numba import njit,prange
@njit(fastmath=True,parallel=True)
def QS(similarity, costs, delta, budget):
    """
    Prune the ground set in a single pass over the elements.

    Elements are visited in index order. For each one, the coverage objective
    (the sum over rows of the best similarity to any kept element, floored at
    zero) is re-evaluated as if the element were added. The element is kept
    when its marginal gain divided by its cost is at least
    ``delta / budget`` times the objective accumulated so far.

    Args:
        similarity (np.ndarray): NxN matrix of pairwise similarity scores.
        costs (np.ndarray): 1D array with the cost of each element.
        delta (float): Scaling constant of the acceptance threshold.
        budget (float): Budget used to normalise the acceptance threshold.

    Returns:
        np.ndarray: Length-N float array holding 1 for kept elements and 0 otherwise.
    """
    n_items = len(similarity)
    print('Size of unpruned ground set',n_items)

    # kept[j] == 1 marks element j as a member of the pruned ground set.
    kept = np.zeros(n_items)

    # best_cover[row] is the largest similarity between `row` and any kept element so far.
    best_cover = np.zeros(n_items)

    # Objective value of the elements kept so far.
    running_total = 0

    for candidate in range(n_items):
        # Objective value if `candidate` joined the kept elements (parallel sum reduction).
        trial_total = 0
        for row in prange(n_items):
            trial_total += max(best_cover[row], similarity[row, candidate])

        marginal = trial_total - running_total

        # Keep the candidate only if its gain per unit cost clears the threshold.
        if marginal / costs[candidate] >= delta / budget * running_total:
            running_total += marginal
            kept[candidate] = 1
            for row in range(n_items):
                best_cover[row] = max(best_cover[row], similarity[row, candidate])

    print('Size of pruned ground set',kept.sum())
    return kept

# Unit cost for every training example; delta and budget parameterise the QS threshold.
costs = np.ones(similarity.shape[0])
delta, budget = 0.1, 1000
pruned_ground_set = QS(similarity=similarity, costs=costs, delta=delta, budget=budget)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Size of unpruned ground set 17500
Size of pruned ground set 1355.0


In [4]:
from numba import njit,prange

@njit(fastmath=True,parallel=True)
def facility_location(similarity,costs,budget,ground_set):
    """
    Greedy budgeted selection for the facility-location objective.

    The objective of a selection is the sum over rows ``i`` of the largest
    ``similarity[i, j]`` among selected elements ``j``, floored at zero. Each
    round evaluates every eligible element (not yet selected, flagged in
    ``ground_set``, and still affordable) and adds the one giving the highest
    objective. The loop stops when the budget is spent, when no element is
    eligible, or when the best candidate no longer improves the objective.

    Args:
        similarity (np.ndarray): NxN matrix of pairwise similarity scores.
        costs (np.ndarray): 1D array with the cost of each element.
        budget (float): Maximum total cost of the selection.
        ground_set (np.ndarray): Length-N array; entries equal to 1 mark the
            elements that may be selected.

    Returns:
        tuple: ``(max_obj, solution_sparse)`` where ``max_obj`` is the objective
        value reached and ``solution_sparse`` is a length-N float array holding
        1 for selected elements and 0 otherwise.
    """
    N = len(similarity)

    max_obj = 0.0
    total_cost = 0.0
    solution_sparse = np.zeros(N)

    # max_similarity[i] is the best similarity of row i to any selected element so far.
    max_similarity = np.zeros(N)

    # `N > 0` keeps np.argmax away from an empty array.
    while N > 0 and total_cost < budget:

        # Ineligible elements keep the sentinel -1.0. Real objective values are
        # sums of non-negative terms, so the sentinel can never win the argmax
        # against an eligible candidate. (A finite sentinel is used instead of
        # -inf because fastmath assumes the absence of infinities.)
        obj_val = np.full(N, -1.0)

        for element in prange(N):
            if solution_sparse[element] == 0 and ground_set[element] == 1 and costs[element] + total_cost <= budget:
                total = 0.0
                for i in range(N):
                    total += max(max_similarity[i], similarity[i, element])
                obj_val[element] = total

        max_element = np.argmax(obj_val)

        # Stop when nothing is eligible (sentinel) or the best candidate adds no gain.
        # Previously an all-ineligible round selected index 0 and reset max_obj to 0.
        if obj_val[max_element] <= max_obj:
            break

        solution_sparse[max_element] = 1
        total_cost += costs[max_element]
        for i in range(N):
            max_similarity[i] = max(max_similarity[i], similarity[i, max_element])

        max_obj = obj_val[max_element]

    print(max_obj)
    print(solution_sparse.sum())
    return max_obj,solution_sparse


# Unit cost per training example, so the budget equals the number of examples to pick.
N = similarity.shape[0]
costs = np.ones(N)
budget = 1000

# Greedy selection restricted to the QS-pruned ground set.
obj_val_pruned, solution_pruned = facility_location(similarity, costs, budget, pruned_ground_set)
mask_pruned = np.flatnonzero(solution_pruned == 1)

# Same greedy selection with every element allowed, for comparison.
obj_val_unpruned, solution_unpruned = facility_location(similarity, costs, budget, np.ones(N))
mask_unpruned = np.flatnonzero(solution_unpruned == 1)


11270.472258493304
1000.0
11445.031223535538
1000.0


In [5]:
import evaluate
# Metric object loaded by name; compute_metrics below calls its .compute().
accuracy = evaluate.load("accuracy")

import numpy as np


def compute_metrics(eval_pred):
    """Score a ``(predictions, labels)`` pair with the module-level accuracy metric.

    Args:
        eval_pred: Two-item iterable ``(predictions, labels)``; the predicted
            class is the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted classes.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

# Class index -> label name, and the inverse mapping derived from it.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: index for index, name in id2label.items()}

In [6]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load DistilBERT with a 2-label sequence-classification head and the label maps.
# NOTE(review): this rebinds `model`, which earlier held the SentenceTransformer
# used to build the similarity matrix.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Display the training split summary (features and row count).
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 17500
})

In [9]:
# Fine-tuning configuration for the run on the selected subset:
# evaluation and checkpointing happen once per epoch, and the best checkpoint
# is restored when training finishes.
training_args = TrainingArguments(
    output_dir="text_classification",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    # Train only on the rows picked by facility_location over the QS-pruned ground set.
    # mask_pruned indexes the similarity matrix, which was built from train_dataset,
    # and tokenized_train keeps that same row order.
    train_dataset=tokenized_train.select(mask_pruned),
    # Validation split is used for the per-epoch evaluation.
    eval_dataset= tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train() 

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.44051888585090637, 'eval_accuracy': 0.8170666666666667, 'eval_runtime': 63.7897, 'eval_samples_per_second': 117.574, 'eval_steps_per_second': 7.352, 'epoch': 1.0}


  0%|          | 0/469 [00:00<?, ?it/s]

{'eval_loss': 0.34956881403923035, 'eval_accuracy': 0.8641333333333333, 'eval_runtime': 66.7428, 'eval_samples_per_second': 112.372, 'eval_steps_per_second': 7.027, 'epoch': 2.0}
{'train_runtime': 184.442, 'train_samples_per_second': 10.844, 'train_steps_per_second': 0.683, 'train_loss': 0.4089543176075769, 'epoch': 2.0}


TrainOutput(global_step=126, training_loss=0.4089543176075769, metrics={'train_runtime': 184.442, 'train_samples_per_second': 10.844, 'train_steps_per_second': 0.683, 'total_flos': 260935937715072.0, 'train_loss': 0.4089543176075769, 'epoch': 2.0})

In [15]:
# Point the trainer at the tokenized test split (test_dataset is already tokenized).
trainer.eval_dataset = test_dataset

In [16]:
# Pass the tokenized test split explicitly so the reported accuracy does not
# depend on whether another cell has mutated trainer.eval_dataset beforehand.
trainer.evaluate(eval_dataset=test_dataset)['eval_accuracy']

  0%|          | 0/1563 [00:00<?, ?it/s]

0.86104

<transformers.trainer.Trainer at 0x7ffa1c8db640>

In [None]:
# Baseline: fine-tune on the full training split for comparison with the pruned run.
training_args = TrainingArguments(
    output_dir="text_classification",
    learning_rate=2e-5,
    per_device_train_batch_size=25,
    per_device_eval_batch_size=25,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # push_to_hub=True,
)

# Start from a freshly initialised classifier. Reusing the existing `model` would
# continue training the network already fine-tuned on the pruned subset, so the
# baseline would not be an independent run.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    # The original referenced `tokenized_test`, which is not defined in the cells
    # above (NameError). The validation split is used here, as in the pruned run,
    # so checkpoint selection never sees the test data; evaluate on `test_dataset`
    # afterwards for the final number.
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [19]:
import pickle
def load_from_pickle(file_path):
    """
    Read one pickled object from disk and report where it came from.

    Unpickling can execute arbitrary code, so only call this on files from a
    trusted source.

    Parameters:
    - file_path: Location of the pickle file to open in binary mode.

    Returns:
    - The object deserialized from the file.
    """
    with open(file_path, 'rb') as stream:
        restored = pickle.load(stream)
    print(f'Data has been loaded from {file_path}')
    return restored

In [20]:
# Load previously saved results from the local pickle file 'IMDB'
# (a relative path, resolved against the current working directory).
df = load_from_pickle('IMDB')

Data has been loaded from IMDB


Unnamed: 0,Ratio,Budget,Training Set Size,Pruned Ground Set Size (FS from QS),Accuracy (FS + QS),Pruned Training Set Size (FS),Accuracy (FS),Accuracy
0,0.01,175.0,17500,224.0,0.5,175,0.5,0.9142


In [None]:
   Ratio  Budget  Training Set Size  ...  Accuracy (FS + QS)  Pruned Training Set Size (FS)  Accuracy (FS)
0   0.01   175.0              17500  ...             0.60652                            175        0.55408
1   0.05   875.0              17500  ...             0.86124                            875        0.86124
2   0.10  1750.0              17500  ...             0.87268                           1750        0.87268