In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [2]:
df = pd.read_json('/home/jovyan/medvedev/NeMo-Skills/nemo_skills/augmented_problems2.jsonl', lines=True)
df_orginal_problems = pd.read_json('/home/jovyan/medvedev/foundation_model/foundation_model/data/preprocessed/train/planner_train_small_onemessage.jsonl',  lines=True)

In [3]:
df.columns

Index(['original_problem', 'augmented_problem'], dtype='object')

In [4]:
df_orginal_problems.columns

Index(['Unnamed: 0', 'instruction', 'generation', 'response', 'feedback',
       'score', 'model_output', 'task', 'conditions', 'problem'],
      dtype='object')

In [5]:
# Load the sentence transformer model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Step 1: Merge original and augmented datasets
# Assume `original_id` is a unique identifier for each original problem if available
merged_df = pd.merge(df, df_orginal_problems, left_on='original_problem', right_on='problem', how='inner')

# Step 2: Concatenate 'problem' and 'augmented_problem' columns into a single column for deduplication
merged_df = pd.concat(
    [merged_df[['instruction', 'generation', 'response', 'feedback', 'score', 'model_output', 'task', 'conditions', 'problem']],
    merged_df[['augmented_problem']].rename(columns={'augmented_problem': 'problem'})],
    ignore_index=True
)

# Step 3: Deduplicate based on exact matches
dedup_df = merged_df.drop_duplicates(subset=['problem']).reset_index(drop=True)

# Step 4: Deduplicate based on cosine similarity
# Compute embeddings for cosine similarity deduplication



In [6]:
embeddings = model.encode(dedup_df['problem'].tolist(), convert_to_tensor=True, show_progress_bar=True, batch_size=1024)

Batches:   0%|          | 0/774 [00:00<?, ?it/s]

In [13]:
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch

In [53]:
batch_size = 128
duplicated_indexes = set()
first_occurrences = set()
similarity_threshold = 0.95

In [63]:
def find_duplicates_cosine(embeddings: torch.Tensor, batch_size: int = 128, similarity_threshold: float = 0.91):
    duplicated_indexes = set()
    first_occurrences = set()
    
    for i in tqdm(range(0, embeddings.size(0), batch_size)):
        # Get batch embeddings for comparisons
        batch_embeddings = embeddings[i:i + batch_size]

        # Calculate cosine similarities between batch and all embeddings
        cosine_scores = util.cos_sim(batch_embeddings, embeddings)

        # Identify duplicates in each batch
        for batch_idx in range(batch_embeddings.size(0)):
            global_idx = i + batch_idx  # Get global index for the current row

            # Skip if the current index is already marked as a duplicate
            if global_idx in duplicated_indexes:
                continue

            # Find top similarity scores for current row in batch
            topk = torch.topk(cosine_scores[batch_idx], k=2)

            # Check if the highest similarity (excluding self) is above the threshold
            if topk.values[1].item() >= similarity_threshold:
                # Get the index of the most similar row
                duplicate_idx = topk.indices[1].item()

                # Keep the first occurrence and mark subsequent duplicates
                if duplicate_idx in first_occurrences:
                    duplicated_indexes.add(global_idx)
                else:
                    first_occurrences.add(global_idx)
                    
    return duplicated_indexes

In [64]:
duplicated_indexes = find_duplicates_cosine(embeddings)
len(duplicated_indexes)

100%|██████████| 6189/6189 [03:20<00:00, 30.81it/s]


228651

In [68]:
df_deduplicated = dedup_df.drop(index=list(duplicated_indexes)).reset_index(drop=True)


In [72]:
df_deduplicated.to_json("deduplicated_augumented_problems.jsonl", orient="records", lines=True)