### FILTER REDUNDANT DATA USING COSINE SIM

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np

# Device name passed to SentenceTransformer later in this cell.
device = "mps"

# Load the candidate records. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
with open('unique_.json', 'r', encoding='utf-8') as f:
    new_data = json.load(f)

# Function to combine instruction and input
def combine_instruction_input(data):
    """Build one text string per record from its instruction and input.

    Args:
        data: Iterable of dict-like records. Each record must provide an
            'instruction' key; the 'input' key is optional.

    Returns:
        A list with one string per record, in input order. The string is the
        instruction alone when 'input' is missing, None, or empty; otherwise
        it is the instruction, a single space, and the input.

    Raises:
        KeyError: If a record has no 'instruction' key.
    """
    instructions = []
    for d in data:
        instruction = d['instruction']
        # .get() tolerates records that omit 'input'; None is treated like ''.
        input_text = d.get('input')
        if input_text:
            instruction += ' ' + input_text
        instructions.append(instruction)
    return instructions

# One combined text per record, in the same order as new_data
new_instructions = combine_instruction_input(new_data)

# Sentence-embedding model, created on the configured device
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Embed every candidate text in a single call
new_embeddings = model.encode(new_instructions)

# Records kept so far, and their embeddings at matching positions
final_data = []
existing_embeddings = []

# Greedy pass in file order: a record is kept only when its highest cosine
# similarity to the records already kept is at most 0.7. The very first
# record has nothing to be compared against and is always kept.
for record, embedding in zip(new_data, new_embeddings):
    if existing_embeddings:
        similarity_scores = cosine_similarity([embedding], existing_embeddings)
        if not (np.max(similarity_scores) <= 0.7):
            # Too close to something already kept: drop it
            continue
    final_data.append(record)
    existing_embeddings.append(embedding)

# Persist the surviving records
with open('unique_data_best.json', 'w') as out_file:
    json.dump(final_data, out_file, indent=1)

### FILTER DATA SIMILAR TO TEST SET USING COSINE SIM

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np

# Device name passed to SentenceTransformer later in this cell.
device = "cuda" # "mps" and "cpu" also work

# Load training and test JSON data. Both files are decoded as UTF-8 explicitly
# so the result does not depend on the platform's default locale encoding.
with open('merged_deduped.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

with open('test_strings.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Function to combine instruction and input (if your data structure is the same)
def combine_instruction_input(data):
    """Join each record's instruction with its optional input text.

    Args:
        data: Iterable of dict-like records. 'instruction' is required on
            every record; 'input' may be absent, None, or an empty string.

    Returns:
        A list of strings in the same order as ``data``. A record with a
        non-empty 'input' yields the instruction, one space, then the input;
        any other record yields the instruction unchanged.

    Raises:
        KeyError: If a record has no 'instruction' key.
    """
    instructions = []
    for d in data:
        instruction = d['instruction']
        # A missing or None 'input' is handled the same way as an empty one,
        # so records that only carry an instruction do not raise.
        input_text = d.get('input')
        if input_text:
            instruction += ' ' + input_text
        instructions.append(instruction)
    return instructions

# Combined instruction+input text for every training and test record
training_instructions = combine_instruction_input(training_data)
test_instructions = combine_instruction_input(test_data)

# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Compute embeddings
training_embeddings = model.encode(training_instructions)
test_embeddings = model.encode(test_instructions)

# A training example is dropped when its cosine similarity to ANY test
# instruction is at or above this value.
SIMILARITY_THRESHOLD = 0.8

# Indices into training_data of every example to drop
high_similarity_indices = set()

# Scoring one test row at a time keeps the score buffer at one value per
# training example instead of building the full test-by-train matrix.
for test_embedding in test_embeddings:
    scores = cosine_similarity([test_embedding], training_embeddings)[0]
    # Flag every training example at or above the threshold, not only the
    # single best match: otherwise additional near-duplicates of the same
    # test item would survive the filter.
    high_similarity_indices.update(np.flatnonzero(scores >= SIMILARITY_THRESHOLD).tolist())

# Create a new list that excludes the training examples with high similarity
filtered_training_data = [d for i, d in enumerate(training_data) if i not in high_similarity_indices]

# Now, filtered_training_data contains the filtered training examples
with open('filtered_training_data.json', 'w') as f:
    json.dump(filtered_training_data, f, indent=1)

### SAVE HIGH SIMILARITY PAIRS FOR REVIEW

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np
import csv

# Device name passed to SentenceTransformer later in this cell.
device = "cuda" # "mps" and "cpu" also work

# Load training and test JSON data. Both files are decoded as UTF-8 explicitly
# so the result does not depend on the platform's default locale encoding.
with open('MATH_train_data.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

with open('all_test_df.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Function to combine instruction and input (if your data structure is the same)
def combine_instruction_input(data):
    """Return the text to embed for each record: instruction plus optional input.

    Args:
        data: Iterable of dict-like records. Every record needs an
            'instruction' key; 'input' is optional.

    Returns:
        A list of strings, one per record and in the same order. When a
        record's 'input' is missing, None, or empty the string is just the
        instruction; otherwise the input is appended after a single space.

    Raises:
        KeyError: If a record has no 'instruction' key.
    """
    instructions = []
    for d in data:
        instruction = d['instruction']
        # Use .get() so a record without an 'input' key is accepted; a None
        # value is skipped as well because it cannot be concatenated.
        input_text = d.get('input')
        if input_text:
            instruction += ' ' + input_text
        instructions.append(instruction)
    return instructions

# Needed only to create the output directory before the CSV is written.
import os

# Extract training and test instructions
training_instructions = combine_instruction_input(training_data)
test_instructions = combine_instruction_input(test_data)

# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Compute embeddings
training_embeddings = model.encode(training_instructions)
test_embeddings = model.encode(test_instructions)

# A test instruction is reported when its best training match scores at or
# above this cosine similarity.
SIMILARITY_THRESHOLD = 0.8

# Maps a test instruction to its single most similar training instruction.
# Because this is a dict keyed by text, repeated test instructions collapse
# into one entry.
high_similarity_pairs = {}

# For each test instruction, check if it's sufficiently similar to training instructions
for i, test_instruction in enumerate(test_instructions):
    scores = cosine_similarity([test_embeddings[i]], training_embeddings)
    max_score_index = np.argmax(scores)
    max_score = np.max(scores)

    if max_score >= SIMILARITY_THRESHOLD:
        high_similarity_pairs[test_instruction] = training_instructions[max_score_index]

# Print the pairs with high similarity
for test_instruction, train_instruction in high_similarity_pairs.items():
    print(f"Test Instruction: {test_instruction} \nTrain Instruction: {train_instruction}\n")

# Save high similarity pairs to CSV file. The target directory is created
# first so the write cannot fail with FileNotFoundError after the embedding
# work has already been done.
output_path = 'data_leak_checks/similarity_pairs_math.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Test Instruction', 'Train Instruction'])  # Header row
    writer.writerows(high_similarity_pairs.items())

Test Instruction: At breakfast, lunch, and dinner, Joe randomly chooses with equal probabilities either an apple, an orange, or a banana to eat. On a given day, what is the probability that Joe will eat at least two different kinds of fruit?
A: \frac{7}{9}
B: \frac{8}{9}
C: \frac{5}{9}
D: \frac{9}{11} 
Train Instruction: At breakfast, lunch, and dinner, Joe randomly chooses with equal probabilities either an apple, an orange, or a banana to eat. On a given day, what is the probability that Joe will eat at least two different kinds of fruit?

Test Instruction: What is the remainder when $2^{87} +3$ is divided by $7$?
A: 0
B: 1
C: 2
D: 4 
Train Instruction: What is the remainder when $2^{87} +3$ is divided by $7$?

Test Instruction: Jane's quiz scores were 98, 97, 92, 85 and 93. What was her mean score?
A: 92
B: 93
C: 94.5
D: 95 
Train Instruction: Jane's quiz scores were 98, 97, 92, 85 and 93. What was her mean score?

Test Instruction: What is the inverse of $f(x)=4-5x$?
A: \frac{5}{x-

In [2]:
# Number of distinct test instructions whose best training match scored >= 0.8
len(high_similarity_pairs)

80