## Filter prompts using sentence_transformers embeddings and pytorch_cos_sim

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

input_csv = "midjourney_prompts_filtered.csv"
output_csv = "filtered_prompts.csv"

# Two prompts count as near-duplicates when the cosine similarity of their
# embeddings is strictly greater than this value.
SIMILARITY_THRESHOLD = 0.8


def select_distinct_prompts(
    similarity: np.ndarray,
    threshold: float = SIMILARITY_THRESHOLD,
    *,
    keep_first: bool = True,
) -> np.ndarray:
    """Return a boolean mask selecting the prompts to keep.

    Args:
        similarity: Square ``(n, n)`` matrix of pairwise similarity scores.
            The diagonal (self-similarity) is ignored. The input is not
            modified.
        threshold: Entries strictly greater than this value are treated as
            "similar".
        keep_first: When true, prompts are visited in order and a prompt is
            dropped only if it is similar to an earlier prompt that was kept,
            so each group of near-duplicates leaves one representative
            behind. When false, every prompt that is similar to any other
            prompt is dropped, i.e. whole groups disappear.

    Returns:
        A boolean array of length ``n``; ``True`` marks a prompt to keep.

    Raises:
        ValueError: If ``similarity`` is non-empty and not a square 2-D
            matrix.
    """
    sim = np.asarray(similarity)
    if sim.size == 0:
        return np.zeros(0, dtype=bool)
    if sim.ndim != 2 or sim.shape[0] != sim.shape[1]:
        raise ValueError(
            f"similarity must be a square 2-D matrix, got shape {sim.shape}"
        )

    # The comparison allocates a fresh array, so clearing its diagonal does
    # not touch the caller's matrix and no separate identity mask is needed.
    too_close = sim > threshold
    np.fill_diagonal(too_close, False)

    if not keep_first:
        return ~too_close.any(axis=0)

    keep = np.ones(len(sim), dtype=bool)
    for i, row in enumerate(too_close):
        # Only a prompt that survives may knock out later ones; a prompt
        # that was itself dropped must not remove its neighbours.
        if keep[i]:
            keep[i + 1:] &= ~row[i + 1:]
    return keep


# The guard keeps the helper importable without running the pipeline; run as
# a script or as a notebook cell, the names below are still module globals.
if __name__ == "__main__":
    df = pd.read_csv(input_csv)
    prompts = df.iloc[:, 0].tolist()

    if prompts:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = model.encode(prompts)
        similarity_matrix = util.pytorch_cos_sim(embeddings, embeddings).numpy()
    else:
        # Nothing to compare, so skip loading the model altogether.
        similarity_matrix = np.zeros((0, 0), dtype=np.float32)

    # NOTE(review): earlier revisions discarded every member of a group of
    # similar prompts; pass keep_first=False to get that behaviour back.
    not_similar_prompts = select_distinct_prompts(similarity_matrix)

    filtered_prompts = np.array(prompts)[not_similar_prompts]

    filtered_prompts_df = pd.DataFrame(filtered_prompts, columns=[df.columns[0]])
    filtered_prompts_df.to_csv(output_csv, index=False)
