## KNN

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.neighbors import NearestNeighbors
import torch

# Keep only prompts that have no near-duplicate in the file: a prompt survives
# when its closest *other* prompt is below `similarity_threshold` in cosine
# similarity. Note that every member of a near-duplicate group is dropped,
# not just the later copies.
input_csv = "midjourney_prompts.csv"
output_csv = "filtered_prompts.csv"
# Cosine similarity at or above which two prompts count as near-duplicates.
similarity_threshold = 0.9

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv(input_csv)
# The prompt text is read from the first CSV column, whatever it is named.
prompts = df.iloc[:, 0].tolist()

model = SentenceTransformer("all-MiniLM-L6-v2").to(device)
embeddings = model.encode(prompts)

knn_model = NearestNeighbors(n_neighbors=2, metric='cosine', algorithm='auto', n_jobs=-1)
knn_model.fit(embeddings)

# The query set is the fitted set, so every point matches itself at distance 0.
# Asking for 2 neighbours therefore leaves the distance to the closest other
# prompt in column 1 (which is also 0 when an exact duplicate exists).
distances, _ = knn_model.kneighbors(embeddings)

# metric='cosine' returns cosine *distance* (1 - similarity). The cut-off must
# be expressed as a distance too; comparing the distance against 0.9 directly
# would only keep prompts whose nearest neighbour has similarity below 0.1.
not_similar_prompts = distances[:, 1] > (1 - similarity_threshold)

filtered_prompts = np.array(prompts)[not_similar_prompts]

# Write the surviving prompts under the original column header.
filtered_prompts_df = pd.DataFrame(filtered_prompts, columns=[df.columns[0]])
filtered_prompts_df.to_csv(output_csv, index=False)

## FAISS KNN (FASTER)

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import faiss # FAISS-GPU BEST

# Drop near-duplicate prompts using a FAISS exact nearest-neighbour search.
# A row is kept when the cosine distance to its closest other prompt exceeds
# `threshold`; every member of a near-duplicate group is dropped.
input_csv = "midjourney_prompts.csv"
output_csv = "filtered_image_data2.csv"
# Minimum cosine distance (1 - cosine similarity) to the nearest other prompt
# for a row to be kept; 0.1 corresponds to a similarity cut-off of 0.9.
threshold = 0.1

df = pd.read_csv(input_csv)
prompts = df['text'].tolist()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SentenceTransformer("all-MiniLM-L6-v2").to(device)

embeddings = model.encode(prompts)
# FAISS expects a C-contiguous float32 matrix.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
# L2-normalise every row so that the inner product of two rows equals their
# cosine similarity. The clamp avoids dividing by zero for an all-zero row.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / np.maximum(norms, 1e-12)

# Inner-product index: on unit vectors the returned score is the cosine
# similarity. (IndexFlatL2 returns *squared* L2 distances, for which
# "1 - score" is not a cosine distance.)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Search the indexed vectors against themselves. Results are sorted by
# decreasing similarity, so column 0 is the self-match (or an exact duplicate)
# and column 1 is the closest other prompt.
similarities, neighbors = index.search(embeddings, 2)

# Keep a row only if the cosine distance to its nearest other prompt is
# greater than the threshold.
not_sim_prompts = (1 - similarities[:, 1]) > threshold

filtered_image_data = df.loc[not_sim_prompts]

filtered_image_data.to_csv(output_csv, index=False)

## SKLEARN PAIRWISE COSINE SIMILARITY (SLOWER) -- USE WITH CUDA FOR THE EMBEDDING STEP

In [None]:
import pandas as pd
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Drop near-duplicate prompts using the full pairwise cosine-similarity
# matrix. For every pair with similarity >= 0.9 the later row is removed, so
# the first occurrence of each near-duplicate group is kept.
# The matrix is n x n, so memory grows quadratically with the number of prompts.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv('midjourney_prompts.csv')

model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

prompts = df['text'].tolist()
embeddings = model.encode(prompts)

cosine_sim_matrix = cosine_similarity(embeddings)

# Zero the diagonal so a prompt is never reported as similar to itself.
np.fill_diagonal(cosine_sim_matrix, 0)

# Vectorised form of "for i < j: if sim[i][j] >= 0.9, drop j":
# the strict upper triangle (k=1) holds exactly the pairs with i < j, and
# any(axis=0) flags column j when at least one earlier row i is that similar.
is_similar = cosine_sim_matrix >= 0.9
is_duplicate = np.triu(is_similar, k=1).any(axis=0)

# Positional indices of the dropped rows, kept as a set as before.
filtered_indices = set(np.flatnonzero(is_duplicate).tolist())

# The boolean mask is positional, matching the row order read from the CSV.
filtered_df = df.loc[~is_duplicate]

filtered_df.to_csv('filtered_midjourney.csv', index=False)