In [121]:
import spacy
import cupy as cp
from sklearn.metrics.pairwise import cosine_similarity
import time
# Load the spaCy "nl_core_news_lg" pipeline once at module level; it is reused
# by every call that parses text in this notebook.
nlp = spacy.load("nl_core_news_lg")

In [122]:
def group_similar_sentences(text, similarity_threshold=0.75):
    """Split ``text`` into sentences and group consecutive similar ones.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Sentences
    whose vector is all zeros are dropped. The remaining sentences are walked
    in document order: a sentence joins the current group when its cosine
    similarity to the *first sentence of that group* is at least
    ``similarity_threshold``; otherwise it closes the group and starts a new
    one (becoming the new group's reference sentence).

    Args:
        text: Input text to segment into sentences.
        similarity_threshold: Minimum cosine similarity to the current group's
            first sentence required for a sentence to stay in that group.

    Returns:
        A list of groups, each a list of stripped sentence strings, in
        document order. If no sentence has a non-zero vector, the flat list
        ``["No valid sentences with embeddings found."]`` is returned instead
        (kept as-is for backward compatibility; note it is a list containing
        one message string, not a list of groups).
    """
    doc = nlp(text)

    valid_sentences = []
    valid_embeddings = []

    # Keep only sentences that actually have a (non-zero) vector.
    for sent in doc.sents:
        embedding = cp.array(sent.vector)  # Convert to CuPy array
        if not cp.all(embedding == 0):
            valid_sentences.append(sent.text.strip())
            valid_embeddings.append(embedding)

    # Bail out before stacking: there is nothing to stack for an empty list.
    if not valid_sentences:
        return ["No valid sentences with embeddings found."]

    # Shape: (number of valid sentences, vector width).
    embedding_matrix = cp.stack(valid_embeddings)

    # Cosine similarity = dot product of L2-normalised rows. All-zero vectors
    # were filtered out above, so no row norm is zero here.
    norms = cp.linalg.norm(embedding_matrix, axis=1, keepdims=True)
    normalized_embeddings = embedding_matrix / norms
    similarities = cp.dot(normalized_embeddings, normalized_embeddings.T)

    grouped_sentences = []
    current_group = [valid_sentences[0]]
    # Index of the first sentence of the current group; every candidate is
    # compared against this sentence, not against sentence 0 of the document.
    anchor_index = 0

    for i in range(1, len(valid_sentences)):
        # anchor_index < i always holds, so the diagonal is never read.
        similarity = float(similarities[anchor_index, i])

        if similarity >= similarity_threshold:
            current_group.append(valid_sentences[i])
        else:
            grouped_sentences.append(current_group)
            current_group = [valid_sentences[i]]
            anchor_index = i

    grouped_sentences.append(current_group)

    return grouped_sentences

In [123]:
# Demo input for the grouping function.
text = "De appel valt van de boom. De appel is groen."

In [124]:

# Time a single grouping run. perf_counter() is a monotonic, high-resolution
# clock, so the measured interval is not affected by system clock adjustments.
start_time = time.perf_counter()
grouped_sentences = group_similar_sentences(text)
end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"Execution Time: {execution_time:.4f} seconds")

# Print every group with a 1-based group number, one sentence per line.
for i, group in enumerate(grouped_sentences, 1):
    print(f"Groep {i}:")
    for sentence in group:
        print(f" - {sentence}")

Execution Time: 0.0087 seconds
Groep 1:
 - De appel valt van de boom.
 - De appel is groen.
