In [5]:
pip install -U sentence-transformers

[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model identified by name; `model` is
# reused by the next cell to encode text.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Two sample words used as a quick sanity check of the embedding pipeline.
words = ["cat","dog"]

In [7]:
# Encode each sample word into an embedding with the model loaded above.
embeddings = model.encode(words)
# Compare the embeddings against themselves: the result is a square matrix of
# pairwise similarity scores (the diagonal is each word compared with itself).
similarities = model.similarity(embeddings, embeddings)
# Bare last expression so the notebook renders the matrix as the cell output.
similarities

tensor([[1.0000, 0.6606],
        [0.6606, 1.0000]])

In [8]:
!pip install nltk scikit-learn numpy sentence-transformers matplotlib seaborn wordcloud


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [10]:
def load_text_file(file_path):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of strings, one per line of the file, with surrounding
        whitespace stripped. Lines that are empty after stripping are skipped.
    """
    kept = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            # Whitespace-only lines strip down to '' and are dropped.
            if stripped:
                kept.append(stripped)
    return kept

In [11]:
# Load the non-blank lines of each dataset file into parallel lists.
# NOTE(review): load_text_file drops blank lines independently per file, so if
# either file contains blank lines, prompts[i] may no longer pair with
# responses[i] — confirm the two files are line-aligned.
prompts = load_text_file("/datasets/data/prompts.txt")
responses = load_text_file("/datasets/data/responses.txt")

In [12]:
import re
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer

def analyze_lengths(promptss, responsess):
    """Print word-count statistics for paired prompts and responses.

    Lengths are measured in tokens as returned by ``word_tokenize``. The
    function prints the shortest and longest response, the mean prompt and
    response lengths, and the mean response/prompt length ratio.

    Args:
        promptss: Sequence of prompt strings.
        responsess: Sequence of response strings, paired by position with
            ``promptss`` (ratios are computed over ``zip`` of the two, so any
            surplus items in the longer sequence are ignored for the ratio).

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``responsess`` is empty (there is no shortest or
            longest response to report).
    """
    lengths_prompts = [len(word_tokenize(p)) for p in promptss]
    lengths_responses = [len(word_tokenize(r)) for r in responsess]

    if not lengths_responses:
        raise ValueError("responsess must contain at least one response")

    min_idx = int(np.argmin(lengths_responses))
    max_idx = int(np.argmax(lengths_responses))

    # Index the `responsess` argument, not the notebook-level `responses`
    # variable: the function must report on the data it was actually given.
    print(f"Réponse la plus courte ({lengths_responses[min_idx]} mots) : {responsess[min_idx]}")
    print(f"Réponse la plus longue ({lengths_responses[max_idx]} mots) : {responsess[max_idx]}")

    # A prompt with zero tokens contributes a ratio of 0 instead of dividing by zero.
    ratios = [lr / lp if lp != 0 else 0 for lp, lr in zip(lengths_prompts, lengths_responses)]

    print(f"Longueur moyenne des prompts : {np.mean(lengths_prompts):.2f} mots")
    print(f"Longueur moyenne des réponses : {np.mean(lengths_responses):.2f} mots")
    print(f"Ratio moyen réponse/prompt : {np.mean(ratios):.2f}")

In [13]:
# Fetch the NLTK 'punkt' tokenizer data; presumably what word_tokenize (used by
# analyze_lengths and compute_tfidf) relies on, so run this before calling them.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
# Print the length statistics for the prompt/response lists loaded earlier.
analyze_lengths(prompts, responses)

Réponse la plus courte (1 mots) : -Oman
Réponse la plus longue (514 mots) : (321983, """The night sky was a quilt of twinkling lights.  ;  The night sky was a canopy of diamond dust.  ;  The night sky was a tapestry of bright shining stars.  ;  The night sky was a sea of shimmering lights.  ;  The night sky was a velvet cloak of stars.  ;  The night sky was a carpet of glittering jewels.  ;   The night sky was a sky of diamond stars.  ;  The night sky was a sky of shining stars.  ;  The night sky was a bed of diamond stars.  ;  The night sky was a field of stars.  ;  The night sky was a blanket of stars.  ;  The night sky was a galaxy of sparkling stars.  ;  The night sky was a glittering blanket of stars.  ;  The night sky was a field of diamond stars.  ;  The night sky was a cloak of glowing stars.  ;  The night sky was a sky of sparkling stars.  ;  The night sky was a night of sparkling stars.  ;  The night sky was a million specks of starlight.  ;  The night sky was a sky of lumine

In [14]:
def clean_text(text):
    """Normalise a string for bag-of-words processing.

    Args:
        text: Input string.

    Returns:
        The lower-cased string in which every run of non-word characters
        (anything other than letters, digits and underscore) is replaced by a
        single space. Leading/trailing runs therefore become a single space.
    """
    lowered = text.lower()
    # Replace special characters (punctuation, whitespace runs, symbols) with one space.
    return re.sub(r'\W+', ' ', lowered)


In [15]:
# TF-IDF computation
def compute_tfidf(prompts, responses):
    """Fit a TF-IDF model on the cleaned prompts and responses together.

    Each text is passed through ``clean_text``, tokenised with
    ``word_tokenize``, stripped of English stop words and re-joined with
    single spaces before vectorisation.

    Args:
        prompts: Iterable of prompt strings.
        responses: Iterable of response strings.

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple. The matrix rows are ordered
        with all prompts first, followed by all responses; ``vectorizer`` is
        the fitted ``TfidfVectorizer``.
    """
    english_stopwords = set(stopwords.words('english'))

    def _preprocess(raw_text):
        # Clean, tokenise, then keep only tokens that are not stop words.
        kept_tokens = []
        for token in word_tokenize(clean_text(raw_text)):
            if token.lower() not in english_stopwords:
                kept_tokens.append(token)
        return ' '.join(kept_tokens)

    # Prompts first, responses second: row order of the resulting matrix.
    corpus = [_preprocess(p) for p in prompts]
    corpus.extend([_preprocess(r) for r in responses])

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix, vectorizer


In [16]:
# Fetch the NLTK stop-word lists; compute_tfidf reads the 'english' list via
# stopwords.words, so this must have run before compute_tfidf is called.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:

# Semantic similarity computation
def _paired_cosine_similarity(left, right):
    """Return the cosine similarity of each row of `left` with the same row of `right`."""
    left = np.asarray(left)
    right = np.asarray(right)
    # Row-wise dot products: O(n * dim) work instead of an n x n matrix.
    dots = np.einsum('ij,ij->i', left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    # Rows with a zero-norm vector get similarity 0 rather than NaN.
    return np.divide(dots, norms, out=np.zeros_like(dots, dtype=float), where=norms != 0)


def compute_semantic_similarity(prompts, responses, model=None):
    """Print the best and worst matching prompt/response pairs.

    Every prompt is compared only with the response at the same position,
    using the cosine similarity of their sentence embeddings. The similarity
    is computed pair by pair, so memory use grows linearly with the number of
    pairs (no full prompts-by-responses matrix is built).

    Args:
        prompts: Sequence of prompt strings.
        responses: Sequence of response strings, paired by position with
            ``prompts``.
        model: Optional object exposing ``encode(texts, convert_to_tensor=True)``
            whose result has a ``.cpu()`` method. Defaults to a freshly loaded
            ``SentenceTransformer('all-MiniLM-L6-v2')``; pass an already
            loaded model to avoid reloading it on every call.

    Returns:
        None. The results are printed.

    Raises:
        ValueError: If there is no prompt/response pair to compare.
    """
    n_pairs = min(len(prompts), len(responses))
    if n_pairs == 0:
        raise ValueError("prompts and responses must each contain at least one item")
    if len(prompts) != len(responses):
        # Only the common prefix can be compared pair by pair; say so
        # explicitly instead of silently ignoring the surplus items.
        import warnings
        warnings.warn(
            f"prompts ({len(prompts)}) and responses ({len(responses)}) differ in length; "
            f"only the first {n_pairs} pairs are compared",
            stacklevel=2,
        )
        prompts = prompts[:n_pairs]
        responses = responses[:n_pairs]

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')  # Small, fast and effective model

    embeddings_prompts = model.encode(prompts, convert_to_tensor=True)
    embeddings_responses = model.encode(responses, convert_to_tensor=True)

    # Move to CPU before handing the embeddings to NumPy.
    pair_similarities = _paired_cosine_similarity(
        embeddings_prompts.cpu(), embeddings_responses.cpu()
    )

    max_sim_idx = int(np.argmax(pair_similarities))
    min_sim_idx = int(np.argmin(pair_similarities))

    print(f"Meilleure correspondance prompt-réponse (sim = {pair_similarities[max_sim_idx]:.2f}):")
    print(f" Prompt: {prompts[max_sim_idx]}")
    print(f"  Réponse: {responses[max_sim_idx]}\n")

    print(f"❌ Pire correspondance prompt-réponse (sim = {pair_similarities[min_sim_idx]:.2f}):")
    print(f"   🔴 Prompt: {prompts[min_sim_idx]}")
    print(f"   🔴 Réponse: {responses[min_sim_idx]}\n")


# Fit TF-IDF on the loaded prompts and responses; the returned matrix and
# vectorizer are not used further in the visible part of the notebook.
tfidf_matrix, vectorizer = compute_tfidf(prompts, responses)

# Print the best and worst prompt/response pairs by embedding similarity.
compute_semantic_similarity(prompts, responses)
