In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.stats import pearsonr, spearmanr
import numpy as np




In [5]:
df = pd.read_csv("data\huggingface-2025-10-14.csv")
df.head()

  df = pd.read_csv("data\huggingface-2025-10-14.csv")
  df = pd.read_csv("data\huggingface-2025-10-14.csv")


Unnamed: 0,score,sentence1,sentence2,score_binary
0,5.0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,1.0
1,4.75,A young child is riding a horse.,A child is riding a horse.,1.0
2,5.0,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,1.0
3,2.4,A woman is playing the guitar.,A man is playing guitar.,0.0
4,2.75,A woman is playing the flute.,A man is playing a flute.,0.0


In [6]:
df = df[:1500]

In [7]:
def jaccard_similarity(sentence1, sentence2):
    set1 = set(sentence1.lower().split())
    set2 = set(sentence2.lower().split())
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

In [9]:
# --- Load model ---
model = SentenceTransformer('all-MiniLM-L6-v2')

In [12]:
# --- Prepare storage ---
cosine_scores = []
jaccard_scores = []
euclidean_dist = []

# --- Loop through pairs ---
total_pairs = len(df)
for i, (s1, s2) in enumerate(zip(df['sentence1'], df['sentence2']), start=1):
    # Encode & compute cosine similarity
    emb1 = model.encode([s1], normalize_embeddings=True)
    emb2 = model.encode([s2], normalize_embeddings=True)

    sim_cos = cosine_similarity(emb1, emb2)[0][0]
    cosine_scores.append(sim_cos)

    # Compute Jaccard similarity
    sim_jaccard = jaccard_similarity(s1, s2)
    jaccard_scores.append(sim_jaccard)

    # Compute Euclidean distance and its similarity
    dist_euc = euclidean_distances(emb1, emb2)[0][0]
    euclidean_dist.append(dist_euc)

    # Print progress every 100 iterations or at the end
    if i % 100 == 0 or i == total_pairs:
        print(f"Processed {i}/{total_pairs} sentence pairs...")


min_dist = np.min(euclidean_dist)
max_dist = np.max(euclidean_dist)

euclidean_scores = [1 - (d - min_dist) / (max_dist - min_dist) for d in euclidean_dist]
cosine_scores_norm = [(s + 1) / 2 for s in cosine_scores]

# --- Store results in dataframe ---
df['cosine_score'] = cosine_scores_norm
df['cosine_score_scaled'] = df['cosine_score'] * 5  # Scale to match gold score scale 

df['jaccard_score'] = jaccard_scores
df['jaccard_score_scaled'] = df['jaccard_score'] * 5

df['euclidean_score'] = euclidean_scores
df['euclidean_score_scaled'] = df['euclidean_score'] * 5

# --- Prepare arrays ---
gold_scores = df['score'].values
pred_cosine = df['cosine_score_scaled'].values
pred_jaccard = df['jaccard_score_scaled'].values
pred_euclidean = df['euclidean_score_scaled'].values

# --- Compute correlations ---
pearson_cosine, _ = pearsonr(gold_scores, pred_cosine)
spearman_cosine, _ = spearmanr(gold_scores, pred_cosine)

pearson_jaccard, _ = pearsonr(gold_scores, pred_jaccard)
spearman_jaccard, _ = spearmanr(gold_scores, pred_jaccard)

pearson_euc, _ = pearsonr(gold_scores, pred_euclidean)
spearman_euc, _ = spearmanr(gold_scores, pred_euclidean)

# --- Print results ---
print("\n=== Correlation Results ===")
print(f"✅ Cosine  - Pearson: {pearson_cosine:.4f} | Spearman: {spearman_cosine:.4f}")
print(f"✅ Jaccard - Pearson: {pearson_jaccard:.4f} | Spearman: {spearman_jaccard:.4f}")
print(f"✅ Euclidean - Pearson: {pearson_euc:.4f} | Spearman: {spearman_euc:.4f}")


Processed 100/1500 sentence pairs...
Processed 200/1500 sentence pairs...
Processed 300/1500 sentence pairs...
Processed 400/1500 sentence pairs...
Processed 500/1500 sentence pairs...
Processed 600/1500 sentence pairs...
Processed 700/1500 sentence pairs...
Processed 800/1500 sentence pairs...
Processed 900/1500 sentence pairs...
Processed 1000/1500 sentence pairs...
Processed 1100/1500 sentence pairs...
Processed 1200/1500 sentence pairs...
Processed 1300/1500 sentence pairs...
Processed 1400/1500 sentence pairs...
Processed 1500/1500 sentence pairs...

=== Correlation Results ===
✅ Cosine  - Pearson: 0.8696 | Spearman: 0.8672
✅ Jaccard - Pearson: 0.5960 | Spearman: 0.6017
✅ Euclidean - Pearson: 0.8679 | Spearman: 0.8672
