In [2]:
import sys
sys.path.append("../src")

from preprocessing import preprocess_text
from embeddings import embed_texts
import numpy as np

# Parallel sample data: even positions are English, each followed (odd
# position) by its French counterpart.
sentences = [
    "Machine learning is transforming computer science.",
    "L'apprentissage automatique transforme l'informatique.",
    "I like playing football.",
    "J'aime jouer au football.",
    "The weather is nice today.",
    "Il fait beau aujourd'hui.",
]

# Run every sentence through the project's preprocessing step; the language
# code is picked from the sentence's position (0 -> "en", 1 -> "fr", ...).
# NOTE(review): the recorded output shows French elisions collapsed
# ("lapprentissage", "jaime") - presumably preprocess_text drops apostrophes;
# confirm this is intended.
cleaned = [
    preprocess_text(sentence, lang=("en", "fr")[position % 2])
    for position, sentence in enumerate(sentences)
]

print("Cleaned sentences:")
for cleaned_sentence in cleaned:
    print("-", cleaned_sentence)

# One embedding per cleaned sentence, in the same order as `sentences`
# (assumed from how the results are indexed later - TODO confirm).
embeddings = embed_texts(cleaned)

# Cosine similarity function
def cosine_sim(a, b):
    """Return the cosine similarity between vectors *a* and *b*.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``. Intended for 1-D
    vectors of equal length.

    Args:
        a: First vector (array-like accepted by ``np.dot``).
        b: Second vector (array-like accepted by ``np.dot``).

    Returns:
        The cosine of the angle between the two vectors, or ``0.0`` when
        either vector has zero norm (the angle is undefined in that case).
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-norm vector would turn the division into 0/0, producing NaN
    # and a RuntimeWarning; report "no similarity" instead.
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

# Walk the list two entries at a time: each even index is an English
# sentence and the index right after it is its French translation.
print("\nSimilarity scores:")
pair_starts = range(0, len(sentences), 2)
for en_idx in pair_starts:
    fr_idx = en_idx + 1
    score = cosine_sim(embeddings[en_idx], embeddings[fr_idx])
    print(f"{sentences[en_idx]}  <->  {sentences[fr_idx]}")
    print("  → similarity:", round(score, 4), "\n")


Cleaned sentences:
- machine learning transform computer science
- lapprentissage automatique transformer linformatique
- I like play football
- jaime jouer football
- weather nice today
- faire beau aujourdhui

Similarity scores:
Machine learning is transforming computer science.  <->  L'apprentissage automatique transforme l'informatique.
  → similarity: 0.8939 

I like playing football.  <->  J'aime jouer au football.
  → similarity: 0.5398 

The weather is nice today.  <->  Il fait beau aujourd'hui.
  → similarity: 0.5748 

