# Semantic Similarity Matching
## Find semantically similar MWEs across languages

This notebook matches English idioms with foreign-language multiword expressions (MWEs) using multilingual embeddings.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from python.similarity.semantic_matcher import SemanticMatcher
from python.data_processing.idiom_loader import IdiomLoader
from python.config import *

%matplotlib inline

## 1. Load English Idioms

In [None]:
# Read the idiom corpus through IdiomLoader and keep only each entry's
# 'text' field as the list of English idioms.
idioms_data = IdiomLoader.load_idiom_corpus(ENGLISH_IDIOMS_DIR)
english_idioms = []
for entry in idioms_data:
    english_idioms.append(entry['text'])

print(f"Loaded {len(english_idioms)} English idioms")

# Preview at most the first five idioms as a quick sanity check.
print("\nSample idioms:")
sample_idioms = english_idioms[:5]
for idiom in sample_idioms:
    print(f"  - {idiom}")

## 2. Load Foreign MWEs

In [None]:
# Language whose MWEs are matched against the English idioms.
# Change to 'hindi' or another language to match a different MWE file.
LANGUAGE = 'spanish'

# The MWEs come from the per-language CSV under PROCESSED_DATA_DIR
# (written by the previous notebook).
mwes_file = PROCESSED_DATA_DIR / f"{LANGUAGE}_mwes.csv"
mwes_df = pd.read_csv(mwes_file)

# Only the 'mwe' column is needed downstream, as a plain Python list.
foreign_mwes = mwes_df.loc[:, 'mwe'].tolist()

print(f"Loaded {len(foreign_mwes)} {LANGUAGE} MWEs")

# Preview at most the first five MWEs as a quick sanity check.
print("\nSample MWEs:")
sample_mwes = foreign_mwes[:5]
for mwe in sample_mwes:
    print(f"  - {mwe}")

## 3. Initialize Semantic Matcher

In [None]:
# Build the matcher on the embedding model named by EMBEDDING_MODEL
# (expected to be a multilingual model so both languages can be compared).
matcher = SemanticMatcher(
    model_name=EMBEDDING_MODEL,
)

# Echo the model name so the run records which embeddings were used.
print(f"Using model: {EMBEDDING_MODEL}")

## 4. Find Semantic Matches

In [None]:
# Arguments for the matcher, collected in one place so they are easy to tweak.
# NOTE(review): presumably `threshold` is the minimum similarity kept and
# `top_k` caps the candidates per idiom -- confirm against SemanticMatcher.
match_kwargs = {
    'english_idioms': english_idioms,
    'foreign_mwes': foreign_mwes,
    'threshold': SIMILARITY_THRESHOLD,
    'top_k': 5,
}

# Look up similar foreign MWEs for every English idiom.
print("Computing semantic similarities...")
matches = matcher.find_similar_mwes(**match_kwargs)

print(f"\nFound matches for {len(matches)} English idioms")

## 5. Analyze Results

In [None]:
# Show the first ten idioms in `matches` together with their matched MWEs.
print("\nTop matches:\n")

# The label is the same for every row, so compute it once.
language_label = LANGUAGE.capitalize()
first_ten = list(matches.items())[:10]

for rank, (english_idiom, matched_mwes) in enumerate(first_ten, start=1):
    print(f"{rank}. English: '{english_idiom}'")
    for mwe, score in matched_mwes:
        print(f"   - {language_label}: '{mwe}' (similarity: {score:.3f})")
    # Blank line between idioms.
    print()

In [None]:
# Flatten every (mwe, score) pair in `matches` into one list of scores.
all_scores = [
    score
    for matched_mwes in matches.values()
    for _, score in matched_mwes
]

# Histogram of the scores, with the configured threshold drawn as a
# dashed red reference line.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(all_scores, bins=30, edgecolor='black')
ax.set_xlabel('Similarity Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Similarity Scores')
ax.axvline(SIMILARITY_THRESHOLD, color='red', linestyle='--', label='Threshold')
ax.legend()
plt.show()

# Summary statistics; one Series is built once and reused for all three.
score_series = pd.Series(all_scores)
print("\nScore statistics:")
print(f"  Mean: {score_series.mean():.3f}")
print(f"  Median: {score_series.median():.3f}")
print(f"  Std: {score_series.std():.3f}")

## 6. Save Results

In [None]:
# Flatten `matches` into one record per (English idiom, foreign MWE) pair
# so the results can be exported as a table and as JSON.
RESULT_COLUMNS = ['english_idiom', 'foreign_mwe', 'language', 'similarity_score']

results_data = [
    {
        'english_idiom': english_idiom,
        'foreign_mwe': mwe,
        'language': LANGUAGE,
        'similarity_score': score,
    }
    for english_idiom, matched_mwes in matches.items()
    for mwe, score in matched_mwes
]

# Pin the column list explicitly: with zero matches an unnamed DataFrame
# would be written as a header-less CSV that pd.read_csv cannot load back.
results_df = pd.DataFrame(results_data, columns=RESULT_COLUMNS)

# Make sure the output directory exists before writing; on a fresh
# checkout it may not have been created yet.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Save to CSV
output_file = RESULTS_DIR / f"idiom_mwe_matches_{LANGUAGE}.csv"
results_df.to_csv(output_file, index=False)
print(f"Results saved to: {output_file}")

# Save to JSON for web visualization.
# ensure_ascii=False keeps non-ASCII MWE text readable in the file.
# NOTE(review): json.dump only accepts plain Python numbers; if the matcher
# returns NumPy/torch scalars as scores this will raise -- confirm.
json_output = RESULTS_DIR / f"idiom_mwe_matches_{LANGUAGE}.json"
json_payload = {
    'language': LANGUAGE,
    'total_matches': len(results_df),
    'matches': results_data,
}
with open(json_output, 'w', encoding='utf-8') as f:
    json.dump(json_payload, f, ensure_ascii=False, indent=2)
print(f"JSON results saved to: {json_output}")

## 7. Interactive Exploration

In [None]:
# Search for specific idiom
def search_idiom(query: str):
    """Print the stored matches for one English idiom.

    ``query`` is looked up as an exact key in the notebook-level
    ``matches`` mapping. When the key is present, each (MWE, score) pair
    is printed with the score formatted to three decimals; otherwise a
    "No matches found" message is printed. Nothing is returned.
    """
    # Guard clause: bail out early when the idiom is not a key in `matches`.
    if query not in matches:
        print(f"No matches found for '{query}'")
        return

    print(f"Matches for '{query}':\n")
    for candidate, similarity in matches[query]:
        print(f"  - {candidate} (score: {similarity:.3f})")

# Example: search for a specific idiom
# search_idiom("break the ice")