In [4]:
from google.colab import files
import io
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import umap
import plotly.express as px
# Ask the user to upload a file through Colab; the next cell looks the result up
# by filename and reads its value as raw bytes.
uploaded = files.upload()

Saving es7k5_with_en.csv to es7k5_with_en.csv


In [5]:
# Guard against a cancelled/empty upload: without this check, `next(iter(...))`
# fails with a bare StopIteration that gives no hint about the cause.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and choose a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
INPUT_CSV = next(iter(uploaded))
print(f"\nSuccessfully uploaded: {INPUT_CSV}")

# The upload mapping is indexed by filename and its value is wrapped in a
# bytes buffer so pandas can parse it as a CSV.
df = pd.read_csv(io.BytesIO(uploaded[INPUT_CSV]))
print("DataFrame loaded successfully.")


Successfully uploaded: es7k5_with_en.csv
DataFrame loaded successfully.


In [6]:
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

print(f"\nLoading model: {MODEL_NAME}...")
# No explicit device: sentence-transformers then uses a GPU when one is available
# and falls back to CPU otherwise. The previous hard-coded device="cuda" made
# this cell fail on a runtime without a GPU.
model = SentenceTransformer(MODEL_NAME)
print("Model loaded successfully.")

# Prepare text data from the DataFrame.
# Missing values become empty strings so every row still gets an embedding and
# the embedding arrays stay aligned with the DataFrame rows.
# NOTE(review): a row whose text is missing in both columns is embedded from the
# same empty string twice, so it will presumably score ~1.0 similarity
# downstream -- consider filtering such rows before interpreting the statistics.
texts_es = df["content"].fillna("").astype(str).tolist()
texts_en = df["content_en"].fillna("").astype(str).tolist()

# Generate embeddings (one vector per row, in row order).
print("\nGenerating embeddings with MiniLM for Spanish texts...")
emb_es = model.encode(texts_es, batch_size=32, show_progress_bar=True)

print("Generating embeddings with MiniLM for their English translations...")
emb_en = model.encode(texts_en, batch_size=32, show_progress_bar=True)



Loading model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2...


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded successfully.

Generating embeddings with MiniLM for Spanish texts...


Batches:   0%|          | 0/198 [00:00<?, ?it/s]

Generating embeddings with MiniLM for their English translations...


Batches:   0%|          | 0/198 [00:00<?, ?it/s]

In [7]:
# Row-wise cosine similarity between each Spanish embedding and the embedding
# of its English translation (row i of emb_es is paired with row i of emb_en).
numerator = np.sum(emb_es * emb_en, axis=1)
denominator = np.linalg.norm(emb_es, axis=1) * np.linalg.norm(emb_en, axis=1)
# The small epsilon keeps the division finite if an embedding has zero norm.
df["similarity_minilm"] = numerator / (denominator + 1e-9)

# Store each embedding as a plain Python list, one list per row.
df["embedding_es_minilm"] = [v.tolist() for v in emb_es]
df["embedding_en_minilm"] = [v.tolist() for v in emb_en]

# --- Display Key Quantitative Results for MiniLM ---
sim_minilm = df["similarity_minilm"]
print("\n--- Cosine Similarity Summary (MiniLM on Spanish) ---")
print(sim_minilm.describe())

# Count the pairs whose similarity falls below the chosen cut-off.
threshold = 0.75
count_below_threshold = (sim_minilm < threshold).sum()
print(f"\nTotal rows with similarity < {threshold}: {count_below_threshold} / {len(sim_minilm)}")
print("---------------------------------------------------")

# Save the complete DataFrame to a Parquet file.
# Strip only the final extension: the previous `split('.')[0]` cut the name at
# the FIRST dot, so an upload such as "data.v2.csv" would have been saved as
# "data_with_minilm_embeddings.parquet".
OUTPUT_FILENAME = f"{INPUT_CSV.rsplit('.', 1)[0]}_with_minilm_embeddings.parquet"
print(f"\nSaving results to {OUTPUT_FILENAME}...")
df.to_parquet(OUTPUT_FILENAME, index=False)




--- Cosine Similarity Summary (MiniLM on Spanish) ---
count    6336.000000
mean        0.832475
std         0.110748
min         0.187043
25%         0.771517
50%         0.861808
75%         0.917192
max         1.000000
Name: similarity_minilm, dtype: float64

Total rows with similarity < 0.75: 1311 / 6336
---------------------------------------------------

Saving results to es7k5_with_en_with_minilm_embeddings.parquet...


In [8]:
# Project both languages with a single UMAP fit so they share one 2-D space:
# Spanish rows are stacked first, English rows second.
all_emb = np.concatenate((emb_es, emb_en))

reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
reduced = reducer.fit_transform(all_emb)

# The first n projected rows belong to Spanish, the remaining rows to English.
n = len(df)
es_2d, en_2d = reduced[:n], reduced[n:]
print("UMAP process complete.")


# Build a long-format frame for plotting: every source row appears twice
# (once per language) and carries the same metadata both times.
print("Preparing data for plotting...")
plot_df = pd.DataFrame({
    'umap_x': reduced[:, 0],
    'umap_y': reduced[:, 1],
    'language': ['Spanish'] * n + ['English'] * n,
    'url': df['url'].tolist() * 2,
    'title': df['title'].tolist() * 2,
    'similarity_minilm': df['similarity_minilm'].tolist() * 2,
})


# Interactive scatter plot, coloured by language.
print("Generating interactive plot...")
# Hover shows the metadata only; the coordinates and the language are hidden.
hover_fields = {
    'url': True,
    'title': True,
    'similarity_minilm': ':.4f',
    'language': False,
    'umap_x': False,
    'umap_y': False,
}
fig = px.scatter(
    plot_df,
    x='umap_x',
    y='umap_y',
    color='language',
    opacity=0.2,
    hover_data=hover_fields,
    title="Interactive UMAP of Spanish & English Embeddings (MiniLM)",
)

layout_options = {
    'template': 'plotly_white',
    'width': 900,
    'height': 700,
    'legend_title_text': 'Language',
    'xaxis_title': "UMAP Dimension 1",
    'yaxis_title': "UMAP Dimension 2",
}
fig.update_layout(**layout_options)
fig.update_traces(marker={'size': 5})
fig.show()

  warn(


UMAP process complete.
Preparing data for plotting...
Generating interactive plot...


In [9]:
# Second upload prompt, this time for the Chinese dataset; the next cell looks
# the result up by filename and reads its value as raw bytes.
uploaded_zh = files.upload()

Saving zh7k5_with_en.csv to zh7k5_with_en (1).csv


In [10]:
# Guard against a cancelled/empty upload: without this check, `next(iter(...))`
# fails with a bare StopIteration that gives no hint about the cause.
if not uploaded_zh:
    raise ValueError("No file was uploaded; re-run the upload cell and choose a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
INPUT_CSV_ZH = next(iter(uploaded_zh))
print(f"\nSuccessfully uploaded: {INPUT_CSV_ZH}")
df_zh = pd.read_csv(io.BytesIO(uploaded_zh[INPUT_CSV_ZH]))
print("Chinese DataFrame loaded successfully.")

# Missing values become empty strings so every row still gets an embedding and
# the embedding arrays stay aligned with the DataFrame rows.
texts_zh = df_zh["content"].fillna("").astype(str).tolist()
texts_en_from_zh = df_zh["content_en"].fillna("").astype(str).tolist()

# Generate embeddings with the model loaded in an earlier cell.
print("\nGenerating embeddings with MiniLM for Chinese texts...")
emb_zh = model.encode(texts_zh, batch_size=32, show_progress_bar=True)

print("Generating embeddings with MiniLM for their English translations...")
emb_en_from_zh = model.encode(texts_en_from_zh, batch_size=32, show_progress_bar=True)



Successfully uploaded: zh7k5_with_en (1).csv
Chinese DataFrame loaded successfully.

Generating embeddings with MiniLM for Chinese texts...


Batches:   0%|          | 0/140 [00:00<?, ?it/s]

Generating embeddings with MiniLM for their English translations...


Batches:   0%|          | 0/140 [00:00<?, ?it/s]

In [11]:
# Row-wise cosine similarity between each Chinese embedding and the embedding
# of its English translation (row i is paired with row i).
numerator_zh = np.sum(emb_zh * emb_en_from_zh, axis=1)
denominator_zh = np.linalg.norm(emb_zh, axis=1) * np.linalg.norm(emb_en_from_zh, axis=1)
# The small epsilon keeps the division finite if an embedding has zero norm.
df_zh["similarity_minilm"] = numerator_zh / (denominator_zh + 1e-9)

# Store each embedding as a plain Python list, one list per row.
df_zh["embedding_zh_minilm"] = [v.tolist() for v in emb_zh]
df_zh["embedding_en_minilm"] = [v.tolist() for v in emb_en_from_zh]

# --- Display Key Quantitative Results for MiniLM on Chinese Data ---
# NOTE(review): the recorded run of this cell reports a mean similarity of about
# 0.18, versus about 0.83 for the Spanish data. Worth confirming that `content`
# and `content_en` are row-aligned in the Chinese CSV before trusting the numbers.
sim_zh = df_zh["similarity_minilm"]
print("\n--- Cosine Similarity Summary (MiniLM on Chinese) ---")
print(sim_zh.describe())

# Count the pairs whose similarity falls below the chosen cut-off.
threshold = 0.75
count_below_threshold_zh = (sim_zh < threshold).sum()
print(f"\nTotal rows with similarity < {threshold}: {count_below_threshold_zh} / {len(sim_zh)}")
print("-----------------------------------------------------")

# Save the complete DataFrame to a Parquet file.
# Strip only the final extension: the previous `split('.')[0]` cut the name at
# the FIRST dot, so an upload such as "data.v2.csv" would have been saved as
# "data_with_minilm_embeddings.parquet".
OUTPUT_FILENAME_ZH = f"{INPUT_CSV_ZH.rsplit('.', 1)[0]}_with_minilm_embeddings.parquet"
print(f"\nSaving results to {OUTPUT_FILENAME_ZH}...")
df_zh.to_parquet(OUTPUT_FILENAME_ZH, index=False)


--- Cosine Similarity Summary (MiniLM on Chinese) ---
count    4458.000000
mean        0.178213
std         0.116203
min        -0.036071
25%         0.114589
50%         0.162510
75%         0.214170
max         0.943906
Name: similarity_minilm, dtype: float64

Total rows with similarity < 0.75: 4405 / 4458
-----------------------------------------------------

Saving results to zh7k5_with_en (1)_with_minilm_embeddings.parquet...


In [12]:
# Project both languages with a single UMAP fit so they share one 2-D space:
# Chinese rows are stacked first, English rows second.
all_emb_zh = np.concatenate((emb_zh, emb_en_from_zh))

reducer_zh = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
reduced_zh = reducer_zh.fit_transform(all_emb_zh)

# The first n_zh projected rows belong to Chinese, the remaining rows to English.
n_zh = len(df_zh)
zh_2d, en_from_zh_2d = reduced_zh[:n_zh], reduced_zh[n_zh:]
print("UMAP process complete.")


# Build a long-format frame for plotting: every source row appears twice
# (once per language) and carries the same metadata both times.
print("Preparing data for plotting...")
plot_df_zh = pd.DataFrame({
    'umap_x': reduced_zh[:, 0],
    'umap_y': reduced_zh[:, 1],
    'language': ['Chinese'] * n_zh + ['English'] * n_zh,
    'url': df_zh['url'].tolist() * 2,
    'title': df_zh['title'].tolist() * 2,
    'similarity_minilm': df_zh['similarity_minilm'].tolist() * 2,
})


# Interactive scatter plot, coloured by language.
print("Generating interactive plot...")
# Hover shows the metadata only; the coordinates and the language are hidden.
hover_fields_zh = {
    'url': True,
    'title': True,
    'similarity_minilm': ':.4f',
    'language': False,
    'umap_x': False,
    'umap_y': False,
}
fig_zh = px.scatter(
    plot_df_zh,
    x='umap_x',
    y='umap_y',
    color='language',
    opacity=0.2,
    hover_data=hover_fields_zh,
    title="Interactive UMAP of Chinese & English Embeddings (MiniLM)",
)

layout_options_zh = {
    'template': 'plotly_white',
    'width': 900,
    'height': 700,
    'legend_title_text': 'Language',
    'xaxis_title': "UMAP Dimension 1",
    'yaxis_title': "UMAP Dimension 2",
}
fig_zh.update_layout(**layout_options_zh)
fig_zh.update_traces(marker={'size': 5})
fig_zh.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP process complete.
Preparing data for plotting...
Generating interactive plot...
