In [2]:
from google.colab import files
import io
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import umap
import plotly.express as px
# Open Colab's file-upload prompt. Later cells iterate `uploaded.keys()` for the
# filename and wrap `uploaded[name]` in io.BytesIO, i.e. they treat the result as
# a mapping from uploaded filename to that file's raw bytes.
uploaded = files.upload()

Saving es7k5_with_en.csv to es7k5_with_en.csv


In [3]:
# An empty upload mapping (e.g. the upload prompt was dismissed without choosing
# a file) would otherwise surface as a bare StopIteration from next(); fail with
# an actionable message instead.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run the upload cell and select a CSV file.")

# Only the first uploaded file is used; any additional uploads are ignored.
INPUT_CSV = next(iter(uploaded))
print(f"\nSuccessfully uploaded: {INPUT_CSV}")

# Parse the CSV directly from the in-memory bytes returned by the upload.
df = pd.read_csv(io.BytesIO(uploaded[INPUT_CSV]))
# Mean-pooled sentence embeddings from a transformer encoder
# (tokenizer, model) pairs already loaded in this session, keyed by
# (model_name, device string), so repeated calls do not reload the weights.
_ENCODER_CACHE = {}


def _load_encoder(model_name, device):
    """Return a cached (tokenizer, model) pair for `model_name` on `device`, loading it on first use."""
    key = (model_name, str(device))
    if key not in _ENCODER_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).to(device)
        model.eval()  # inference only: disables dropout
        _ENCODER_CACHE[key] = (tokenizer, model)
    return _ENCODER_CACHE[key]


def get_mean_embeddings(texts, model_name="bert-base-multilingual-cased", batch_size=32, max_length=512):
    """Embed each text as the attention-mask-weighted mean of the encoder's last hidden states.

    Args:
        texts: Sequence of strings (list, tuple, pandas Series, ...). It is
            materialised into a list before batching.
        model_name: Hugging Face model identifier passed to AutoTokenizer / AutoModel.
        batch_size: Number of texts encoded per forward pass; must be >= 1.
        max_length: Token limit per text; longer inputs are truncated by the tokenizer.

    Returns:
        A 2-D numpy array with one row per input text. For empty input the
        result has shape (0, hidden_size) so that downstream axis=1 reductions
        and concatenations still work.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # The tokenizer is called with a plain list of strings; slices of other
    # sequence types (e.g. a pandas Series) are not guaranteed to be accepted.
    texts = list(texts)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, model = _load_encoder(model_name, device)

    batch_embeddings = []

    print(f"\nGenerating Mean-Pooled embeddings with {model_name}...")
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
            batch_texts = texts[start:start + batch_size]
            encoded = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
            input_ids = encoded["input_ids"].to(device)
            attention_mask = encoded["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden = outputs.last_hidden_state

            # Mean pooling: zero out padded positions via the attention mask,
            # sum over the sequence axis, and divide by the number of attended
            # tokens. Every position the tokenizer marks with attention 1
            # contributes to the mean.
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
            sum_embeddings = torch.sum(last_hidden * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            # clamp guards against division by zero for an all-zero mask row.
            mean_embed = sum_embeddings / torch.clamp(sum_mask, min=1e-9)
            batch_embeddings.append(mean_embed.cpu().numpy())

    if not batch_embeddings:
        # No texts: keep the result 2-D. float32 is assumed to match the dtype
        # the model produces for non-empty input -- TODO confirm for other models.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    return np.concatenate(batch_embeddings, axis=0)




Successfully uploaded: es7k5_with_en.csv


In [4]:
# Missing cells become "" so the tokenizer always receives a string.
texts_es = df["content"].fillna("").astype(str).tolist()
texts_en = df["content_en"].fillna("").astype(str).tolist()

# Blank texts carry no content, so their embeddings (and any similarity computed
# from them) are not meaningful. Report how many rows are affected instead of
# letting them blend silently into the summary statistics and the plot.
n_blank_es = sum(not text.strip() for text in texts_es)
n_blank_en = sum(not text.strip() for text in texts_en)
if n_blank_es or n_blank_en:
    print(
        f"Warning: {n_blank_es} blank Spanish and {n_blank_en} blank English texts "
        f"out of {len(df)} rows; their embeddings and similarities are not meaningful."
    )

# Get Spanish embeddings
emb_es_mean = get_mean_embeddings(texts_es)

# Get English embeddings
emb_en_mean = get_mean_embeddings(texts_en)

# Add embeddings to DataFrame (one Python list of floats per row)
df["embedding_es_mean"] = [v.tolist() for v in emb_es_mean]
df["embedding_en_mean"] = [v.tolist() for v in emb_en_mean]

# Calculate cosine similarity
def compute_cosine_sim(a, b):
    """Row-wise cosine similarity between two equally shaped 2-D arrays.

    Row i of the result compares a[i] with b[i]. A small epsilon is added to
    the denominator so that zero-norm rows yield 0 instead of a division error.
    """
    numerator = (a * b).sum(axis=1)
    norm_a = np.linalg.norm(a, axis=1)
    norm_b = np.linalg.norm(b, axis=1)
    denominator = norm_a * norm_b + 1e-9
    return numerator / denominator

# Pairwise (row i vs. row i) similarity between each Spanish text and its English counterpart.
df["similarity_mean"] = compute_cosine_sim(emb_es_mean, emb_en_mean)

# --- Display Key Quantitative Results ---
print("\n--- Cosine Similarity Summary (mBERT Mean Pooling) ---")
print(df["similarity_mean"].describe())

# Save the complete DataFrame to a Parquet file.
# rsplit on the last dot removes only the extension, so an input such as
# "data.v2.csv" keeps its full stem ("data.v2") instead of being cut to "data".
OUTPUT_FILENAME = f"{INPUT_CSV.rsplit('.', 1)[0]}_with_mbert_mean_embeddings.parquet"
print(f"\nSaving results to {OUTPUT_FILENAME}...")
df.to_parquet(OUTPUT_FILENAME, index=False)
print("Save complete.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]


Generating Mean-Pooled embeddings with bert-base-multilingual-cased...


Embedding: 100%|██████████| 198/198 [03:09<00:00,  1.04it/s]



Generating Mean-Pooled embeddings with bert-base-multilingual-cased...


Embedding: 100%|██████████| 198/198 [03:04<00:00,  1.07it/s]



--- Cosine Similarity Summary (mBERT Mean Pooling) ---
count    6336.000000
mean        0.788277
std         0.074147
min         0.370406
25%         0.735378
50%         0.778449
75%         0.854863
max         1.000000
Name: similarity_mean, dtype: float64

Saving results to es7k5_with_en_with_mbert_mean_embeddings.parquet...
Save complete.


In [5]:

# Project Spanish and English embeddings with a single UMAP fit so both
# languages share the same 2-D coordinate system.
n = len(df)
all_emb_mean = np.vstack((emb_es_mean, emb_en_mean))

reducer_mean = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
reduced_mean = reducer_mean.fit_transform(all_emb_mean)

# The first n rows are the Spanish points, the remainder the English points.
es_2d_mean, en_2d_mean = reduced_mean[:n], reduced_mean[n:]

# One row per point; the per-article metadata is repeated once per language.
plot_df_mean = pd.DataFrame({
    'umap_x': np.hstack((es_2d_mean[:, 0], en_2d_mean[:, 0])),
    'umap_y': np.hstack((es_2d_mean[:, 1], en_2d_mean[:, 1])),
    'language': ['Spanish'] * n + ['English'] * n,
    'url': df['url'].tolist() * 2,
    'title': df['title'].tolist() * 2,
    'similarity_mean': df['similarity_mean'].tolist() * 2,
})

fig_mean = px.scatter(
    plot_df_mean,
    x='umap_x',
    y='umap_y',
    color='language',
    opacity=0.2,
    hover_data={'url': True, 'title': True, 'similarity_mean': ':.4f'},
    title="Interactive UMAP of Spanish & English Embeddings (mBERT - Mean Pooling)",
)
fig_mean.update_layout(template='plotly_white', width=900, height=700)
fig_mean.show()

  warn(


In [6]:
# Open Colab's file-upload prompt for the Chinese dataset. The next cell takes
# the first key as the filename and wraps `uploaded_zh[name]` in io.BytesIO,
# i.e. it treats the result as a mapping from filename to raw file bytes.
uploaded_zh = files.upload()

Saving zh7k5_with_en.csv to zh7k5_with_en.csv


In [7]:
# An empty upload mapping (e.g. the upload prompt was dismissed without choosing
# a file) would otherwise surface as a bare StopIteration from next(); fail with
# an actionable message instead.
if not uploaded_zh:
    raise RuntimeError("No file was uploaded. Re-run the upload cell and select a CSV file.")

# Only the first uploaded file is used; any additional uploads are ignored.
INPUT_CSV_ZH = next(iter(uploaded_zh))
print(f"\nSuccessfully uploaded: {INPUT_CSV_ZH}")

# Parse the CSV directly from the in-memory bytes returned by the upload.
df_zh = pd.read_csv(io.BytesIO(uploaded_zh[INPUT_CSV_ZH]))


Successfully uploaded: zh7k5_with_en.csv


In [8]:
# NOTE: this cell re-defines (and therefore shadows) the `get_mean_embeddings`
# declared in an earlier cell of this notebook; it is kept self-contained so the
# Chinese section can be run on its own.

# (tokenizer, model) pairs already loaded in this session, keyed by
# (model_name, device string), so repeated calls do not reload the weights.
_ENCODER_CACHE = {}


def _load_encoder(model_name, device):
    """Return a cached (tokenizer, model) pair for `model_name` on `device`, loading it on first use."""
    key = (model_name, str(device))
    if key not in _ENCODER_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).to(device)
        model.eval()  # inference only: disables dropout
        _ENCODER_CACHE[key] = (tokenizer, model)
    return _ENCODER_CACHE[key]


def get_mean_embeddings(texts, model_name="bert-base-multilingual-cased", batch_size=32, max_length=512):
    """Embed each text as the attention-mask-weighted mean of the encoder's last hidden states.

    Args:
        texts: Sequence of strings (list, tuple, pandas Series, ...). It is
            materialised into a list before batching.
        model_name: Hugging Face model identifier passed to AutoTokenizer / AutoModel.
        batch_size: Number of texts encoded per forward pass; must be >= 1.
        max_length: Token limit per text; longer inputs are truncated by the tokenizer.

    Returns:
        A 2-D numpy array with one row per input text. For empty input the
        result has shape (0, hidden_size) so that downstream axis=1 reductions
        and concatenations still work.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # The tokenizer is called with a plain list of strings; slices of other
    # sequence types (e.g. a pandas Series) are not guaranteed to be accepted.
    texts = list(texts)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, model = _load_encoder(model_name, device)

    batch_embeddings = []

    print(f"\nGenerating Mean-Pooled embeddings with {model_name}...")
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
            batch_texts = texts[start:start + batch_size]
            encoded = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
            input_ids = encoded["input_ids"].to(device)
            attention_mask = encoded["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden = outputs.last_hidden_state

            # Mean pooling: zero out padded positions via the attention mask,
            # sum over the sequence axis, and divide by the number of attended
            # tokens. Every position the tokenizer marks with attention 1
            # contributes to the mean.
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
            sum_embeddings = torch.sum(last_hidden * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            # clamp guards against division by zero for an all-zero mask row.
            mean_embed = sum_embeddings / torch.clamp(sum_mask, min=1e-9)
            batch_embeddings.append(mean_embed.cpu().numpy())

    if not batch_embeddings:
        # No texts: keep the result 2-D. float32 is assumed to match the dtype
        # the model produces for non-empty input -- TODO confirm for other models.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    return np.concatenate(batch_embeddings, axis=0)

In [9]:
# Missing cells become "" so the tokenizer always receives a string.
texts_zh = df_zh["content"].fillna("").astype(str).tolist()
texts_en_from_zh = df_zh["content_en"].fillna("").astype(str).tolist()

# Blank texts carry no content, so their embeddings (and any similarity computed
# from them) are not meaningful. Report how many rows are affected instead of
# letting them blend silently into the summary statistics and the plot.
# NOTE(review): a very fast English pass together with uniformly low
# similarities would be consistent with many blank `content_en` cells -- check
# this count before interpreting the Chinese/English results.
n_blank_zh = sum(not text.strip() for text in texts_zh)
n_blank_en_from_zh = sum(not text.strip() for text in texts_en_from_zh)
if n_blank_zh or n_blank_en_from_zh:
    print(
        f"Warning: {n_blank_zh} blank Chinese and {n_blank_en_from_zh} blank English texts "
        f"out of {len(df_zh)} rows; their embeddings and similarities are not meaningful."
    )

# Get Chinese embeddings
emb_zh_mean = get_mean_embeddings(texts_zh)

# Get English embeddings for the translated Chinese texts
emb_en_from_zh_mean = get_mean_embeddings(texts_en_from_zh)

# Add embeddings to DataFrame (one Python list of floats per row)
df_zh["embedding_zh_mean"] = [v.tolist() for v in emb_zh_mean]
df_zh["embedding_en_mean"] = [v.tolist() for v in emb_en_from_zh_mean]

# Calculate cosine similarity
def compute_cosine_sim(a, b):
    """Row-wise cosine similarity between two equally shaped 2-D arrays.

    Row i of the result compares a[i] with b[i]. A small epsilon is added to
    the denominator so that zero-norm rows yield 0 instead of a division error.
    """
    numerator = (a * b).sum(axis=1)
    norm_a = np.linalg.norm(a, axis=1)
    norm_b = np.linalg.norm(b, axis=1)
    denominator = norm_a * norm_b + 1e-9
    return numerator / denominator

# Pairwise (row i vs. row i) similarity between each Chinese text and its English counterpart.
df_zh["similarity_mean"] = compute_cosine_sim(emb_zh_mean, emb_en_from_zh_mean)

# --- Display Key Quantitative Results ---
print("\n--- Cosine Similarity Summary (mBERT Mean Pooling on Chinese) ---")
print(df_zh["similarity_mean"].describe())

# Save the complete DataFrame to a Parquet file.
# rsplit on the last dot removes only the extension, so an input such as
# "data.v2.csv" keeps its full stem ("data.v2") instead of being cut to "data".
OUTPUT_FILENAME_ZH = f"{INPUT_CSV_ZH.rsplit('.', 1)[0]}_with_mbert_mean_embeddings.parquet"
print(f"\nSaving results to {OUTPUT_FILENAME_ZH}...")
df_zh.to_parquet(OUTPUT_FILENAME_ZH, index=False)
print("Save complete.")




Generating Mean-Pooled embeddings with bert-base-multilingual-cased...


Embedding: 100%|██████████| 140/140 [02:24<00:00,  1.03s/it]



Generating Mean-Pooled embeddings with bert-base-multilingual-cased...


Embedding: 100%|██████████| 140/140 [00:04<00:00, 28.41it/s] 



--- Cosine Similarity Summary (mBERT Mean Pooling on Chinese) ---
count    4458.000000
mean        0.126887
std         0.107639
min         0.051662
25%         0.100743
50%         0.110442
75%         0.120442
max         0.885240
Name: similarity_mean, dtype: float64

Saving results to zh7k5_with_en_with_mbert_mean_embeddings.parquet...
Save complete.


In [10]:

# Project Chinese and English embeddings with a single UMAP fit so both
# languages share the same 2-D coordinate system.
n_zh = len(df_zh)
all_emb_zh = np.vstack((emb_zh_mean, emb_en_from_zh_mean))

reducer_zh = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
reduced_zh = reducer_zh.fit_transform(all_emb_zh)

# The first n_zh rows are the Chinese points, the remainder the English points.
zh_2d, en_2d = reduced_zh[:n_zh], reduced_zh[n_zh:]

# One row per point; the per-article metadata is repeated once per language.
plot_df_zh = pd.DataFrame({
    'umap_x': np.hstack((zh_2d[:, 0], en_2d[:, 0])),
    'umap_y': np.hstack((zh_2d[:, 1], en_2d[:, 1])),
    'language': ['Chinese'] * n_zh + ['English'] * n_zh,
    'url': df_zh['url'].tolist() * 2,
    'title': df_zh['title'].tolist() * 2,
    'similarity_mean': df_zh['similarity_mean'].tolist() * 2,
})

fig_zh = px.scatter(
    plot_df_zh,
    x='umap_x',
    y='umap_y',
    color='language',
    opacity=0.2,
    hover_data={'url': True, 'title': True, 'similarity_mean': ':.4f'},
    title="Interactive UMAP of Chinese & English Embeddings (mBERT - Mean Pooling)",
)
fig_zh.update_layout(template='plotly_white', width=900, height=700)
fig_zh.show()



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

