In [1]:
from google.colab import files

# Open Colab's file picker. The result maps each uploaded filename to its
# raw bytes (it is indexed that way when the CSV is parsed later on).
uploaded = files.upload()

# If nothing was uploaded (e.g. the dialog was dismissed) the mapping is
# empty; fail with a clear message instead of an opaque StopIteration.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Only the first uploaded file is used by the rest of the notebook.
INPUT_CSV = next(iter(uploaded))
print("Uploaded:", INPUT_CSV)

Saving zh7k5_with_en.csv to zh7k5_with_en.csv
Uploaded: zh7k5_with_en.csv


In [6]:
from google.colab import files
import io
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import umap
import plotly.express as px
# Parse the uploaded CSV straight from its in-memory bytes.
csv_buffer = io.BytesIO(uploaded[INPUT_CSV])
df = pd.read_csv(csv_buffer)
print("DataFrame loaded.")
# Optional sanity check of columns and dtypes: df.info()


DataFrame loaded.


In [23]:
from sklearn.metrics.pairwise import cosine_similarity
# Load LaBSE. No device is forced: SentenceTransformer picks a GPU when one is
# available and otherwise runs on CPU, so this cell no longer fails on a
# CPU-only runtime because of a hard-coded "cuda".
model = SentenceTransformer("sentence-transformers/LaBSE")

# Re-read the file that was actually uploaded (same source as the load cell)
# instead of a hard-coded filename, so a differently named upload is not
# silently replaced by whatever file happens to be on disk.
df = pd.read_csv(io.BytesIO(uploaded[INPUT_CSV]))

# Missing cells become empty strings; astype(str) coerces any non-string
# cells, matching how the Spanish texts are prepared later in this notebook.
texts_zh = df["content"].fillna("").astype(str).tolist()
texts_en = df["content_en"].fillna("").astype(str).tolist()

# Generate embeddings: one vector per text, in the same order as the rows.
emb_zh = model.encode(texts_zh, batch_size=32, convert_to_numpy=True, show_progress_bar=True)
emb_en = model.encode(texts_en, batch_size=32, convert_to_numpy=True, show_progress_bar=True)

# Store each vector as a plain Python list so the columns can be written out.
df["embedding_zh"] = [v.tolist() for v in emb_zh]
df["embedding_en"] = [v.tolist() for v in emb_en]

# Persist the frame together with its embeddings.
df.to_parquet("zh7k5_with_en_labse.parquet", index=False)


Batches:   0%|          | 0/140 [00:00<?, ?it/s]

Batches:   0%|          | 0/140 [00:00<?, ?it/s]

In [26]:
# Calculate cosine similarity
numerator = np.sum(emb_zh * emb_en, axis=1)
denominator = np.linalg.norm(emb_zh, axis=1) * np.linalg.norm(emb_en, axis=1)
df["similarity_labse"] = numerator / (denominator + 1e-9)

# Add embeddings to DataFrame for saving
df["embedding_zh_labse"] = [v.tolist() for v in emb_zh]
df["embedding_en_labse"] = [v.tolist() for v in emb_en]

# Display Key Quantitative Results
sim = df["similarity_labse"]
print("\n--- Cosine Similarity Summary (LaBSE) ---")
print(sim.describe())

threshold = 0.75
count_below_threshold = (sim < threshold).sum()
print(f"\nTotal rows with similarity < {threshold}: {count_below_threshold} / {len(sim)}")
print("-----------------------------------------")

OUTPUT_FILENAME = f"{INPUT_CSV.split('.')[0]}_with_labse_embeddings.parquet"
print(f"\nSaving results to {OUTPUT_FILENAME}...")
df.to_parquet(OUTPUT_FILENAME, index=False)




--- Cosine Similarity Summary (LaBSE) ---
count    4458.000000
mean        0.123237
std         0.119383
min        -0.103340
25%         0.070672
50%         0.110989
75%         0.144808
max         0.949043
Name: similarity_labse, dtype: float64

Total rows with similarity < 0.75: 4375 / 4458
-----------------------------------------

Saving results to zh7k5_with_en_with_labse_embeddings.parquet...


In [27]:
# Recover dense arrays from the list-valued embedding columns.
emb_zh, emb_en = (
    np.array(df[column].tolist()) for column in ("embedding_zh", "embedding_en")
)

# Stack the Chinese rows on top of the English rows so that a single UMAP fit
# projects both languages together.
all_emb = np.concatenate((emb_zh, emb_en))

reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
reduced = reducer.fit_transform(all_emb)

# The first n projected rows are Chinese, the remaining rows are English.
n = len(df)
zh_2d, en_2d = reduced[:n], reduced[n:]
print("UMAP process complete.")


# --- Assemble the plotting frame ---
print("Preparing data for plotting...")
# Every source row appears twice (once per language), so the per-row metadata
# is repeated in the same order as the stacked embeddings.
paired_meta = {
    column: df[column].tolist() * 2
    for column in ('url', 'title', 'similarity_labse')
}
plot_df = pd.DataFrame({
    'umap_x': reduced[:, 0],
    'umap_y': reduced[:, 1],
    'language': ['Chinese'] * n + ['English'] * n,
    **paired_meta,
})


# --- Draw the interactive scatter plot ---
print("Generating interactive plot...")
# Hover shows url, title and the similarity score (4 decimals); the language
# and the raw UMAP coordinates are hidden from the tooltip.
hover_fields = {
    'url': True,
    'title': True,
    'similarity_labse': ':.4f',
    'language': False,
    'umap_x': False,
    'umap_y': False,
}
fig = px.scatter(
    plot_df,
    x='umap_x',
    y='umap_y',
    color='language',
    opacity=0.2,
    hover_data=hover_fields,
    title="Interactive UMAP of Chinese & English Embeddings (LaBSE)",
)

# White theme, fixed canvas size, readable axis and legend titles, small markers.
fig.update_layout(
    template='plotly_white',
    width=900,
    height=700,
    legend_title_text='Language',
    xaxis_title="UMAP Dimension 1",
    yaxis_title="UMAP Dimension 2",
).update_traces(marker=dict(size=5))
fig.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP process complete.
Preparing data for plotting...
Generating interactive plot...


In [28]:
# Prompt for the Spanish dataset; the returned mapping (filename -> file bytes)
# is consumed by the following cell.
uploaded_es = files.upload()


Saving es7k5_with_en.csv to es7k5_with_en.csv


In [29]:
# An empty mapping means nothing was uploaded; fail with a clear message
# instead of an opaque StopIteration from next(iter(...)).
if not uploaded_es:
    raise ValueError("No Spanish file was uploaded; re-run the upload cell and select a CSV file.")

# Only the first uploaded file is used.
INPUT_CSV_ES = next(iter(uploaded_es))
print(f"Successfully uploaded: {INPUT_CSV_ES}")

# Parse the CSV directly from the uploaded bytes held in memory.
df_es = pd.read_csv(io.BytesIO(uploaded_es[INPUT_CSV_ES]))
print("Spanish DataFrame loaded.")

Successfully uploaded: es7k5_with_en.csv
Spanish DataFrame loaded.


In [30]:
# Turn both text columns into plain lists of strings; missing cells become "".
spanish_column = df_es["content"].fillna("")
english_column = df_es["content_en"].fillna("")
texts_es = spanish_column.astype(str).tolist()
texts_en_from_es = english_column.astype(str).tolist()

# Both encoding passes share the same settings.
encode_kwargs = {"batch_size": 32, "show_progress_bar": True}

print("Generating embeddings for Spanish texts...")
emb_es = model.encode(texts_es, **encode_kwargs)

print("Generating embeddings for their English translations...")
emb_en_from_es = model.encode(texts_en_from_es, **encode_kwargs)


Generating embeddings for Spanish texts...


Batches:   0%|          | 0/198 [00:00<?, ?it/s]

Generating embeddings for their English translations...


Batches:   0%|          | 0/198 [00:00<?, ?it/s]

In [31]:
# Calculate cosine similarity
numerator_es = np.sum(emb_es * emb_en_from_es, axis=1)
denominator_es = np.linalg.norm(emb_es, axis=1) * np.linalg.norm(emb_en_from_es, axis=1)
df_es["similarity_labse"] = numerator_es / (denominator_es + 1e-9)

# Add embeddings to DataFrame for saving
df_es["embedding_es"] = [v.tolist() for v in emb_es]
df_es["embedding_en"] = [v.tolist() for v in emb_en_from_es]

# --- Display Key Quantitative Results for Spanish Data ---
sim_es = df_es["similarity_labse"]
print("\n--- Cosine Similarity Summary (LaBSE on Spanish) ---")
print(sim_es.describe())

threshold = 0.75
count_below_threshold_es = (sim_es < threshold).sum()
print(f"\nTotal rows with similarity < {threshold}: {count_below_threshold_es} / {len(sim_es)}")
print("----------------------------------------------------")

# Save results to a new Parquet file
OUTPUT_FILENAME_ES = f"{INPUT_CSV_ES.split('.')[0]}_with_labse_embeddings.parquet"
print(f"\nSaving results to {OUTPUT_FILENAME_ES}...")
df_es.to_parquet(OUTPUT_FILENAME_ES, index=False)
print("Save complete.")



--- Cosine Similarity Summary (LaBSE on Spanish) ---
count    6336.000000
mean        0.825364
std         0.065956
min         0.154070
25%         0.788380
50%         0.828545
75%         0.865217
max         1.000000
Name: similarity_labse, dtype: float64

Total rows with similarity < 0.75: 680 / 6336
----------------------------------------------------

Saving results to es7k5_with_en_with_labse_embeddings.parquet...
Save complete.


In [32]:
print("\nStarting UMAP process for Spanish visualization...")

# --- Joint UMAP projection ---
# Spanish rows come first and their English counterparts second, so one fit
# projects both languages together.
all_emb_es = np.concatenate((emb_es, emb_en_from_es))

reducer_es = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
reduced_es = reducer_es.fit_transform(all_emb_es)

# Split the projection back into its Spanish and English halves.
n_es = len(df_es)
es_2d, en_from_es_2d = reduced_es[:n_es], reduced_es[n_es:]
print("UMAP process complete.")


# --- Assemble the plotting frame ---
print("Preparing data for plotting...")
# Each source row is plotted twice (once per language), so its metadata is
# repeated in the same order as the stacked embeddings.
language_labels = ['Spanish'] * n_es + ['English'] * n_es
plot_df_es = pd.DataFrame(reduced_es, columns=['umap_x', 'umap_y']).assign(
    language=language_labels,
    url=df_es['url'].tolist() * 2,
    title=df_es['title'].tolist() * 2,
    similarity_labse=df_es['similarity_labse'].tolist() * 2,
)


# --- Draw the interactive scatter plot ---
print("Generating interactive plot...")
# Tooltip: url, title and the similarity score (4 decimals); the language and
# raw coordinates are hidden.
hover_fields_es = {
    'url': True,
    'title': True,
    'similarity_labse': ':.4f',
    'language': False,
    'umap_x': False,
    'umap_y': False,
}
fig_es = px.scatter(
    plot_df_es,
    x='umap_x',
    y='umap_y',
    color='language',
    opacity=0.2,
    hover_data=hover_fields_es,
    title="Interactive UMAP of Spanish & English Embeddings (LaBSE)",
)

# White theme, fixed canvas size, axis and legend titles, small markers.
layout_options = dict(
    template='plotly_white',
    width=900,
    height=700,
    legend_title_text='Language',
    xaxis_title="UMAP Dimension 1",
    yaxis_title="UMAP Dimension 2",
)
fig_es.update_layout(**layout_options)
fig_es.update_traces(marker=dict(size=5))
fig_es.show()


Starting UMAP process for Spanish visualization...



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP process complete.
Preparing data for plotting...
Generating interactive plot...
