# LNG320 Gen Z Slang Similarity Analysis


In [None]:
%pip install -qU datasets pinecone-client "langchain==0.3.27" "langchain-core>=0.3.72,<1.0.0" langchain-pinecone umap-learn scikit-learn plotly tqdm "threadpoolctl==3.5.0"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m95.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m114.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m34.1 MB/s[0m et

In [2]:
import os
from getpass import getpass
from typing import List, Dict, Any

import numpy as np
import pandas as pd
from datasets import load_dataset
from IPython.display import display
from pinecone import Pinecone
from tqdm import tqdm

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN

import plotly.express as px

try:
    import umap  # type: ignore
except ImportError:  # pragma: no cover
    import umap.umap_ as umap  # fallback if namespace layout differs


In [None]:
# Ask for the Pinecone API key interactively only when the environment does
# not already hold a non-empty value, so the secret never lives in the notebook.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Enter your Pinecone API key: ")

# Client used for index management and the inference (embedding) calls.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])


## Load and Prepare Dataset


In [None]:
# Pull the train split from the Hugging Face Hub and work on a pandas copy.
raw_ds = load_dataset("MLBtrio/genz-slang-dataset", split="train")
df = raw_ds.to_pandas().copy()

# Normalise column labels: lower-case first, then trim surrounding whitespace.
df.columns = [label.lower().strip() for label in df.columns]
slang_column = "slang"

# Text that gets embedded for each row:
# "<term> is a slang term that means <description>".
df["input_for_embedding"] = (
    df[slang_column]
    .astype(str)
    .str.cat(df["description"].astype(str), sep=" is a slang term that means ")
)

display(df.head())
print(f"Dataset shape: {df.shape}")


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md: 0.00B [00:00, ?B/s]

all_slangs.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1779 [00:00<?, ? examples/s]

Unnamed: 0,slang,description,example,context,input_for_embedding
0,W,Shorthand for win,"Got the job today, big W!",Typically used in conversations to celebrate s...,W is a slang term that means Shorthand for win
1,L,Shorthand for loss/losing,"I forgot my wallet at home, that’s an L.",Often used when referring to a failure or mish...,L is a slang term that means Shorthand for los...
2,L+ratio,Response to a comment or action on the interne...,Your tweet got 5 likes and 100 replies calling...,Popularized on social media platforms to signi...,L+ratio is a slang term that means Response to...
3,Dank,excellent or of very high quality,That meme is so dank!,Commonly used in internet slang to refer to me...,Dank is a slang term that means excellent or o...
4,Cheugy,Derogatory term for Millennials. Used when mil...,"That phrase is so cheugy, no one says that any...",Used to refer to things that were once popular...,Cheugy is a slang term that means Derogatory t...


Dataset shape: (1779, 5)


In [5]:
import uuid

# Deterministic vector ids: a UUIDv5 of the term text under the DNS namespace,
# so re-running the notebook reproduces the same id for the same term.
# NOTE(review): rows whose slang text is identical would receive the same id —
# confirm the dataset has no duplicate terms.
df["id"] = (
    df[slang_column]
    .astype(str)
    .map(lambda term: str(uuid.uuid5(uuid.NAMESPACE_DNS, term)))
)
print("Sample IDs:")
display(df[["slang", "id"]].head())


Sample IDs:


Unnamed: 0,slang,id
0,W,de7bed5e-22b7-516e-abdd-b6702d14f2c2
1,L,d0f78763-dafa-50e6-98cf-80e37b9ff0ac
2,L+ratio,acc7c7d5-9aaa-5d7d-9176-c8963f4ed424
3,Dank,99ca01c1-e02d-5c30-93f2-843624ab74bd
4,Cheugy,ea5729af-abec-59be-9fa5-2fb1403d92a1


## Pinecone Index Setup


In [6]:
index_name = "lng320-genz-slang"

# Create the index only when no index of that name exists yet; otherwise reuse it.
existing_indexes = {item["name"] for item in pc.list_indexes()}
if index_name in existing_indexes:
    print(f"Using existing index '{index_name}'")
else:
    # Embedding configuration: model name plus the mapping of the model's
    # "text" input to the "input_for_embedding" field.
    embed_config = {
        "model": "llama-text-embed-v2",
        "field_map": {"text": "input_for_embedding"},
    }
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed=embed_config,
    )
    print(f"Created index '{index_name}'")

# Handle used by the fetch / upsert / query calls in later cells.
index = pc.Index(index_name)


Using existing index 'lng320-genz-slang'


## Generate or Retrieve Embeddings


In [None]:
# Toggle: True re-embeds every row through the Pinecone inference API;
# False reads the stored vectors back from the index by id.
GENERATE_EMBEDDINGS = False

if GENERATE_EMBEDDINGS:
    texts = df["input_for_embedding"].tolist()
    # NOTE(review): 96 is presumably the per-request input limit of the
    # embedding endpoint — confirm against the Pinecone documentation.
    batch_size = 96
    embeddings: List[List[float]] = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
        batch = texts[start : start + batch_size]
        embed_result = pc.inference.embed(
            model="llama-text-embed-v2",
            inputs=batch,
            parameters={"input_type": "passage"},
        )
        batch_embeddings = [item.values for item in embed_result.data]
        embeddings.extend(batch_embeddings)

    df["values"] = embeddings
    print(f"Generated {len(df)} embeddings")
else:
    # Collect id -> vector for every row, fetching the ids in batches.
    fetched_vectors: Dict[str, List[float]] = {}
    batch_size = 200
    for start in tqdm(range(0, len(df), batch_size), desc="Fetching embeddings"):
        batch_ids = df["id"].iloc[start : start + batch_size].tolist()
        response = index.fetch(ids=batch_ids)
        fetched_vectors.update(
            {vec_id: vector["values"] for vec_id, vector in response.vectors.items()}
        )

    # Validate BEFORE building the column: looking up an absent id first would
    # raise a bare KeyError and this actionable error would never be reached.
    missing = [vec_id for vec_id in df["id"] if vec_id not in fetched_vectors]
    if missing:
        raise RuntimeError(
            f"Missing vectors for {len(missing)} ids. Regenerate embeddings instead."
        )

    df["values"] = [fetched_vectors[vec_id] for vec_id in df["id"]]


Fetching embeddings: 100%|██████████| 9/9 [00:06<00:00,  1.31it/s]


In [8]:
# Only freshly generated embeddings are written back to the index.
if GENERATE_EMBEDDINGS:
    # One record per row: the id, the vector, and the embedded sentence stored
    # under the "text" metadata key.
    vectors = [
        {"id": vec_id, "values": vec, "metadata": {"text": sentence}}
        for vec_id, vec, sentence in zip(
            df["id"], df["values"], df["input_for_embedding"]
        )
    ]
    batch_size = 100
    batch_starts = range(0, len(vectors), batch_size)
    for start in tqdm(batch_starts, desc="Upserting to Pinecone"):
        stop = start + batch_size
        index.upsert(vectors=vectors[start:stop])
    print("Upserted embeddings to Pinecone")


In [9]:
# Stack the per-row vectors into a single 2-D array (one row per slang term).
embedding_matrix = np.vstack(df["values"].tolist())
print(f"Embedding matrix shape: {embedding_matrix.shape}")


Embedding matrix shape: (1779, 1024)


## Similarity Measurement Configuration


In [10]:
def embed_texts(texts: List[str]) -> np.ndarray:
    """Embed free-form strings with ``llama-text-embed-v2`` as queries.

    Args:
        texts: Strings to embed; sent to the inference API in a single request.

    Returns:
        2-D array with one row per embedding returned by the API.
    """
    response = pc.inference.embed(
        model="llama-text-embed-v2",
        inputs=texts,
        # "query" here, in contrast to the "passage" type used when embedding
        # the dataset rows.
        parameters={"input_type": "query"},
    )
    embedding_rows = [embedding.values for embedding in response.data]
    return np.vstack(embedding_rows)


def cosine_similarity_for_terms(terms: List[str]) -> pd.DataFrame:
    """Compute pairwise cosine similarity between slang terms (case-insensitive).

    Args:
        terms: Slang terms to compare. Matching against the dataset ignores
            case; repeating a term has no additional effect.

    Returns:
        Square DataFrame of cosine similarities, labelled on both axes with the
        slang text as stored in the dataset. Rows and columns follow dataset
        order, not the order of ``terms``, and every dataset row that matches a
        requested term case-insensitively is included.

    Raises:
        ValueError: If ``terms`` is empty, or if at least one requested term
            has no match in the dataset.
    """
    if not terms:
        raise ValueError("At least one slang term is required")

    lowercase_terms = [term.lower() for term in terms]
    rows = df[df[slang_column].str.lower().isin(lowercase_terms)]

    # Decide on the set of unmatched terms itself. Comparing len(rows) with
    # len(terms) misfires when the caller repeats a term or when the dataset
    # holds several case-variants of one term.
    missing = set(lowercase_terms) - set(rows[slang_column].str.lower())
    if missing:
        raise ValueError(f"Missing slang terms in dataset: {missing}")

    vectors = np.vstack(rows["values"].to_numpy())
    sim_matrix = cosine_similarity(vectors)
    labels = rows[slang_column].tolist()
    return pd.DataFrame(sim_matrix, index=labels, columns=labels)


# Spot-check the pairwise similarity helper on a handful of known terms.
sample_terms = ["W", "L", "Cheugy", "Dank"]
cosine_df = cosine_similarity_for_terms(terms=sample_terms)
display(cosine_df.round(decimals=3))


Unnamed: 0,W,L,Dank,Cheugy
W,1.0,0.801,0.562,0.303
L,0.801,1.0,0.501,0.294
Dank,0.562,0.501,1.0,0.303
Cheugy,0.303,0.294,0.303,1.0


In [11]:
# Embed a free-text description and rank every dataset row against it locally.
query_string = "an enthusiastic expression of success"
query_vector = embed_texts([query_string])
# One similarity score per dataset row (despite the name, these are scores,
# not vectors).
existing_vectors = cosine_similarity(query_vector, embedding_matrix).ravel()

# Positions of the five highest scores, best first.
nearest_idx = np.argsort(existing_vectors)[::-1][:5]
results = (
    df.iloc[nearest_idx]
    .loc[:, ["slang", "description"]]
    .assign(cosine_similarity=existing_vectors[nearest_idx])
)
display(results)


Unnamed: 0,slang,description,cosine_similarity
107,Yas,A very enthusiastic and celebratory way of say...,0.311241
1738,YEET,approval or display of energy,0.260336
115,Poggers,an expression that conveys excitement,0.256329
61,I oop,"Used to express shock, embarrassment, and or a...",0.24505
79,Vibing,describes a generic positive feeling that some...,0.227551


## Nearest Neighbor Analysis


In [12]:
def pinecone_top_k(vector: List[float], top_k: int = 5) -> pd.DataFrame:
    """Query the Pinecone index for the stored vectors closest to ``vector``.

    Args:
        vector: Query embedding, handed to ``index.query`` unchanged.
        top_k: Maximum number of matches to request.

    Returns:
        DataFrame with the columns ``id``, ``score`` and ``text`` (the ``text``
        entry of each match's metadata), one row per match in the order the
        index returned them. The three columns are present even when the query
        yields no matches.
    """
    response = index.query(vector=vector, top_k=top_k, include_metadata=True)
    # Treat an absent or null match list as "no matches".
    matches = response.get("matches", []) or []
    records = [
        {
            "id": match["id"],
            "score": match["score"],
            "text": match["metadata"].get("text"),
        }
        for match in matches
    ]
    # Naming the columns keeps the schema stable when there are no records;
    # without it an empty result would be a frame with no columns at all.
    return pd.DataFrame(records, columns=["id", "score", "text"])


def nearest_neighbors_for_slang(term: str, top_k: int = 5) -> pd.DataFrame:
    """Look up a slang term (case-insensitive) and query the index with its vector.

    Args:
        term: Slang term to look up in the dataset.
        top_k: Number of matches to request from the index.

    Returns:
        The ``pinecone_top_k`` result for the first dataset row matching ``term``.

    Raises:
        ValueError: If no dataset row matches ``term``.
    """
    wanted = term.lower()
    candidates = df[df[slang_column].str.lower() == wanted]
    if candidates.empty:
        raise ValueError(f"Slang term '{term}' not found")
    # When several rows match, the first one in dataset order is used.
    first_match = candidates.iloc[0]
    return pinecone_top_k(first_match["values"], top_k=top_k)


def nearest_neighbors_for_text(text: str, top_k: int = 5) -> pd.DataFrame:
    """Embed ``text`` as a query and return its closest matches from the index.

    Args:
        text: Free-form string to embed.
        top_k: Number of matches to request from the index.

    Returns:
        The ``pinecone_top_k`` result for the embedded text.
    """
    embedded = embed_texts([text])
    # embed_texts returns one row per input; take the only row as a plain list.
    query_vector = embedded[0].tolist()
    return pinecone_top_k(query_vector, top_k=top_k)


# Example: index neighbours of the stored vector for the term "W".
sample_neighbors = nearest_neighbors_for_slang(term="W", top_k=5)
display(sample_neighbors)


Unnamed: 0,id,score,text
0,de7bed5e-22b7-516e-abdd-b6702d14f2c2,1.000053,W is a slang term that means Shorthand for win
1,98f01c91-19d5-528f-adcf-afa68a43f3b0,0.8218,*w* is a slang term that means wink
2,1ed7239e-bd59-5fca-8e47-6ada12c27cba,0.81067,R is a slang term that means are
3,912540b4-4a3e-5fbd-bea0-1e1da48b5fb5,0.809025,U is a slang term that means You
4,4d6dbbfc-2a3e-56bf-9bd9-ce74bea036c8,0.802785,W/ is a slang term that means With


In [13]:
# Example: index neighbours of a free-text description.
query_neighbors = nearest_neighbors_for_text(text="shorthand for losing", top_k=5)
display(query_neighbors)


Unnamed: 0,id,score,text
0,d0f78763-dafa-50e6-98cf-80e37b9ff0ac,0.446528,L is a slang term that means Shorthand for los...
1,ee14345b-db5f-5d3d-a2af-f1ff9153268f,0.344265,YHL is a slang term that means You have lost
2,f92945c0-18cc-519e-8881-396e89aa593a,0.315539,FTL is a slang term that means For the loss
3,10324f63-625c-58cc-9763-ee42ff6e7411,0.311738,LNT is a slang term that means Meaning lost in...
4,8e300a04-37a3-5621-b54f-6c728b144a2a,0.286572,YWSYLS is a slang term that means You win some...


## Dimensionality Reduction and Visualization


In [14]:
# 2-D t-SNE projection of the embeddings (cosine metric, fixed seed).
tsne = TSNE(
    n_components=2,
    metric="cosine",
    init="random",
    learning_rate="auto",
    random_state=42,
)
tsne_coords = tsne.fit_transform(embedding_matrix)

# 2-D UMAP projection with the same metric and seed.
umap_model = umap.UMAP(n_components=2, metric="cosine", random_state=42)
umap_coords = umap_model.fit_transform(embedding_matrix)

# Plotting frame: term, description, and both sets of 2-D coordinates.
viz_df = df[["slang", "description"]].copy()
viz_df["tsne_x"] = tsne_coords[:, 0]
viz_df["tsne_y"] = tsne_coords[:, 1]
viz_df["umap_x"] = umap_coords[:, 0]
viz_df["umap_y"] = umap_coords[:, 1]

viz_df.head()


  warn(


Unnamed: 0,slang,description,tsne_x,tsne_y,umap_x,umap_y
0,W,Shorthand for win,15.107715,-1.864938,4.622261,0.004025
1,L,Shorthand for loss/losing,15.765772,-1.232628,4.785416,0.533991
2,L+ratio,Response to a comment or action on the interne...,-64.47789,16.419079,2.922814,2.855519
3,Dank,excellent or of very high quality,-10.960285,50.223122,5.730495,1.413469
4,Cheugy,Derogatory term for Millennials. Used when mil...,-51.811977,18.637838,4.315547,3.304321


In [15]:
# Interactive scatter of the t-SNE projection; hovering shows term and description.
tsne_hover = {"slang": True, "description": True}
fig_tsne = px.scatter(
    data_frame=viz_df,
    x="tsne_x",
    y="tsne_y",
    hover_data=tsne_hover,
    title="t-SNE Visualization of Gen Z Slang Embeddings",
    width=900,
    height=600,
)
fig_tsne.show()


In [16]:
# Interactive scatter of the UMAP projection; hovering shows term and description.
umap_hover = {"slang": True, "description": True}
fig_umap = px.scatter(
    data_frame=viz_df,
    x="umap_x",
    y="umap_y",
    hover_data=umap_hover,
    title="UMAP Visualization of Gen Z Slang Embeddings",
    width=900,
    height=600,
)
fig_umap.show()


## Clustering Analysis


In [17]:
# K-Means on the raw embeddings with a fixed number of clusters and a fixed seed.
kmeans_clusters = 15
kmeans = KMeans(
    n_clusters=kmeans_clusters,
    init="k-means++",
    n_init=10,
    random_state=42,
)
kmeans_labels = kmeans.fit_predict(embedding_matrix)

# DBSCAN with cosine distance; points it cannot place get the label -1.
dbscan = DBSCAN(eps=0.3, min_samples=5, metric="cosine")
dbscan_labels = dbscan.fit_predict(embedding_matrix)

viz_df["kmeans_cluster"] = kmeans_labels
viz_df["dbscan_cluster"] = dbscan_labels

# Number of terms per cluster label, as one-column ("count") frames.
kmeans_summary = viz_df.groupby("kmeans_cluster")["slang"].count().to_frame("count")
dbscan_summary = viz_df.groupby("dbscan_cluster")["slang"].count().to_frame("count")

print("K-Means cluster counts:")
display(kmeans_summary.sort_values(by="count", ascending=False))

print("DBSCAN cluster counts (-1 = noise):")
display(dbscan_summary.sort_values(by="count", ascending=False))


Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7d0a1dfeed40>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.12/dist-packages/threadpoolctl.py", line 1175, in _make_controller_from_path
    lib_controller = controller_class(
                     ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/ctypes/__init__.py", line 379, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: dlopen() error


K-Means cluster counts:


Unnamed: 0_level_0,count
kmeans_cluster,Unnamed: 1_level_1
5,305
0,178
9,155
10,136
8,129
4,122
3,119
1,101
6,101
7,100


DBSCAN cluster counts (-1 = noise):


Unnamed: 0_level_0,count
dbscan_cluster,Unnamed: 1_level_1
0,1430
-1,339
1,5
2,5


In [18]:
# Settings shared by both cluster-coloured UMAP scatters.
umap_scatter_kwargs = dict(
    x="umap_x",
    y="umap_y",
    hover_data={"slang": True, "description": True},
    width=900,
    height=600,
)

# Cluster labels are cast to str before being used as the colour values.
fig_umap_kmeans = px.scatter(
    viz_df,
    color=viz_df["kmeans_cluster"].astype(str),
    title="UMAP with K-Means Cluster Assignments",
    **umap_scatter_kwargs,
)
fig_umap_kmeans.show()

fig_umap_dbscan = px.scatter(
    viz_df,
    color=viz_df["dbscan_cluster"].astype(str),
    title="UMAP with DBSCAN Cluster Assignments",
    **umap_scatter_kwargs,
)
fig_umap_dbscan.show()


In [19]:
def sample_cluster_terms(
    cluster_column: str, cluster_id: int, sample_size: int = 10
) -> pd.DataFrame:
    """Return the first few terms assigned to one cluster.

    Args:
        cluster_column: Name of the ``viz_df`` column holding cluster labels.
        cluster_id: Cluster label to select.
        sample_size: Maximum number of rows to return.

    Returns:
        The ``slang`` and ``description`` columns of the first ``sample_size``
        rows (in ``viz_df`` order) whose label equals ``cluster_id``.
    """
    in_cluster = viz_df[cluster_column] == cluster_id
    return viz_df.loc[in_cluster, ["slang", "description"]].head(sample_size)


# Peek at the first ten terms of K-Means cluster 0.
sample_cluster_terms(cluster_column="kmeans_cluster", cluster_id=0)


Unnamed: 0,slang,description
66,NPC,Someone who cannot think for themselves and/or...
94,Low key,Secretly
145,AFK,Away From Keyboard
146,ASAP,As Soon As Possible
163,FAQ,Frequently Asked Questions
178,IOW,In Other Words
240,@TEOTD,At the end of the day
243,1UP,Meaning extra life
245,1337,"Leet, meaning ‘elite’"
302,AAMOI,As a matter of interest


## Next Steps and Notes


## Export CSV Files


In [None]:
import os
from pathlib import Path

# All CSV exports land in a directory relative to the working directory.
output_dir = Path("csv_exports")
output_dir.mkdir(exist_ok=True)

print("Exporting CSV files for analysis...")
print(f"Output directory: {output_dir.absolute()}\n")

# Main dataset: ids, terms, descriptions and the embedded sentence.
main_dataset = df.loc[:, ["id", "slang", "description", "input_for_embedding"]].copy()
main_dataset_path = output_dir.joinpath("genz_slang_main_dataset.csv")
main_dataset.to_csv(main_dataset_path, index=False, encoding="utf-8")
print(f"✓ Exported main dataset: {main_dataset_path} ({len(main_dataset)} rows)")

# Visualization frame: 2-D coordinates plus cluster labels.
viz_clusters_path = output_dir.joinpath("genz_slang_visualization_clusters.csv")
viz_df.to_csv(viz_clusters_path, index=False, encoding="utf-8")
print(f"✓ Exported visualization & clusters: {viz_clusters_path} ({len(viz_df)} rows)")

print("\nComputing full pairwise similarity matrix...")
# Square matrix of cosine similarities between every pair of embeddings,
# labelled on both axes with the slang terms.
full_similarity_matrix = cosine_similarity(embedding_matrix)
slang_labels = df[slang_column].tolist()
similarity_df = pd.DataFrame(
    full_similarity_matrix, index=slang_labels, columns=slang_labels
)
similarity_path = output_dir / "genz_slang_pairwise_similarity.csv"
# The index is written as well, so the first CSV column holds the row labels.
similarity_df.to_csv(similarity_path, encoding="utf-8")
print(
    f"✓ Exported pairwise similarity matrix: {similarity_path} ({similarity_df.shape[0]}x{similarity_df.shape[1]})"
)

print("\nComputing nearest neighbors for all terms...")
top_k = 10
nearest_neighbors_records = []
slang_terms = df[slang_column].tolist()

for source_pos, source_slang in enumerate(tqdm(slang_terms, desc="Processing terms")):
    # Reuse the pairwise matrix computed earlier in this cell instead of
    # recomputing one similarity row per term.
    similarities = full_similarity_matrix[source_pos]

    # Rank all rows by similarity (best first) and drop the source row by
    # POSITION. Filtering on a similarity threshold would also discard genuine
    # duplicate or near-duplicate entries and could return fewer than top_k.
    ranked = np.argsort(similarities)[::-1]
    top_indices = ranked[ranked != source_pos][:top_k]

    for rank, target_idx in enumerate(top_indices, start=1):
        nearest_neighbors_records.append(
            {
                "source_slang": source_slang,
                "target_slang": slang_terms[target_idx],
                "similarity_score": float(similarities[target_idx]),
                "rank": rank,
            }
        )

# Explicit columns keep the CSV header intact even if no records were produced.
nearest_neighbors_df = pd.DataFrame(
    nearest_neighbors_records,
    columns=["source_slang", "target_slang", "similarity_score", "rank"],
)
nearest_neighbors_path = output_dir / "genz_slang_nearest_neighbors.csv"
nearest_neighbors_df.to_csv(nearest_neighbors_path, index=False, encoding="utf-8")
print(
    f"✓ Exported nearest neighbors: {nearest_neighbors_path} ({len(nearest_neighbors_df)} rows)"
)

cluster_summaries_records = []

# One pass per clustering method. Each entry is (label written to the CSV,
# viz_df column holding that method's cluster ids).
cluster_sources = (("kmeans", "kmeans_cluster"), ("dbscan", "dbscan_cluster"))

for cluster_type, cluster_column in cluster_sources:
    # groupby iterates the cluster ids in ascending order.
    for cluster_id, cluster_rows in viz_df.groupby(cluster_column, sort=True):
        # A distinct name is used here so the `sample_terms` list defined in an
        # earlier cell is not overwritten with a string; astype(str) keeps the
        # join from failing on a non-string entry.
        cluster_sample_terms = ", ".join(
            cluster_rows["slang"].head(10).astype(str).tolist()
        )
        cluster_summaries_records.append(
            {
                "cluster_type": cluster_type,
                "cluster_id": int(cluster_id),
                "term_count": len(cluster_rows),
                "sample_terms": cluster_sample_terms,
            }
        )

cluster_summaries_df = pd.DataFrame(cluster_summaries_records)
cluster_summaries_path = output_dir / "genz_slang_cluster_summaries.csv"
cluster_summaries_df.to_csv(cluster_summaries_path, index=False, encoding="utf-8")
print(
    f"✓ Exported cluster summaries: {cluster_summaries_path} ({len(cluster_summaries_df)} rows)"
)

print("\nExporting embeddings...")
# Wide table: the slang term first, then one column per embedding dimension.
embedding_columns = [f"embed_dim_{i}" for i in range(embedding_matrix.shape[1])]
embeddings_df = pd.DataFrame(embedding_matrix, columns=embedding_columns)
embeddings_df.insert(0, "slang", df[slang_column].tolist())
embeddings_path = output_dir / "genz_slang_embeddings.csv"
embeddings_df.to_csv(embeddings_path, index=False, encoding="utf-8")
print(
    f"✓ Exported embeddings: {embeddings_path} ({len(embeddings_df)} rows, {embedding_matrix.shape[1]} dimensions)"
)

banner = "=" * 60
print("\n" + banner)
print("All CSV files exported successfully!")
print(banner)


In [None]:
import zipfile
from pathlib import Path

zip_path = Path("genz_slang_analysis_csvs.zip")
output_dir = Path("csv_exports")

# Sort the files so the archive's member order is the same on every run;
# glob() itself makes no ordering promise.
csv_files = sorted(output_dir.glob("*.csv"))
if not csv_files:
    print(f"Warning: no CSV files found in {output_dir.absolute()}")

# Mode "w" truncates an existing archive, so no explicit delete is needed.
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
    for csv_file in csv_files:
        # Store each file under its bare name (no directory prefix).
        zipf.write(csv_file, arcname=csv_file.name)
        print(f"Added to zip: {csv_file.name}")

print(f"\n✓ Created zip archive: {zip_path.absolute()}")
print(f"  Archive size: {zip_path.stat().st_size / (1024 * 1024):.2f} MB")
