In [1]:
import os
from getpass import getpass
import uuid

from IPython.display import display
from datasets import load_dataset
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
from openai import OpenAI

load_dotenv()

True

In [2]:
ds = load_dataset("MLBtrio/genz-slang-dataset", split="train")
df = ds.to_pandas()

print(df.head())
print("\nDataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())

     Slang                                        Description  \
0        W                                  Shorthand for win   
1        L                          Shorthand for loss/losing   
2  L+ratio  Response to a comment or action on the interne...   
3     Dank                  excellent or of very high quality   
4   Cheugy  Derogatory term for Millennials. Used when mil...   

                                             Example  \
0                          Got the job today, big W!   
1           I forgot my wallet at home, that’s an L.   
2  Your tweet got 5 likes and 100 replies calling...   
3                              That meme is so dank!   
4  That phrase is so cheugy, no one says that any...   

                                             Context  
0  Typically used in conversations to celebrate s...  
1  Often used when referring to a failure or mish...  
2  Popularized on social media platforms to signi...  
3  Commonly used in internet slang to refer to me...

In [3]:
df.columns = [c.lower().strip() for c in df.columns]
slang_col = "slang"

In [4]:
df["input_for_embedding"] = (
    df[slang_col].astype(str) + " is a slang term that means " + df["description"]
)

In [5]:
display(df["input_for_embedding"])

0          W is a slang term that means Shorthand for win
1       L is a slang term that means Shorthand for los...
2       L+ratio is a slang term that means Response to...
3       Dank is a slang term that means excellent or o...
4       Cheugy is a slang term that means Derogatory t...
                              ...                        
1774          ZH is a slang term that means Sleeping Hour
1775            ZOMG is a slang term that means Oh My God
1776        ZOT is a slang term that means Zero tolerance
1777            ZUP is a slang term that means What’s up?
1778             ZZZZ is a slang term that means Sleeping
Name: input_for_embedding, Length: 1779, dtype: object

## EMBEDDING TO VECTOR SPACE


In [6]:
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY") or getpass(
    "Enter your Pinecone API key: "
)

In [7]:
os.environ["OPENROUTER_API_KEY"] = os.getenv("OPENROUTER_API_KEY") or getpass(
    "Enter your OpenRouter API key: "
)

In [8]:
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [9]:
openrouter = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
)


In [10]:
index_name = "lng320-genz-slang-qwen3-8b"

if not any(idx["name"] == index_name for idx in pc.list_indexes()):
    pc.create_index(
        name=index_name,
        dimension=4096,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print(f"Created index '{index_name}'")


In [11]:
index = pc.Index(index_name)

In [12]:
df["id"] = [str(uuid.uuid5(uuid.NAMESPACE_DNS, str(s))) for s in df["slang"]]

In [14]:
all_ids = df["id"].tolist()
existing_ids: set[str] = set()

batch_size = 100
for i in range(0, len(all_ids), batch_size):
    batch_ids = all_ids[i : i + batch_size]
    fetch_response = index.fetch(ids=batch_ids)
    existing_ids.update(fetch_response.vectors.keys())

print(f"Found {len(existing_ids)} existing vectors, skipping duplicates...")

skipped = 0
for row in tqdm(
    df.itertuples(index=False), desc="Embedding & Upserting", total=len(df)
):
    if row.id in existing_ids:
        skipped += 1
        continue

    response = openrouter.embeddings.create(
        model="qwen/qwen3-embedding-8b",
        input=row.input_for_embedding,
    )
    vector = {
        "id": row.id,
        "values": response.data[0].embedding,
        "metadata": {
            "slang": row.slang,
            "description": row.description,
            "example": row.example,
            "context": row.context,
            "text": row.input_for_embedding,
        },
    }
    index.upsert(vectors=[vector])

print(f"Done. Skipped {skipped} duplicates, upserted {len(df) - skipped} new vectors.")

Found 1113 existing vectors, skipping duplicates...


Embedding & Upserting: 100%|██████████| 1779/1779 [16:48<00:00,  1.76it/s] 

Done. Skipped 1260 duplicates, upserted 519 new vectors.



