# Exploring slang tokenization with the tiktoken library


In [3]:
# Gen-Z slang vocabulary to run through the tokenizers.
# Every entry is a single word, so a whitespace-separated string split into a
# list keeps the definition compact; order is preserved.
genz_words = (
    "lit fam vibe slay flex skrrt "
    "skibidi sigma delulu sus bussin cap"
).split()

In [4]:
# No cell in this notebook imports tiktoken, so a fresh kernel (Restart & Run All)
# would fail with NameError on the lines below. Import it here.
import tiktoken

# GPT-4o tokenizer.
# NOTE: despite the variable name (kept because later cells reference it), this loads
# the encoding tiktoken associates with the "gpt-4o" model (o200k_base), not the one
# for the original "gpt-4" model (cl100k_base). Results labelled "GPT-4" in this
# notebook therefore describe the GPT-4o vocabulary.
gpt4_tokenizer = tiktoken.encoding_for_model("gpt-4o")
# GPT-2 tokenizer, looked up directly by encoding name.
gpt2_tokenizer = tiktoken.get_encoding("gpt2")

# Helper: how many tokens does each word turn into?
def count_tokens(words, tokenizer):
    """Return the token count of every entry in ``words``.

    Args:
        words: Iterable of strings to tokenize.
        tokenizer: Any object exposing ``encode(text)`` that returns a sized
            sequence of tokens.

    Returns:
        A list of ints, one per input entry and in the same order, where each
        value is ``len(tokenizer.encode(entry))``.
    """
    counts = []
    for text in words:
        token_ids = tokenizer.encode(text)
        counts.append(len(token_ids))
    return counts

In [5]:
# Report the per-word token counts of genz_words, once per tokenizer.
# The second heading begins with a newline so a blank line separates the two reports.
# NOTE(review): the first heading reads "GPT-4", but gpt4_tokenizer is created with
# encoding_for_model("gpt-4o") in the setup cell.
for heading, encoder in (
    ("GPT-4 Token Counts:", gpt4_tokenizer),
    ("\nGPT-2 Token Counts:", gpt2_tokenizer),
):
    print(heading)
    per_word_counts = count_tokens(genz_words, encoder)
    for word, count in zip(genz_words, per_word_counts):
        print(f"{word}: {count} token(s)")

GPT-4 Token Counts:
lit: 1 token(s)
fam: 1 token(s)
vibe: 2 token(s)
slay: 2 token(s)
flex: 1 token(s)
skrrt: 3 token(s)
skibidi: 3 token(s)
sigma: 1 token(s)
delulu: 2 token(s)
sus: 1 token(s)
bussin: 3 token(s)
cap: 1 token(s)

GPT-2 Token Counts:
lit: 1 token(s)
fam: 1 token(s)
vibe: 2 token(s)
slay: 2 token(s)
flex: 1 token(s)
skrrt: 3 token(s)
skibidi: 3 token(s)
sigma: 2 token(s)
delulu: 2 token(s)
sus: 2 token(s)
bussin: 3 token(s)
cap: 1 token(s)


In [13]:
# Second vocabulary: trending terms and multi-word phrases, each with a short gloss.
trending_slang_terms = [
    "rizz",  # charm, especially when flirting
    "girl dinner",  # improvised meal assembled from snacks
    "delulu",  # playfully or hopefully delusional
    "beige flag",  # harmless but odd trait in a partner
    "situationship",  # romantic relationship without a label
    "sigma",  # "lone wolf" masculinity archetype
    "main character",  # someone behaving as if they're the star of a movie
    "quiet luxury",  # costly fashion that avoids showiness
    "npc",  # behaving robotically, without emotion
    "no thoughts, head empty",  # relaxed or blissfully oblivious
    "mid",  # mediocre, underwhelming
    "feral",  # wildly excited or enraged
    "canon event",  # unavoidable, formative life event
    "ate and left no crumbs",  # did something flawlessly
    "he's just a guy",  # deflating the idolization of a man
    "loud budgeting",  # openly declining to overspend
    "chronically online",  # disconnected from offline life
    "corecore",  # emotional / aesthetic TikTok trend
    "ick",  # abrupt turn-off while dating
    "male manipulator music",  # music jokingly tied to red-flag behaviour
    "Harvard travel ban",  # NOTE(review): no gloss given; looks like a news phrase, not slang - confirm intent
    "liberalism is a mental disorder",  # NOTE(review): no gloss given; political slogan, not slang - confirm intent
    "NPC energy",  # robotic, emotionless demeanour
    "girlboss",  # empowered woman
]


In [14]:
# Report the per-term token counts of trending_slang_terms, once per tokenizer.
# The second heading begins with a newline so a blank line separates the two reports.
for heading, encoder in (
    ("GPT-4 Token Counts:", gpt4_tokenizer),
    ("\nGPT-2 Token Counts:", gpt2_tokenizer),
):
    print(heading)
    for word, count in zip(trending_slang_terms, count_tokens(trending_slang_terms, encoder)):
        print(f"{word}: {count} token(s)")

# Report which terms the two tokenizers actually split differently.
print("\nDifferences in Tokenization:")
# "sigma" and "delulu" appear in both lists; dict.fromkeys drops the repeats while
# keeping first-seen order, so each term is reported once.
for word in dict.fromkeys(genz_words + trending_slang_terms):
    gpt4_tokens = gpt4_tokenizer.encode(word)
    gpt2_tokens = gpt2_tokenizer.encode(word)
    # Token ids index into two unrelated vocabularies, so comparing the id lists
    # directly says nothing about whether the text was split the same way.
    # Compare the byte pieces each token stands for instead.
    gpt4_pieces = gpt4_tokenizer.decode_tokens_bytes(gpt4_tokens)
    gpt2_pieces = gpt2_tokenizer.decode_tokens_bytes(gpt2_tokens)
    if gpt4_pieces != gpt2_pieces:
        # Counts can be equal here when only the split boundaries differ.
        print(f"{word}: GPT-4: {len(gpt4_tokens)}, GPT-2: {len(gpt2_tokens)}")
    else:
        print(f"{word}: No difference in tokenization")

GPT-4 Token Counts:
rizz: 2 token(s)
girl dinner: 2 token(s)
delulu: 2 token(s)
beige flag: 3 token(s)
situationship: 2 token(s)
sigma: 1 token(s)
main character: 2 token(s)
quiet luxury: 2 token(s)
npc: 1 token(s)
no thoughts, head empty: 5 token(s)
mid: 1 token(s)
feral: 2 token(s)
canon event: 2 token(s)
ate and left no crumbs: 5 token(s)
he's just a guy: 5 token(s)
loud budgeting: 3 token(s)
chronically online: 3 token(s)
corecore: 2 token(s)
ick: 1 token(s)
male manipulator music: 4 token(s)
Harvard travel ban: 4 token(s)
liberalism is a mental disorder: 7 token(s)
NPC energy: 2 token(s)
girlboss: 2 token(s)

GPT-2 Token Counts:
rizz: 2 token(s)
girl dinner: 2 token(s)
delulu: 2 token(s)
beige flag: 3 token(s)
situationship: 3 token(s)
sigma: 2 token(s)
main character: 2 token(s)
quiet luxury: 2 token(s)
npc: 2 token(s)
no thoughts, head empty: 5 token(s)
mid: 1 token(s)
feral: 2 token(s)
canon event: 2 token(s)
ate and left no crumbs: 6 token(s)
he's just a guy: 5 token(s)
loud b