In [2]:
from bertopic import BERTopic
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer

# Zero-shot experiment: the candidate topic labels are supplied up front and
# handed to BERTopic together with a minimum-similarity threshold.
zeroshot_topic_list = [
    "searching knowledge",
    "answer coding problem",
    "summarizing",
    "rephrasing",
    "roleplay",
    "translate",
    "generate content",
]

# CountVectorizer configured with scikit-learn's built-in English stop-word list.
vectorizer_model = CountVectorizer(stop_words="english")

# The documents are the "prompt" column of the dataset's "train_gen" split.
data = load_dataset("HuggingFaceH4/h4_10k_prompts_ranked_gen")
docs = data["train_gen"]["prompt"]

# NOTE(review): no random seed is set anywhere in this cell, so the topic
# table may not reproduce exactly on a re-run -- confirm.
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=0.25,
    min_topic_size=20,
)

# Fit on every prompt; the trailing expression renders the per-topic summary.
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,494,answer coding problem,"[code, answer, number, snippet, given, functio...",[You are given a code snippet that contains a ...
1,1,412,summarizing,"[story, write, essay, use, text, paragraph, wr...",[Develop a detailed writing prompt that challe...
2,2,259,generate content,"[content, generate, create, html, write, websi...",[I Want You To Act As A Content Writer Very Pr...
3,3,253,searching knowledge,"[search, keyword, data, learning, list, ai, ke...",[Please ignore all previous instructions. I wa...
4,4,208,roleplay,"[play, character, team, game, like, players, o...",[ENTRY 2 \n\nVal's Early Life\n\n\*\*\n\nStayi...
...,...,...,...,...,...
61,61,22,53_css_attrx_attry_const,"[css, attrx, attry, const, 20px, marginbottom,...",[can you make a css design for this code\n\nMy...
62,62,21,54_slack_chat_credential_oauth,"[slack, chat, credential, oauth, app, chatbot,...",[I am looking to develop a highly sophisticate...
63,63,21,55_frac_mean_stock_numbers,"[frac, mean, stock, numbers, median, index1, w...","[explain how to calculate mean, mode and media..."
64,64,21,56_god_church_jehovah_leaders,"[god, church, jehovah, leaders, leadership, mi...",[Please summarize the following:\n\nThere was ...


In [1]:
import torch

# Sanity check: report whether PyTorch can use CUDA in this kernel.
torch.cuda.is_available()

True

### Downgrading numpy

NumPy is downgraded to 1.22.4 because BERTopic does not work with seed topics (`seed_topic_list`) otherwise; see:
https://github.com/MaartenGr/BERTopic/issues/1814

In [2]:
from bertopic import BERTopic
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer

import numpy
# Show the NumPy version actually loaded in this kernel.
print(f"Version: {numpy.__version__}")

# CountVectorizer configured with scikit-learn's built-in English stop-word list.
# NOTE(review): some seed words below ("when", "who", "where", "how") look
# like members of that stop-word list, in which case they could never appear
# in a topic representation -- verify this is intended.
vectorizer_model = CountVectorizer(stop_words="english")

# Seeded experiment: each inner list is one group of seed words.
seed_topic_list = [
    ["when", "date", "time"],
    ["who"],
    ["where"],
    ["how"],
    ["rephrase", "reword"],
    ["translate"],
    ["extract"],
    ["code", "coding", "python"],
    ["imagine", "act", "assume", "role"],
]
# Labels from the zero-shot variant of this experiment, kept for reference:
# 'searching knowledge', 'answer coding problem', 'summarizing', 'rephrasing',
# 'roleplay', 'translate', 'generate content'

# The documents are the "prompt" column of the dataset's "train_gen" split.
data = load_dataset("HuggingFaceH4/h4_10k_prompts_ranked_gen")
docs = data["train_gen"]["prompt"]

topic_model = BERTopic(
    min_topic_size=25,
    vectorizer_model=vectorizer_model,
    seed_topic_list=seed_topic_list,
)

# Fit on every prompt; the trailing expression renders the per-topic summary.
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()


Version: 1.22.4


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3679,-1_code_function_write_use,"[code, function, write, use, using, python, sn...","[Web search results:\n\n[1] ""A ""listicle"" is a..."
1,0,314,0_time_date_null_varchar255,"[time, date, null, varchar255, day, pixel, yea...","[Ans: August 23 , 1994\nQues: On August 23, 19..."
2,1,310,1_proxysetheader_1111_english_translate,"[proxysetheader, 1111, english, translate, mea...",[You are currently an online teaching platform...
3,2,298,2_india_router_hibiscus_cache,"[india, router, hibiscus, cache, newwidth, cre...",[Given the text: Malvaceae week :: Hibiscus sa...
4,3,293,3_file_com_glass_extract,"[file, com, glass, extract, shaker, shot, cons...",[rearrange this data in table format that incl...
...,...,...,...,...,...
107,106,10,106_marketing_harvard_injaz_digital,"[marketing, harvard, injaz, digital, months, s...",[Kayvon.AI\n\nKayvon Kay's new AI allows sales...
108,107,10,107_matrix_var_matrix2_consolelogmatrixtologst...,"[matrix, var, matrix2, consolelogmatrixtologst...",[You are working with a codebase that includes...
109,108,10,108_programming_languages_specifcs_developergpt,"[programming, languages, specifcs, developergp...",[What's the hardest programming language to le...
110,109,10,109_encryption_salt_iv_encrypted,"[encryption, salt, iv, encrypted, decryption, ...",[How can I implement AES encryption algorithm ...


In [4]:
# Preview only the first ten rows of the topic summary table.
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3679,-1_code_function_write_use,"[code, function, write, use, using, python, sn...","[Web search results:\n\n[1] ""A ""listicle"" is a..."
1,0,314,0_time_date_null_varchar255,"[time, date, null, varchar255, day, pixel, yea...","[Ans: August 23 , 1994\nQues: On August 23, 19..."
2,1,310,1_proxysetheader_1111_english_translate,"[proxysetheader, 1111, english, translate, mea...",[You are currently an online teaching platform...
3,2,298,2_india_router_hibiscus_cache,"[india, router, hibiscus, cache, newwidth, cre...",[Given the text: Malvaceae week :: Hibiscus sa...
4,3,293,3_file_com_glass_extract,"[file, com, glass, extract, shaker, shot, cons...",[rearrange this data in table format that incl...
5,4,276,4_marketing_business_media_brand,"[marketing, business, media, brand, plan, stra...",[JOB ADVERTISEMENT\nMarketing Manager\nLocatio...
6,5,236,5_mui_said_vs_pm,"[mui, said, vs, pm, gymnasium, hs, monday, 730...",[Given the text: (A) 11/05/16 TBA vs Winston-S...
7,6,225,6_function_integers_numbers_integer,"[function, integers, numbers, integer, array, ...",[You are given a list of integers and are requ...
8,7,216,7_ingredients_recipe_food_cooking,"[ingredients, recipe, food, cooking, dish, flo...",[You will be presented with a document contain...
9,8,200,8_ev_june_ended_2017,"[ev, june, ended, 2017, waze, jones, months, n...",[Adena High School basketball went 6-16 during...
