In [2]:
from bertopic import BERTopic
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer

# Predefined labels handed to BERTopic as zero-shot topics; in the resulting
# topic table these labels appear as topic names next to automatically
# named topics.
zeroshot_topic_list = [
    "searching knowledge",
    "answer coding problem",
    "summarizing",
    "rephrasing",
    "roleplay",
    "translate",
    "generate content",
]

# Vectorizer passed to BERTopic; configured with the built-in English
# stop-word list.
vectorizer_model = CountVectorizer(stop_words="english")

# Download the prompt dataset and use the "prompt" column of its
# "train_gen" split as the documents to model.
data = load_dataset("HuggingFaceH4/h4_10k_prompts_ranked_gen")
train_gen_split = data["train_gen"]
docs = train_gen_split["prompt"]

topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=0.25,
    min_topic_size=20,
)

# Fit on all prompts, then display the per-topic summary as the cell output.
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,494,answer coding problem,"[code, answer, number, snippet, given, functio...",[You are given a code snippet that contains a ...
1,1,412,summarizing,"[story, write, essay, use, text, paragraph, wr...",[Develop a detailed writing prompt that challe...
2,2,259,generate content,"[content, generate, create, html, write, websi...",[I Want You To Act As A Content Writer Very Pr...
3,3,253,searching knowledge,"[search, keyword, data, learning, list, ai, ke...",[Please ignore all previous instructions. I wa...
4,4,208,roleplay,"[play, character, team, game, like, players, o...",[ENTRY 2 \n\nVal's Early Life\n\n\*\*\n\nStayi...
...,...,...,...,...,...
61,61,22,53_css_attrx_attry_const,"[css, attrx, attry, const, 20px, marginbottom,...",[can you make a css design for this code\n\nMy...
62,62,21,54_slack_chat_credential_oauth,"[slack, chat, credential, oauth, app, chatbot,...",[I am looking to develop a highly sophisticate...
63,63,21,55_frac_mean_stock_numbers,"[frac, mean, stock, numbers, median, index1, w...","[explain how to calculate mean, mode and media..."
64,64,21,56_god_church_jehovah_leaders,"[god, church, jehovah, leaders, leadership, mi...",[Please summarize the following:\n\nThere was ...


In [1]:
import torch

# Ask PyTorch whether a CUDA device is usable; the boolean is shown as the
# cell output.
torch.cuda.is_available()

True

### Downgrading numpy

numpy is downgraded to 1.22.4 because BERTopic's seed topics (`seed_topic_list`) do not work otherwise. See:
https://github.com/MaartenGr/BERTopic/issues/1814

In [5]:
from bertopic import BERTopic
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer

import numpy

# Print the numpy version the kernel is actually using (the markdown note
# before this cell explains why 1.22.4 is expected).
print(f"Version: {numpy.__version__}")

# Each inner list is one group of seed words passed to BERTopic through
# `seed_topic_list`.
seed_topic_list = [
    ["when", "date", "time"],
    ["who"],
    ["where"],
    ["how"],
    ["rephrase", "reword"],
    ["translate"],
    ["extract"],
    ["code", "coding", "python"],
    ["imagine", "act", "assume", "role"],
]
# For reference, the labels used in this notebook's zero-shot cell are:
# "searching knowledge", "answer coding problem", "summarizing",
# "rephrasing", "roleplay", "translate", "generate content".

# Vectorizer passed to BERTopic; configured with the built-in English
# stop-word list.
vectorizer_model = CountVectorizer(stop_words="english")

# Download the prompt dataset and use the "prompt" column of its
# "train_gen" split as the documents to model.
data = load_dataset("HuggingFaceH4/h4_10k_prompts_ranked_gen")
train_gen_split = data["train_gen"]
docs = train_gen_split["prompt"]

topic_model = BERTopic(
    min_topic_size=25,
    vectorizer_model=vectorizer_model,
    seed_topic_list=seed_topic_list,
)

# Fit on all prompts, then display the per-topic summary as the cell output.
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()

Version: 1.22.4


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3239,-1_use_like_provide_write,"[use, like, provide, write, using, data, creat...","[Web search results:\n\n[1] ""A ""listicle"" is a..."
1,0,1244,0_code_function_python_snippet,"[code, function, python, snippet, file, input,...",[You are given a code snippet that aims to dis...
2,1,314,1_proxysetheader_english_1111_language,"[proxysetheader, english, 1111, language, mean...","[what is English language, $LANGUAGE = French\..."
3,2,312,2_time_date_null_day,"[time, date, null, day, varchar255, answer, ye...",[Student A:Passage: Rainfall is the most commo...
4,3,307,3_file_com_glass_extract,"[file, com, glass, extract, shaker, const, sho...",[rearrange this data in table format that incl...
5,4,287,4_india_router_hibiscus_cache,"[india, router, hibiscus, cache, credit, newwi...",[Here is a piece of text: Age-related macular ...
6,5,256,5_story_character_characters_write,"[story, character, characters, write, plot, bo...","[tell me a short story, Please provided a brie..."
7,6,250,6_marketing_business_brand_media,"[marketing, business, brand, media, plan, stra...",[Given the text: Are you a Consumer Products G...
8,7,243,7_said_mui_vs_pm,"[said, mui, vs, pm, did, gymnasium, monday, hs...",[A suspect has died of an apparent self-inflic...
9,8,238,8_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Research the benefits and drawbacks of nuclea...


In [11]:
# Display only the first ten rows of the topic summary table.
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3239,-1_use_like_provide_write,"[use, like, provide, write, using, data, creat...","[Web search results:\n\n[1] ""A ""listicle"" is a..."
1,0,1244,0_code_function_python_snippet,"[code, function, python, snippet, file, input,...",[You are given a code snippet that aims to dis...
2,1,314,1_proxysetheader_english_1111_language,"[proxysetheader, english, 1111, language, mean...","[what is English language, $LANGUAGE = French\..."
3,2,312,2_time_date_null_day,"[time, date, null, day, varchar255, answer, ye...",[Student A:Passage: Rainfall is the most commo...
4,3,307,3_file_com_glass_extract,"[file, com, glass, extract, shaker, const, sho...",[rearrange this data in table format that incl...
5,4,287,4_india_router_hibiscus_cache,"[india, router, hibiscus, cache, credit, newwi...",[Here is a piece of text: Age-related macular ...
6,5,256,5_story_character_characters_write,"[story, character, characters, write, plot, bo...","[tell me a short story, Please provided a brie..."
7,6,250,6_marketing_business_brand_media,"[marketing, business, brand, media, plan, stra...",[Given the text: Are you a Consumer Products G...
8,7,243,7_said_mui_vs_pm,"[said, mui, vs, pm, did, gymnasium, monday, hs...",[A suspect has died of an apparent self-inflic...
9,8,238,8_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Research the benefits and drawbacks of nuclea...


In [10]:
# Display the representative documents stored on the fitted model; the
# output is a dict keyed by topic id (its first key here is -1).
# NOTE(review): this prints full prompt texts for every topic, so the cell
# output is very large — consider showing a subset before sharing.
topic_model.get_representative_docs()

{-1: ['Web search results:\n\n[1] "A "listicle" is an article made of a list, usually with some kind of extra detail to each item. The format, however, is flexible. ... (a heavily sourced list of information) Reported list (more of an article turned into a list to make it scannable) Editorial-turned-list (a list being used to argue a specific point) ..."\nURL: https://www.process.st/listicle/\n\n[2] "For more information, see Create a column in a list or library. Views Change how a list is displayed by creating views. For more information, see Create, change, or delete a view of a list or library. Folders Add a subfolder to a list. For more information, see Create a folder in a list."\nURL: https://support.microsoft.com/en-us/office/create-a-list-0d397414-d95f-41eb-addd-5e6eff41b083\n\n[3] "Lists. Just as heading structure alerts readers to the order of ideas in a paper, lists help readers understand a related set of key points within a sentence or paragraph. When writing a list, ensur