In [2]:
from bertopic import BERTopic
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer

# Zero-shot topic modelling over the H4 prompt collection.
# Only the raw prompt text of the "train_gen" split is used as the corpus.
data = load_dataset("HuggingFaceH4/h4_10k_prompts_ranked_gen")
docs = data["train_gen"]["prompt"]

# Vectorizer handed to BERTopic; configured with scikit-learn's English stop-word list.
vectorizer_model = CountVectorizer(stop_words="english")

# Predefined intent labels offered to the model as zero-shot topics.
zeroshot_topic_list = [
    "searching knowledge", "answer coding problem",
    "summarizing", "rephrasing",
    "roleplay", "translate",
    "generate content",
]

topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=0.25,
    min_topic_size=20,
)

# Fit on the prompts, then display the per-topic overview table.
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,494,answer coding problem,"[code, answer, number, snippet, given, functio...",[You are given a code snippet that contains a ...
1,1,412,summarizing,"[story, write, essay, use, text, paragraph, wr...",[Develop a detailed writing prompt that challe...
2,2,259,generate content,"[content, generate, create, html, write, websi...",[I Want You To Act As A Content Writer Very Pr...
3,3,253,searching knowledge,"[search, keyword, data, learning, list, ai, ke...",[Please ignore all previous instructions. I wa...
4,4,208,roleplay,"[play, character, team, game, like, players, o...",[ENTRY 2 \n\nVal's Early Life\n\n\*\*\n\nStayi...
...,...,...,...,...,...
59,59,23,51_python_gui_turtle_tkinter,"[python, gui, turtle, tkinter, mainloop, appli...","[Web search results:\n\n[1] ""A tuple of three ..."
60,60,22,52_css_attry_attrx_const,"[css, attry, attrx, const, 20px, marginbottom,...",[I'd like to provide some context on a program...
61,61,20,53_time_habits_tweet_ruby,"[time, habits, tweet, ruby, budget, management...","[As a programmer, I often struggle with keepin..."
62,62,20,54_osi_networks_tcpip_area,"[osi, networks, tcpip, area, network, differen...","[If there is a 4TB hard drive that costs $125,..."


In [3]:
import torch

# Report whether PyTorch can see a CUDA device from this kernel
# (the recorded run returned True).
torch.cuda.is_available()

True

### Downgrading numpy

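NumPy was downgraded to 1.22.4 because BERTopic's seeded topic modeling (`seed_topic_list`) did not work with the previously installed NumPy version.
See https://github.com/MaartenGr/BERTopic/issues/1814 for details.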

In [4]:
from bertopic import BERTopic
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer

import numpy

# Confirm which NumPy build the kernel actually picked up.
print(f"Version: {numpy.__version__}")

# Same corpus as the zero-shot run: raw prompt text of the "train_gen" split.
data = load_dataset("HuggingFaceH4/h4_10k_prompts_ranked_gen")
docs = data["train_gen"]["prompt"]

# Seed words, one inner list per intended topic.
# For comparison, the zero-shot labels tried earlier were:
# 'searching knowledge', 'answer coding problem', 'summarizing', 'rephrasing',
# 'roleplay', 'translate', 'generate content'
seed_topic_list = [
    ["when", "date", "time"],
    ["who"], ["where"], ["how"],
    ["rephrase", "reword"],
    ["translate"], ["extract"],
    ["code", "coding", "python"],
    ["imagine", "act", "assume", "role"],
]

# Vectorizer handed to BERTopic; configured with scikit-learn's English stop-word list.
vectorizer_model = CountVectorizer(stop_words="english")

topic_model = BERTopic(
    min_topic_size=25,
    vectorizer_model=vectorizer_model,
    seed_topic_list=seed_topic_list,
)

# Fit on the prompts, then display the per-topic overview table.
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()

Version: 1.22.4


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3434,-1_use_write_provide_data,"[use, write, provide, data, using, new, create...",[Imagine you are working for a company that ai...
1,0,1160,0_code_function_python_snippet,"[code, function, python, snippet, file, input,...",[You are given a code snippet that prints the ...
2,1,318,1_english_1111_translate_language,"[english, 1111, translate, language, meaning, ...","[what is English language, $LANGUAGE = French\..."
3,2,306,2_time_date_null_day,"[time, date, null, day, answer, varchar255, ye...",[You have been assigned to implement a custom ...
4,3,292,3_india_router_hibiscus_cache,"[india, router, hibiscus, cache, credit, newwi...",[Given the text: Malvaceae week :: Hibiscus sa...
5,4,262,4_marketing_business_brand_media,"[marketing, business, brand, media, market, pl...","[I want you to write a name for this prompt: ""..."
6,5,237,5_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Rank the following energy sources according t...
7,6,235,6_mui_said_monday_did,"[mui, said, monday, did, black, mahuika, won, ...",[Generate response to the question/instruction...
8,7,235,7_story_character_characters_write,"[story, character, characters, write, plot, bo...","[tell me a short story, write a short story, P..."
9,8,202,8_ev_2017_june_ended,"[ev, 2017, june, ended, waze, months, city, jo...",[Create a perfect title headline about these a...


In [5]:
# First ten rows of the topic overview table.
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3434,-1_use_write_provide_data,"[use, write, provide, data, using, new, create...",[Imagine you are working for a company that ai...
1,0,1160,0_code_function_python_snippet,"[code, function, python, snippet, file, input,...",[You are given a code snippet that prints the ...
2,1,318,1_english_1111_translate_language,"[english, 1111, translate, language, meaning, ...","[what is English language, $LANGUAGE = French\..."
3,2,306,2_time_date_null_day,"[time, date, null, day, answer, varchar255, ye...",[You have been assigned to implement a custom ...
4,3,292,3_india_router_hibiscus_cache,"[india, router, hibiscus, cache, credit, newwi...",[Given the text: Malvaceae week :: Hibiscus sa...
5,4,262,4_marketing_business_brand_media,"[marketing, business, brand, media, market, pl...","[I want you to write a name for this prompt: ""..."
6,5,237,5_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Rank the following energy sources according t...
7,6,235,6_mui_said_monday_did,"[mui, said, monday, did, black, mahuika, won, ...",[Generate response to the question/instruction...
8,7,235,7_story_character_characters_write,"[story, character, characters, write, plot, bo...","[tell me a short story, write a short story, P..."
9,8,202,8_ev_2017_june_ended,"[ev, 2017, june, ended, waze, months, city, jo...",[Create a perfect title headline about these a...


In [6]:
# Representative documents for the fitted topics; the recorded output is a
# dict whose first key is -1, mapping to a list of full prompt texts.
# NOTE(review): the full dump is very long — consider requesting a single
# topic to keep the output readable; confirm against the BERTopic API.
topic_model.get_representative_docs()

{-1: ["Imagine you are working for a company that aims to reduce its carbon footprint. Your task is to develop a Java program that can analyze the company's energy consumption patterns and recommend strategies to optimize resource usage. The program should also be able to predict the company's future energy needs and suggest ways to meet those needs sustainably.\nTo accomplish this, you will need to utilize Java's data processing capabilities and machine learning techniques. Your program should be able to take in data on the company's energy consumption, such as electricity and gas usage, and analyze patterns over time. It should then be able to use this data to make predictions about future energy needs and suggest ways to reduce consumption.\nSome possible strategies that your program could recommend include using renewable energy sources, optimizing building insulation and heating systems, and reducing unnecessary energy usage during non-peak hours. The program should also take into

In [10]:
# Per-document topic assignments for the whole corpus ...
doc_info_df = topic_model.get_document_info(docs)
# ... narrowed to the prompts that landed in topic 5.
doc_info_df.query("Topic == 5")

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
2,Seismologists: How do you determine whether an...,5,5_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Rank the following energy sources according t...,energy - climate - species - impact - renewabl...,0.946019,False
62,Write a detailed editorial that discusses the ...,5,5_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Rank the following energy sources according t...,energy - climate - species - impact - renewabl...,0.807697,False
64,Imagine you're a scientist studying the lesser...,5,5_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Rank the following energy sources according t...,energy - climate - species - impact - renewabl...,0.787190,False
75,"Web search results:\n\n[1] ""Firstly, the indus...",5,5_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Rank the following energy sources according t...,energy - climate - species - impact - renewabl...,0.968005,False
130,What is the impact of artificial light polluti...,5,5_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Rank the following energy sources according t...,energy - climate - species - impact - renewabl...,0.961912,False
...,...,...,...,...,...,...,...,...
9608,Can you discuss the role of agroforestry and s...,5,5_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Rank the following energy sources according t...,energy - climate - species - impact - renewabl...,0.997100,False
9653,Can you design a comprehensive and sustainable...,5,5_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Rank the following energy sources according t...,energy - climate - species - impact - renewabl...,1.000000,False
9693,How have the infrastructure and economy of New...,5,5_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Rank the following energy sources according t...,energy - climate - species - impact - renewabl...,0.749648,False
9756,1. Definition of waste management and its impo...,5,5_energy_climate_species_impact,"[energy, climate, species, impact, renewable, ...",[Rank the following energy sources according t...,energy - climate - species - impact - renewabl...,0.899819,False


In [None]:
# Render BERTopic's built-in topic visualization for the fitted model.
# NOTE(review): no output is recorded for this cell (it was not executed).
topic_model.visualize_topics()

In [None]:
# Render BERTopic's built-in document visualization for the prompts in `docs`.
# NOTE(review): no output is recorded for this cell (it was not executed).
topic_model.visualize_documents(docs)

In [None]:
# Approximate topic distributions for every prompt in `docs`.
# calculate_tokens=True is passed so that the second return value,
# topic_token_distr, is available; it is indexed per document when a single
# prompt is visualized further down.
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)

In [None]:
# Visualize the token-level topic distribution for a single prompt.

# Position in `docs` of the prompt to inspect.
idx = 105
print(f"docs:\n{docs[idx]}")
# Pair the prompt text with its own token-level distribution; the recorded
# output is a table with one column per token and rows labelled by topic name.
df = topic_model.visualize_approximate_distribution(docs[idx], topic_token_distr[idx])
df

docs:
pretend you are a rabbi and write a sermon on the mussar value of humility with an introductory amusing story taken from jewish text


Unnamed: 0,pretend,you,are,rabbi,and,write,sermon,on,the,mussar,value,of,humility,with,an,introductory,amusing,story,taken,from,jewish,text
8_story_character_write_characters,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141,0.273,0.433,0.618,0.477,0.344,0.185,0.0
