# Load a dataset with different topics

- we'll split into a "train" and "test"
- we'll drop a category at random from the train
- then we'll use the "test" with +1 category, to see if we can detect its appearance

In [2]:
from datasets import load_dataset

In [3]:
import pandas as pd

In [4]:
ds = load_dataset("community-datasets/yahoo_answers_topics")

README.md:   0%|          | 0.00/5.20k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/21.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1400000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/60000 [00:00<?, ? examples/s]

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 1400000
    })
    test: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 60000
    })
})

In [8]:
df = ds["train"].to_pandas()

In [6]:
df.head(10)

Unnamed: 0,id,topic,question_title,question_content,best_answer
0,0,4,why doesn't an optical mouse work on a glass t...,or even on some surfaces?,Optical mice use an LED and a camera to rapidl...
1,1,5,What is the best off-road motorcycle trail ?,long-distance trail throughout CA,i hear that the mojave road is amazing!<br />\...
2,2,2,What is Trans Fat? How to reduce that?,I heard that tras fat is bad for the body. Wh...,Trans fats occur in manufactured foods during ...
3,3,6,How many planes Fedex has?,I heard that it is the largest airline in the ...,according to the www.fedex.com web site:\nAir ...
4,4,6,"In the san francisco bay area, does it make se...",the prices of rent and the price of buying doe...,renting vs buying depends on your goals. <br /...
5,5,4,What's the best way to clean a keyboard?,I have very small stuff stuck under my keyboar...,"There are commercial kits available, but a can..."
6,6,1,Why do people blush when they are embarrassed?,Why do people blush when they are embarrassed?,from ask yahoo...\nhttp://ask.yahoo.com/ask/20...
7,7,7,"Is Lin Qingxia (aka Brigitte Lin) ""the most be...",This is according to Stephen Chow (http://www....,Well. Everyone has different definition on wh...
8,8,4,"What is the origin of ""foobar""?",I want to know the meaning of the word and how...,"Not sure if this is the origin, but I think it..."
9,9,1,How the human species evolved?,How the human species evolved?,A tough question as it overlaps science and th...


In [5]:
from sklearn.model_selection import train_test_split

In [8]:
df_small, _ = train_test_split(df, test_size=0.9, stratify=df["topic"], random_state=123)

In [9]:
df_small.shape

(140000, 5)

In [10]:
df_small["topic"].value_counts()

topic
6    14000
9    14000
0    14000
3    14000
8    14000
2    14000
1    14000
7    14000
4    14000
5    14000
Name: count, dtype: int64

### Drop all topics = 8

This is to pretend that v1 is "old" dataset, before new data arrives (with a "new" category hidden in it that we have to discover)

- this is the family and relationships category

In [11]:
df_v1 = df_small[df_small["topic"] != 8]

In [12]:
df_v1["topic"].value_counts()

topic
6    14000
9    14000
0    14000
3    14000
2    14000
1    14000
7    14000
4    14000
5    14000
Name: count, dtype: int64

# Visualize embeddings

In [13]:
!pip install sentence-transformers

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.11.0->sentence-transformers)
  Downloading nvid

In [14]:
from sentence_transformers import SentenceTransformer

2025-04-27 19:57:20.312376: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745783840.496978      66 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745783840.555450      66 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [15]:
!pip install umap-learn

Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


In [16]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import umap
import plotly.express as px

In [17]:
model_checkpoint = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_checkpoint)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# 1 min on GPU, about 20 mins on CPU for this dataset

In [18]:
embeddings = model.encode(
    df_v1["question_title"].tolist(),
    convert_to_tensor=True
)

Batches:   0%|          | 0/3938 [00:00<?, ?it/s]

### Next step takes 2 mins or so

In [19]:
# umap
umap_model = umap.UMAP(
    n_neighbors=5,
    n_components=2,
    metric="cosine"
)

# get embeddings
umap_embeddings = umap_model.fit_transform(embeddings.cpu().numpy())

failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


In [21]:
le = LabelEncoder()
labels = le.fit_transform(df_v1["topic"]) # note that this isn't needed since already have labels=ints

In [23]:
!pip install plotly

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [26]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# trying to get Kaggle to plot plotly O_o
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

In [28]:
from IPython.display import display

In [37]:
# plotly DF
plot_df = pd.DataFrame(umap_embeddings, columns=["UMAP 1", "UMAP 2"])
plot_df["Topic"] = df_v1["topic"].astype("category")

fig = px.scatter(plot_df,
                x="UMAP 1",
                y="UMAP 2",
                color="Topic",
                title="UMAP projection of text embeddings",
                labels={"UMAP 1":"UMAP 1", "UMAP 2":"UMAP 2"},
                hover_data=["Topic"],
                color_discrete_sequence=px.colors.qualitative.Set3)

fig.update_layout(
    title_x=0.5,
    plot_bgcolor="white",
    width=1200,
    height=900,
    showlegend=True
)

fig.show(renderer='iframe_connected') # KAGGLE FORUM FINALLY

# SetFit for few label annotation classification

- one idea here is to train a classifier on v1, then inference on samples from v2 i.e. containing a brand new category
- Q: do the samples from new category have low scores on all the existing labels? If so, suggests new class.

In [38]:
df_small.shape

(140000, 5)

In [39]:
df_small["topic"].value_counts()

topic
6    14000
9    14000
0    14000
3    14000
8    14000
2    14000
1    14000
7    14000
4    14000
5    14000
Name: count, dtype: int64

In [40]:
!pip install setfit

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting setfit
  Downloading setfit-1.1.2-py3-none-any.whl.metadata (12 kB)
Collecting evaluate>=0.3.0 (from setfit)
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets>=2.15.0->setfit)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading setfit-1.1.2-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.5/75.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate, setfit
  Attempting uninstall: fsspec
    Found existing insta

In [51]:
from setfit import SetFitModel, Trainer, TrainingArguments

In [42]:
from datasets import Dataset

In [43]:
df_v1.shape

(126000, 5)

### Pretend you annotate only 20 samples of each category

In [44]:
df_setfit = df_v1.groupby("topic").apply(lambda example: example.sample(n=20)).reset_index(drop=True)





In [45]:
df_setfit.shape

(180, 5)

In [46]:
df_setfit["topic"].value_counts()

topic
0    20
1    20
2    20
3    20
4    20
5    20
6    20
7    20
9    20
Name: count, dtype: int64

In [47]:
setfit_ds = Dataset.from_pandas(df_setfit)

In [54]:
setfit_model = SetFitModel.from_pretrained("intfloat/multilingual-e5-large-instruct")

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [55]:
args = TrainingArguments(
    batch_size=64,
    num_epochs=1,
    num_iterations=1, # reduced as was taking long time
)

trainer = Trainer(
    model=setfit_model,
    args=args,
    train_dataset=setfit_ds,
    column_mapping={"question_title":"text", "topic":"label"}
)

import time

start_time = time.time()
trainer.train()
end_time = time.time()

Applying column mapping to the training dataset


Map:   0%|          | 0/180 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 360
  Batch size = 64
  Num epochs = 1


<IPython.core.display.Javascript object>

KeyboardInterrupt: 

In [None]:
#print(end_time-start_time)

# Interrupted, doesn't seem to work anymore

---

# Try BERTopic

In [56]:
!pip install bertopic

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Downloading bertopic-0.17.0-py3-none-any.whl (150 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.6/150.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: hdbscan, bertopic
Successfully installed bertopic-0.17.0 hdbscan-0.8.40


In [67]:
df_topic, _ = train_test_split(df, test_size=0.98, stratify=df["topic"], random_state=123)


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.



In [68]:
df_topic.shape # 28_000 samples

(28000, 5)

In [60]:
from bertopic import BERTopic


The 'shapely.geos' module is deprecated, and will be removed in a future version. All attributes of 'shapely.geos' are available directly from the top-level 'shapely' namespace (since shapely 2.0.0).



In [61]:
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
# remove stopwords
vectorizer_model = CountVectorizer(ngram_range=(1,2), stop_words="english")

In [70]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.05)
hdbscan_model = HDBSCAN(min_cluster_size=80, min_samples=40,
                        gen_min_span_tree=True,
                        prediction_data=True)

In [71]:
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=5,
    language='english',
    calculate_probabilities=True,
    verbose=True
)

In [72]:
topics, probs = topic_model.fit_transform(df_topic["best_answer"])

2025-04-27 21:08:46,246 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/875 [00:00<?, ?it/s]

2025-04-27 21:09:17,211 - BERTopic - Embedding - Completed ✓
2025-04-27 21:09:17,212 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-27 21:09:30,413 - BERTopic - Dimensionality - Completed ✓
2025-04-27 21:09:30,415 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

In [77]:
topic_model.visualize_barchart() #doesnt work in kaggle

In [80]:
topic_model.visualize_heatmap(top_n_topics=30)

In [79]:
topic_model.visualize_hierarchy()


# LLM Approach O_o

- ask model to classify according to existing labels, or other
- if other, ask for suggestion

In [6]:
!pip install outlines

Collecting outlines
  Downloading outlines-0.2.3-py3-none-any.whl.metadata (18 kB)
Collecting interegular (from outlines)
  Downloading interegular-0.3.3-py37-none-any.whl.metadata (3.0 kB)
Collecting lark (from outlines)
  Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)
Collecting diskcache (from outlines)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting iso3166 (from outlines)
  Downloading iso3166-2.1.1-py3-none-any.whl.metadata (6.6 kB)
Collecting airportsdata (from outlines)
  Downloading airportsdata-20250224-py3-none-any.whl.metadata (9.0 kB)
Collecting outlines_core==0.1.26 (from outlines)
  Downloading outlines_core-0.1.26-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting genson (from outlines)
  Downloading genson-1.3.0-py3-none-any.whl.metadata (28 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->outlines)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 k

In [9]:
df_llm, _ = train_test_split(df, test_size=0.9995, stratify=df["topic"], random_state=123)

In [10]:
df_llm.shape

(700, 5)

In [11]:
ALL_CATEGORIES = {
    0:"society and culture",
    1:"science and mathematics",
    2:"health",
    3:"education and reference",
    4:"computers and internet",
    5:"sports",
    6:"business and finance",
    7:"entertainment and music",
    8:"family and relationships",
    9:"politics and government"
}

In [12]:
# pretend that we dont know that SPORTS exist, so drop from categories
CATEGORIES_NO_SPORTS = [v for k,v in ALL_CATEGORIES.items() if v!="sports"]

CATEGORIES_NO_SPORTS

['society and culture',
 'science and mathematics',
 'health',
 'education and reference',
 'computers and internet',
 'business and finance',
 'entertainment and music',
 'family and relationships',
 'politics and government']

In [13]:
# add "OTHER" to categories
CATEGORIES_NO_SPORTS.append("other")

In [14]:
from enum import Enum
from typing import Literal
from pydantic import BaseModel, constr

import outlines

In [15]:
class PredictedCategory(BaseModel):
    predicted_label: Literal[tuple(CATEGORIES_NO_SPORTS)]
    description_if_other: str

In [16]:
llm_model = outlines.models.transformers("Qwen/Qwen2.5-3B-Instruct")

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

2025-04-27 21:32:09.727417: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745789529.911913      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745789529.964828      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [17]:
generator = outlines.generate.json(llm_model, PredictedCategory)

In [21]:
prompt_template = """Classify the following sentence into one of the allowed categories below.

If none of the allowed categories is a good description, use the \'other\' option and - in that case only - provide a short label that you think accurately classifies the sentence.

# Sentence to classify

{input_query}

# Allowed categories

- society and culture
- science and mathematics
- health
- education and reference
- computers and internet
- business and finance
- entertainment and music
- family and relationships
- politics and government
- other
"""

In [22]:
example_query = "How do I get updates on my stock portfolio??"
test_classif_1 = generator(prompt_template.format(input_query=example_query))

In [23]:
test_classif_1

PredictedCategory(predicted_label='other', description_if_other='The sentence is seeking information related to finance, but the available options do not include it.')