In [None]:
from transformers import pipeline
# Zero-shot classification pipeline backed by the BART-large MNLI checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [21]:
# Class names used for zero-shot labelling. The position of a name in this
# list is used elsewhere in the notebook as its integer class id, so the
# order must not change.
candidate_labels = [
    "animals",
    "animals activity",
    "water",
    "scenery",
    "weather",
    "human activity",
    "language",
    "plants",
    "tools",
]

In [1]:
import pandas as pd
# Word table; the CSV's "Unnamed: 0" column becomes the index.
words_and_embedding = pd.read_csv(
    "words_and_embedding.csv",
    index_col="Unnamed: 0",
)
# One document per row: the text in the "Definitions" column.
docs = list(words_and_embedding.loc[:, "Definitions"])

# Previously computed zero-shot results; keep the top label for each row.
zero_shot_results = pd.read_csv("result_with_probabilities.csv")
labels = list(zero_shot_results.loc[:, "Max_Probability_Label"])

In [None]:
# Encode each row's top zero-shot label as its position in `candidate_labels`.
# The labels are read from `zero_shot_results` rather than from `labels`
# itself, so the cell can be re-run safely: converting `labels` in place would
# fail on a second run because the integers are not in `candidate_labels`.
# The lookup dict also avoids a linear `.index()` scan per row. An unknown
# label raises KeyError.
label_to_id = {name: idx for idx, name in enumerate(candidate_labels)}
labels = [label_to_id[name] for name in zero_shot_results["Max_Probability_Label"]]
# Preview only the first few ids instead of dumping the whole list.
labels[:10]

In [5]:
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction

# Placeholder ("empty") sub-models for the embedding, dimensionality-reduction
# and clustering steps, so that no actual clustering is performed.
empty_embedding_model = BaseEmbedder()
empty_dimensionality_model = BaseDimensionalityReduction()
empty_cluster_model = BaseCluster()

# c-TF-IDF configured to reduce the weight of frequent words.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Fit BERTopic on the documents, passing the integer class ids as `y`.
topic_model1 = BERTopic(
    embedding_model=empty_embedding_model,
    umap_model=empty_dimensionality_model,
    hdbscan_model=empty_cluster_model,
    ctfidf_model=ctfidf_model,
)
topics, probs = topic_model1.fit_transform(docs, y=labels)

In [None]:
# Inspect the table loaded from words_and_embedding.csv.
words_and_embedding

In [5]:
import numpy as np
# Pull columns "0" and "1" out of the word table as a NumPy array.
# NOTE(review): presumably these are precomputed 2-D coordinates per word;
# confirm against whatever produced words_and_embedding.csv.
reduced_embeddings = np.array(words_and_embedding.loc[:, ["0", "1"]])

In [None]:
# Inspect the array built from columns "0" and "1" of the word table.
reduced_embeddings

In [33]:
# Plot the documents for topic_model1, using the coordinates taken from the
# word table instead of letting BERTopic compute its own.
topic_model1.visualize_documents(
    docs,
    reduced_embeddings=reduced_embeddings,
)

In [9]:
# Second model: same setup as topic_model1, but each document's class is its
# second-most-probable zero-shot label, encoded as an index into
# `candidate_labels`.
labels2 = list(zero_shot_results["Second_Max_Probability_Label"])
labels2 = [candidate_labels.index(label) for label in labels2]
topic_model2 = BERTopic(
        embedding_model=empty_embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        # Give this model its own c-TF-IDF transformer (same settings) instead
        # of the `ctfidf_model` instance already fitted by topic_model1, so
        # fitting topic_model2 cannot overwrite topic_model1's fitted state.
        ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True)
)
topics, probs = topic_model2.fit_transform(docs, y=labels2)
topic_model2.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [10]:
# Build {topic id -> class name}. Each key of the topic mapper's mapping is
# used as an index into `candidate_labels`, and each value is matched against
# the "Topic" column below.
mappings = {
    topic_id: candidate_labels[label_id]
    for label_id, topic_id in topic_model1.topic_mapper_.get_mappings().items()
}

# Attach the original class name to every row of the topic overview.
df = topic_model1.get_topic_info()
df["Class"] = df["Topic"].map(mappings)
df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Class
0,0,2325,0_be_on_up_one,"[be, on, up, one, to, over, in, and, do, people]",[to be old to have lived at one place for a lo...,human activity
1,1,540,1_describe_particle_form_emphatic,"[describe, particle, form, emphatic, final, in...","[used with a verb it has a negative effect, us...",language
2,2,454,2_fish_bird_type_small,"[fish, bird, type, small, that, of, trap, dog,...","[type of snake, worm that lives in water and c...",animals activity
3,3,160,3_bamboo_plant_leaf_trees,"[bamboo, plant, leaf, trees, type, tree, edibl...",[rice that grows from grains that have fallen ...,plants
4,4,125,4_animals_animal_carabao_chicken,"[animals, animal, carabao, chicken, pig, meat,...","[leader, boss; full grown animal (possibly fem...",animals
5,5,104,5_water_river_sea_stream,"[water, river, sea, stream, dive, flowing, bot...","[trap for fish made by putting poles in pond, ...",water
6,6,52,6_look_gone_valley_stone,"[look, gone, valley, stone, field, hill, brigh...","[nothing there, completely gone , completely g...",scenery
7,7,44,7_season_rain_hot_wind,"[season, rain, hot, wind, fog, clouds, thunder...","[a shelter from wind and rain, hot dry season,...",weather


In [12]:
# Build {topic id -> class name} for the second model. Each key of the topic
# mapper's mapping is used as an index into `candidate_labels`, and each value
# is matched against the "Topic" column below.
mappings = {
    topic_id: candidate_labels[label_id]
    for label_id, topic_id in topic_model2.topic_mapper_.get_mappings().items()
}

# Attach the original class name to every row of the topic overview.
df = topic_model2.get_topic_info()
df["Class"] = df["Topic"].map(mappings)
df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Class
0,0,1230,0_on_large_of_rice,"[on, large, of, rice, something, the, noise, w...","[a tree with black wood that is very hard, a b...",animals activity
1,1,1209,1_be_do_people_person,"[be, do, people, person, someone, one, many, t...",[type of wine jar which is the most expensive ...,language
2,2,721,2_particle_water_used_describe,"[particle, water, used, describe, for, in, not...",[often precedes numbers seeming to have the id...,human activity
3,3,234,3_bird_fish_dog_snake,"[bird, fish, dog, snake, horse, frog, animal, ...","[type of snake, worm that lives in water and c...",animals
4,4,146,4_water_wash_cup_surfaced,"[water, wash, cup, surfaced, body, particle, s...",[cleansing ceremony to get rid of bad elements...,water
5,5,132,5_house_tree_beautiful_trees,"[house, tree, beautiful, trees, sky, night, ar...",[to describe thick grass dark night when one c...,scenery
6,6,79,6_bamboo_rice_tree_grass,"[bamboo, rice, tree, grass, leaves, wall, eats...","[bee-like insect that in tree or ground, small...",plants
7,7,53,7_cold_rain_afternoon_piece,"[cold, rain, afternoon, piece, dry, roof, moon...","[sickness in stomach as when wet from rain, to...",weather


Run zero-shot classification on the topics rather than on individual words.

In [22]:
from umap import UMAP
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import ZeroShotClassification


# Embed every document once up front and hand the embeddings to BERTopic.
sentence_model = SentenceTransformer("multi-qa-MiniLM-L6-dot-v1")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# UMAP is stochastic: without a fixed `random_state` the topics change on
# every run, so the saved results could not be reproduced. Each topic is then
# labelled by zero-shot classification against `candidate_labels`, with
# `min_prob=0.4` as the probability threshold.
topic_model = BERTopic(
    umap_model=UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="euclidean",
        random_state=42,
    ),
    representation_model=ZeroShotClassification(
        candidate_labels,
        model="facebook/bart-large-mnli",
        min_prob=0.4,
    ),
)
topics, probs = topic_model.fit_transform(docs, embeddings)

In [23]:
# Plot the documents for topic_model, using the coordinates taken from the
# word table.
topic_model.visualize_documents(
    docs,
    reduced_embeddings=reduced_embeddings,
)

In [24]:
from pathlib import Path

# `to_csv` does not create missing parent directories, so make sure
# ./results exists before writing the topic overview.
topics_csv_path = Path("./results/Topics_on_zero_shot.csv")
topics_csv_path.parent.mkdir(parents=True, exist_ok=True)
topic_model.get_topic_info().to_csv(topics_csv_path)