# Label and describe using an LLM

Download and instantiate an LLM from Huggingface.

Load the LDA topic models. 

Prompt the LLM to generate a label and a description for each topic in the models.

In [31]:
import string
import re
import gc
import os

import pandas as pd
import pickle
from transformers import pipeline
import nltk
# Download the NLTK 'punkt' package (NLTK reports it as up to date when already present).
nltk.download('punkt')

# Do not truncate long cell text when DataFrames are displayed, so full labels are visible.
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/kobv/atroncos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Load the topic models fitted in a previous notebook.

* lda_gw: Gravitational Waves topics
* lda_cscl: Computation and Language topics

In [2]:
# NOTE: pickle.load can execute arbitrary code; only load model files produced by this project.

# Gravitational waves: plain LDA model
with open('../models/lda_gw.pickle', 'rb') as model_file:
    lda_gw = pickle.load(model_file)

# Gravitational waves: ensemble LDA model
with open('../models/ensemble_gw.pickle', 'rb') as model_file:
    ensemble_gw = pickle.load(model_file)


In [3]:
# NOTE: pickle.load can execute arbitrary code; only load model files produced by this project.

# Computation & Language: plain LDA model
with open('../models/lda_cscl.pickle', 'rb') as model_file:
    lda_cscl = pickle.load(model_file)

# Computation & Language: ensemble LDA model
with open('../models/ensemble_cscl.pickle', 'rb') as model_file:
    ensemble_cscl = pickle.load(model_file)

Get a list of all topics in each model, with every topic described by its MAX_WORDS most relevant words.

* The result is a list of topics. Each topic is represented by a tuple.
* The first element of the tuple is a topic number (int).
* The second element of the tuple is a list of (word, probability) tuples.
* Each of these inner tuples holds a word characterising the topic (string) and its corresponding probability (float).

In [4]:
# Number of words used to characterise each topic.
MAX_WORDS = 30

# The expected format for the topics list is:
# list[tuple<int, list[tuple<string, float>]>]

# A Gensim LDA model. num_topics=-1 requests every topic, matching the ensemble
# call below; the default (10) would silently drop topics for models that have
# more than 10 of them.
topics_gw = lda_gw.show_topics(num_topics=-1, num_words=MAX_WORDS, formatted=False)

# An ensemble LDA model has to be converted to a Gensim LDA model first.
topics_ensemble_gw = ensemble_gw.generate_gensim_representation().show_topics(
    num_topics=-1, num_words=MAX_WORDS, formatted=False
)

In [5]:
# A Gensim LDA model. num_topics=-1 requests every topic, matching the ensemble
# call below; the default (10) would silently drop topics for models that have
# more than 10 of them.
topics_cscl = lda_cscl.show_topics(num_topics=-1, num_words=MAX_WORDS, formatted=False)

# An ensemble LDA model has to be converted to a Gensim LDA model first.
topics_ensemble_cscl = ensemble_cscl.generate_gensim_representation().show_topics(
    num_topics=-1, num_words=MAX_WORDS, formatted=False
)

### Using Llama gated model

1. Go to huggingface, login, go to `settings/access tokens` 
2. Create a new READ token, save it to ../token.txt
3. Go here: https://huggingface.co/meta-llama/Meta-Llama-3.1-8B and accept the usage conditions

In [6]:
from huggingface_hub import login

# Read the Hugging Face access token from a local file (do not commit this file).
# strip() removes surrounding whitespace such as the trailing newline that most
# editors append, which would otherwise become part of the token passed to login().
with open('../token.txt', 'r') as handle:
    token = handle.read().strip()
login(token=token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/kobv/atroncos/.cache/huggingface/token
Login successful


In [7]:
def get_topic_str(topic, max=None):
    """Return the terms describing a topic as a comma-separated string.

    topic: tuple<int, list[tuple<string, float>]>, i.e. (topic id, [(term, probability), ...])
    max: number of leading terms to include; None (the default) includes all terms
    returns: the selected terms joined by ', '
    """
    # Slicing with None keeps every term, so no branching is needed. Unlike a
    # truthiness check, this also honours max=0 (empty result).
    return ', '.join(term[0] for term in topic[1][:max])

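A wrapper class for an instruction-tuned LLM. The class defined below wraps Mistral (`mistralai/Mistral-7B-Instruct-v0.2`); the Llama model referenced above is https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct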

In [20]:
import transformers
import torch
from abc import ABC, abstractmethod

class Promptable(ABC):
    """Interface for LLM wrappers that can label and describe a topic."""

    @abstractmethod
    def predict_label(self, category, terms):
        """Return a short label for the topic characterised by `terms`."""

    @abstractmethod
    def predict_description(self, category, terms):
        """Return a description of the topic characterised by `terms`."""


class Mistral(Promptable):
    """Promptable backed by a Hugging Face text-generation pipeline.

    The pipeline is created on entering a `with` block and dropped on leaving it:

        with Mistral() as model:
            label = model.predict_label(category, terms)
    """

    def __init__(self):
        self.model_id="mistralai/Mistral-7B-Instruct-v0.2"
        # Set in __enter__; None while the model is not loaded.
        self.pipeline = None

    def _generate(self, messages, max_new_tokens):
        """Send chat `messages` through the pipeline and return the reply text.

        Raises RuntimeError when the pipeline has not been created yet.
        """
        if getattr(self, "pipeline", None) is None:
            raise RuntimeError(
                "The model is not loaded; use it inside `with Mistral() as model:`"
            )
        outputs = self.pipeline(
            messages,
            max_new_tokens=max_new_tokens,
        )
        # The last message of the returned conversation is the generated reply.
        return outputs[0]["generated_text"][-1]['content']

    def predict_label(self, category, terms):
        """Ask the model for a short label and return it cleaned and title-cased.

        category: subject area used in the system prompt
        terms: the topic terms, as a single string
        returns: the text before the first '.' or ':' of the reply, stripped and title-cased
        """
        messages = [
            {"role": "system", "content": f"You are an AI assistant specialised in {category}"},
            {"role": "user", "content": f"What short, concise and human-readable label best describes the topic characterised by these terms: {terms}? Output only the label"},
        ]
        resp = self._generate(messages, max_new_tokens=25)
        resp = re.sub(r'[\n]+', ' ', resp)  # replace newlines with spaces
        # remove everything except letters, digits, space, hyphen, dot, colon and parentheses
        resp = re.sub(r'[^A-Za-z0-9 \-\.:()]+', '', resp)
        # split at the first dot or colon, keep what precedes it (raw string: '\.' is not a valid str escape)
        resp = re.split(r'\.|:', resp)[0]
        return resp.strip().title()

    def predict_description(self, category, terms):
        """Ask the model to describe the topic and return its reply unmodified.

        category: subject area used in the system prompt
        terms: the topic terms, as a single string
        """
        messages = [
            {"role": "system", "content": f"You are an AI assistant specialised in {category}"},
            {"role": "user", "content": f"What best describes the topic characterised by these terms: {terms}?"},
        ]
        return self._generate(messages, max_new_tokens=256)

    def __enter__(self):
        """Create the text-generation pipeline and return this wrapper."""
        # Sampling is enabled, so repeated runs may produce different outputs.
        self.pipeline = transformers.pipeline(
            "text-generation",
            model=self.model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
            temperature=0.1,
            do_sample=True
        )
        return self

    def __exit__(self, *args):
        """Drop the pipeline and ask Python and CUDA to release its memory."""
        # Reset instead of `del` so the attribute always exists and exiting twice is safe.
        self.pipeline = None
        # release GPU memory
        gc.collect()  # explicitly call garbage collector
        torch.cuda.empty_cache()


A function to create labels for all topics

In [21]:
def predict_topic_labels(category, topics, model, include_descriptions=False):
    """
    Predict a label (and optionally a description) for each topic in a list.

    category: an arxiv category, see https://arxiv.org/category_taxonomy, e.g. "General Relativity and Quantum Cosmology"
    topics: topics in an LDA model, obtained through lda_XX.show_topics(num_words=MAX_WORDS, formatted=False)
    model: Object implementing abstract class "Promptable"
    include_descriptions: if True, also call model.predict_description for every topic
        and add a 'Description' column (default: False, labels only)
    returns: DataFrame with one row per topic and the columns 'Topic' (topic id),
        'First 5 keywords' and 'Label', plus 'Description' when requested
    """
    columns = ['Topic', 'First 5 keywords', 'Label']
    if include_descriptions:
        columns.append('Description')

    records = []
    total = len(topics)
    # Count from 1 so the last progress line reads "total / total".
    for position, topic in enumerate(topics, start=1):
        print(f"Processing topic {position} / {total}")
        terms = get_topic_str(topic)  # all keywords, as string

        record = {
            'Topic': topic[0],  # numeric topic id
            'First 5 keywords': get_topic_str(topic, 5),
            'Label': model.predict_label(category, terms),
        }
        if include_descriptions:
            record['Description'] = model.predict_description(category, terms)
        records.append(record)

    # Passing `columns` keeps the column layout stable even for an empty topic list.
    return pd.DataFrame(records, columns=columns)

### Topics for Gravitational Waves LDA

In [22]:
%%time

# The model is loaded on entering the `with` block and released again on leaving it.
with Mistral() as model:
    topics_gw_df = predict_topic_labels("General Relativity and Quantum Cosmology", topics_gw, model)

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.74it/s]
Some parameters are on the meta device device because they were offloaded to the cpu.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 0 / 4


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 1 / 4


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 2 / 4


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 3 / 4
CPU times: user 2min 9s, sys: 3.08 s, total: 2min 12s
Wall time: 2min 12s


In [23]:
# Display the labels generated for the gravitational-waves LDA topics.
topics_gw_df

Unnamed: 0,Topic,First 5 keywords,Label
0,0,"detector, signal, data, noise, frequency",Advanced Gravitational Wave Detection
1,1,"binary, hole, mass, black, star",Black Hole Binary Merger
2,2,"model, spectrum, energy, dark, background",Cosmological Dark Matter And Inflationary Power Spectrum (Cdm-Ips) Explanation
3,3,"field, theory, mode, gravity, equation",General Relativity And Quantum Gravity


### Topics for Gravitational Waves ensemble LDA

In [20]:
%%time

# NOTE(review): this rebinds `topics_ensemble_gw` from the raw topic list to the
# labelled DataFrame. Re-running this cell without first re-running the cell that
# builds the topic list would pass a DataFrame as `topics`. Consider a separate
# `*_df` name, as done for the non-ensemble models.
with Mistral() as model:
    topics_ensemble_gw = predict_topic_labels("General Relativity and Quantum Cosmology", topics_ensemble_gw, model)

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.87it/s]
Some parameters are on the meta device device because they were offloaded to the cpu.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 0 / 12


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 1 / 12


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 2 / 12


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 3 / 12


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 4 / 12


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 5 / 12


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 6 / 12


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 7 / 12


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 8 / 12


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 9 / 12


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 10 / 12


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 11 / 12
CPU times: user 6min 25s, sys: 6.72 s, total: 6min 31s
Wall time: 6min 32s


In [21]:
# Display the labelled ensemble topics ordered by topic id
# (`topics_ensemble_gw` is the DataFrame returned by predict_topic_labels at this point).
topics_ensemble_gw.sort_values(by='Topic')

Unnamed: 0,Topic,First 5 keywords,Label
0,0,"hole, black, binary, mass, spin",Black Hole Binaries
1,1,"search, signal, detector, data, ligo",Gravitational Wave Detection With Ligo-Virgo Network
2,2,"pulsar, timing, array, noise, data",Pulsar Timing Array Pta Analysis
3,3,"star, neutron, merger, mass, binary",Compact Star Mergers
4,4,"mode, star, instability, frequency, neutron",Rotating Neutron Star Instability
5,5,"ray, gamma, burst, energy, emission",Short-Duration Gamma-Ray Burst Grb Source
6,6,"binary, source, distance, parameter, mass",Binary Black Hole Merger
7,7,"theory, gravity, general, scalar, field",Modified Scalar-Tensor Theory Of Gravity
8,8,"model, dark, spectrum, inflation, matter",Inflationary Dark Matter-Energy Spectrum Idmesexplanation
9,9,"transition, phase, model, order, electroweak",Electroweak Phase Transition In The Early Universe


### Topics Computing & Language LDA

In [24]:
%%time

# The model is loaded on entering the `with` block and released again on leaving it.
with Mistral() as model:
    topics_cscl_df = predict_topic_labels("Computation and Language", topics_cscl, model)

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.71it/s]
Some parameters are on the meta device device because they were offloaded to the cpu.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 0 / 6


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 1 / 6


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 2 / 6


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 3 / 6


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 4 / 6


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 5 / 6
CPU times: user 3min 14s, sys: 4.2 s, total: 3min 18s
Wall time: 3min 18s


In [25]:
# Display the labels generated for the Computation & Language LDA topics.
topics_cscl_df

Unnamed: 0,Topic,First 5 keywords,Label
0,0,"data, research, user, analysis, text",Data-Driven Text Analysis For Social Media
1,1,"task, data, training, performance, learning",Machine Learning Model Training And Performance
2,2,"question, llm, human, knowledge, task",Natural Language Question Answering System (Nlqas) With Large Datasets And Human-Like Performance
3,3,"translation, speech, english, data, machine",Multilingual Neural Machine Translation (Speech-To-Text And Text-To-Text) With Error Cor
4,4,"word, based, method, sentence, representation",Semantic Sentence Representation Learning
5,5,"image, text, speech, visual, feature",Multimodal Computation And Language Processing (Mclp) Explanation


### Topics Computing & Language ensemble LDA

In [26]:
%%time

# NOTE(review): this rebinds `topics_ensemble_cscl` from the raw topic list to the
# labelled DataFrame. Re-running this cell without first re-running the cell that
# builds the topic list would pass a DataFrame as `topics`. Consider a separate
# `*_df` name, as done for the non-ensemble models.
with Mistral() as model:
    topics_ensemble_cscl = predict_topic_labels("Computation and Language", topics_ensemble_cscl, model)

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.90it/s]
Some parameters are on the meta device device because they were offloaded to the cpu.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 0 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 1 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 2 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 3 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 4 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 5 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 6 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 7 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 8 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 9 / 15


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 10 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 11 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 12 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 13 / 15


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing topic 14 / 15
CPU times: user 7min 46s, sys: 8.16 s, total: 7min 55s
Wall time: 7min 55s


In [27]:
# Display the labelled ensemble topics ordered by topic id
# (`topics_ensemble_cscl` is the DataFrame returned by predict_topic_labels at this point).
topics_ensemble_cscl.sort_values(by='Topic')

Unnamed: 0,Topic,First 5 keywords,Label
0,0,"translation, machine, data, nmt, neural",Neural Machine Translation (Nmt) - Training Performance Explanation
1,1,"question, answer, answering, task, reasoning",Question Answering And Information Retrieval (Qair)
2,2,"llm, task, large, performance, prompt",Large-Scale Llm (Language Model) Performance
3,3,"speech, data, task, recognition, training",Speech Recognition System
4,4,"bias, gender, data, task, based",Gender Bias In Large-Scale Nlp Systems
5,5,"dialogue, task, state, system, human",Dialogue System
6,6,"style, knowledge, task, transfer, text",Text-Based Transfer Learning For Sentence-Level Tasks
7,7,"evaluation, human, metric, task, summarization",Automatic Text Summarization Evaluation
8,8,"topic, approach, document, method, word",Topic Modeling With Neural Lda And Text Data Analysis
9,9,"event, argument, extraction, task, method",Event-Based Argument Mining


## Save labels

In [32]:
# Directory that receives the generated label tables.
LABELS_PATH = '../models'

# Write the Computation & Language ensemble labels as CSV, without the index column.
# NOTE(review): only this table is saved here; the other label DataFrames are not written.
labels_ensemble_cscl_file = os.path.join(LABELS_PATH, 'labels_ensemble_cscl.csv')
topics_ensemble_cscl.to_csv(labels_ensemble_cscl_file, index=False)