Load the train dataset, and the train corpus

Load the topic models fitted in a previous notebook.
* lda_gw: Gravitational Waves topics

Load the tokenized train dataset

4. Assign topics to all entries in the test dataset
5. Save the assigned topics to a CSV file

In [1]:
import pickle
from transformers import pipeline
import pandas as pd
from transformers import pipeline
import random
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
nltk.download('punkt')


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/atroncos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Load the topic models fitted in a previous notebook.
* lda_gw: Gravitational Waves topics


In [2]:
with open('../models/lda_gw.pickle', 'rb') as handle:
    lda_gw = pickle.load(handle)

Load the train dataset, and the train corpus

In [3]:
gw_validate = pd.read_csv('../data/gw_validate.csv.zip', index_col=0)

In [4]:
with open('../data/corpus_validate_gw.pickle', 'rb') as handle:
    corpus_validate_gw = pickle.load(handle)

## Assign topics to the data

Aggregate topic information in a dataframe (see: https://campus.datacamp.com/courses/fraud-detection-in-python/fraud-detection-using-text?ex=11)

In [5]:
def get_topic_details(ldamodel, corpus):
    topic_details_list = []
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_details_list.append([topic_num, prop_topic, row])
    topic_details_df = pd.DataFrame(topic_details_list)
    topic_details_df.columns = ['Dominant_Topic', '% Score', 'Topics']
    return topic_details_df

In [6]:
def assign_topics(ldamodel, corpus, df):
    # put the arxiv id, original categories data and creation date in a dataframe
    # combine with result of topic details function
    topics_df = pd.DataFrame()
    topic_details = get_topic_details(ldamodel, corpus)
    topics_df['id'] = list(df['id'])
    topics_df['title'] = list(df['title'])
    topics_df['year'] = list(df['year'])
    topics_df['month'] = list(df['month'])
    topics_df['Dominant Topic'] = topic_details['Dominant_Topic']
    topics_df['% Score'] = topic_details['% Score']
    topics_df['Topics'] = topic_details['Topics']
    return topics_df

In [7]:
topics_gw_validate = assign_topics(lda_gw, corpus_validate_gw, gw_validate)

In [8]:
topics_gw_validate.head()

Unnamed: 0,id,title,year,month,Dominant Topic,% Score,Topics
0,gr-qc/9709023,"Matters of Gravity, the newsletter of the APS ...",1997,9,3,0.388621,"[(3, 0.38862106), (0, 0.30946887), (2, 0.29763..."
1,1703.00169,CMB internal delensing with general optimal es...,2017,3,0,0.461642,"[(0, 0.46164203), (2, 0.39299756), (3, 0.14211..."
2,2112.0216,A Detection of Red Noise in PSR J1824$-$2452A ...,2021,12,0,0.789054,"[(0, 0.7890542), (1, 0.14857039), (2, 0.060367..."
3,1708.05621,Power radiated by a binary system in a de Sitt...,2017,8,3,0.684204,"[(3, 0.68420434), (2, 0.17142653), (0, 0.08220..."
4,2305.10715,X-Ray Tests of General Relativity with Black H...,2023,5,1,0.451522,"[(1, 0.45152238), (3, 0.2228986), (0, 0.202941..."


Concatenate all the titles by dominant topic

In [28]:
def shuffle_titles(df, dominant_topic):
    """
    Concatenates all the titles for papers about this topic,
    the dominant topic is used to filter the papers. Paper titles are shuffled.
    
    dominant_topic: int id of the dominant topic for a paper
    returns: string
    """
    idx = df['Dominant Topic'] == dominant_topic
    df_idx = df[idx]
    df_idx = df_idx.reset_index()
    df_idx = df_idx.sample(frac=1)  # shuffle
    all_titles = '. '.join(df_idx['title'])
    return all_titles

In [10]:
def predict_label_ALT(all_titles, min_length=1, max_length=12):
    """
    Predict a label for a given concatenation of titles.
    """
    #model_name = 'fabiochiu/t5-small-medium-title-generation'
    #model_name = 'deep-learning-analytics/automatic-title-generation'
    model_name = 'google-t5/t5-small'
    #model_name = 'sshleifer/distilbart-cnn-12-6'
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    
    #inputs = [f"Select a suitable label for these keywords: {all_titles}"]
    #inputs = [f"Which topic is described by these keywords (response should be between 1 and 12 words): {all_titles}"]
    inputs = [f"summarize: {all_titles}"]
    #inputs = all_titles
    
    inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
    output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=1, max_length=12)
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_label = nltk.sent_tokenize(decoded_output.strip())[0]
    return(predicted_label)


In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

def predict_label(all_titles, min_length=1, max_length=12):
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

    input_text = [f"summarize: {all_titles}"]
    input_ids = tokenizer(input_text, max_length=1024*15, truncation=True, return_tensors="pt").input_ids.to('cpu')

    outputs = model.generate(input_ids, num_beams=8, do_sample=True, min_length=min_length, max_length=max_length)
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    predicted_label = nltk.sent_tokenize(decoded_outputs.strip())[0]

    return(predicted_label)  

In [12]:
def predict_topic_labels(df, min_length, max_length):
    topics_range = set(df['Dominant Topic'])
    labels = []
    topics = []
    for topic in topics_range:
        print(f"Processing topic {topic} / {len(topics_range)}")
        all_titles = shuffle_titles(df, topic)
        label = predict_label(all_titles, min_length, max_length)
        topics.append(topic)
        labels.append(label)
        print(f"label: {label}")
    return(pd.DataFrame.from_dict({'topic': topics, 'label': labels}))

In [29]:
%%time


df1 = predict_topic_labels(topics_gw_validate, 1, 15)
df2 = predict_topic_labels(topics_gw_validate, 1, 15)

Processing topic 0 / 4


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


label: Search for continuous gravitational waves from isolated neutron stars
Processing topic 1 / 4


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


label: A New Method to Calculate the Stochastic Background of Gravitati
Processing topic 2 / 4


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


label: The Constraining Capability of BNS Dark Sirens Observ
Processing topic 3 / 4
label: scalar-tensor theory
Processing topic 0 / 4


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


label: Searching for continuous gravitational waves from unknown neutron stars in
Processing topic 1 / 4


KeyboardInterrupt: 

In [14]:
pd.concat([df1, df2]).reset_index()

Unnamed: 0,index,topic,label
0,0,0,Search for continuous gravitational waves from...
1,1,1,The influence of black holes on the binary pop...
2,2,2,Electromagnetic Waves from Primordial Black Holes
3,3,3,Constraints on Gravitational Waves in Curved
4,0,0,Detection of gravitational waves from unknown ...
5,1,1,The influence of black holes on the binary pop...
6,2,2,Electromagnetic Waves from Primordial Black Holes
7,3,3,Extraction of Gravitational Waves in Curved Sp...


In [23]:
len(shuffle_titles(topics_gw_validate, 1))

89121

In [22]:
len('. '.join(topics_gw_validate['title']))

307014

In [27]:
df = topics_gw_validate
dominant_topic = 0
idx = df['Dominant Topic'] == dominant_topic
df_idx = df[idx]
#df_idx = df_idx.reset_index()
#df_idx = df_idx.sample(frac=1)  # shuffle
all_titles = '. '.join(df_idx['title'])
len(all_titles)

70200

In [33]:
from transformers import pipeline

# Initialize the text generation pipeline
model_name = "google-t5/t5-small"
generator = pipeline('text-generation', model=model_name)  # Replace with the model you prefer

# List of terms
terms = ["machine learning", "neural networks", "deep learning", "AI", "supervised learning"]

# Create the prompt
prompt = f"Generate a human-readable label for the following list of terms: {', '.join(terms)}."

# Generate the label
result = generator(prompt, max_length=50, num_return_sequences=1)

# Extract and print the generated label
label = result[0]['generated_text']
print(label)


The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCau

Generate a human-readable label for the following list of terms: machine learning, neural networks, deep learning, AI, supervised learning.Generate a human-readable label for the following list of terms: machine learning, neural networks


In [35]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the BART model and tokenizer
model_name = "facebook/bart-large-cnn"  # You can use other variants of BART if needed
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# List of terms
terms = ["machine learning", "neural networks", "deep learning", "AI", "supervised learning"]

# Create the input text from the list of terms
input_text = f"Generate a human-readable label for the following list of terms: {', '.join(terms)}."

# Tokenize the input text
inputs = tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True)

# Generate the summary (label)
summary_ids = model.generate(inputs, max_length=50, min_length=5, length_penalty=2.0, num_beams=4, early_stopping=True)
label = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the generated label
print(label)


Generate a human-readable label for the following list of terms: machine learning, neural networks, deep learning, AI.


In [36]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the BART model and tokenizer
model_name = "facebook/bart-large-cnn"  # You can use other variants of BART if needed
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# List of terms
terms = ["machine learning", "neural networks", "deep learning", "AI", "supervised learning"]

# Create the input text from the list of terms
input_text = f"Here is a list of related terms: {', '.join(terms)}. Provide a concise label that describes these terms."

# Tokenize the input text
inputs = tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True)

# Generate the summary (label)
summary_ids = model.generate(inputs, max_length=20, min_length=5, length_penalty=2.0, num_beams=4, early_stopping=True)
label = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the generated label
print(label)


Here is a list of related terms: machine learning, neural networks, deep learning,


In [49]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 model and tokenizer
model_name = "t5-small"  # You can use other variants like 't5-base' or 't5-large'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# List of terms
terms = ["machine learning", "neural networks", "deep learning", "AI", "supervised learning"]

# Create the input text from the list of terms
input_text = f"label: {', '.join(terms)}"

# Tokenize the input text
inputs = tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True)

# Generate the label
outputs = model.generate(inputs, max_length=20, min_length=5, length_penalty=2.0, num_beams=4, early_stopping=True)
label = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the generated label
print(label)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Etikett: machine learning, neural networks, deep learning, AI, supervised learning,


In [63]:
str = shuffle_titles(topics_gw_validate, 0)

In [64]:
str[:100]

'Semianalytical Approach for Sky Localization of Gravitational Waves. Un-modeled search for black hol'

In [69]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 model and tokenizer
model_name = "t5-large"  # You can use other variants like 't5-base' or 't5-large'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# List of terms
terms = ["neural networks", "deep learning", "AI", "supervised learning", "machine learning"]


# Create the input text from the list of terms with more context
#input_text = f"Generate a concise and descriptive label that captures the essence of these terms: {', '.join(terms)}. "
# Create the input text from the list of terms with more context
#input_text = f"Summarize: {', '.join(terms)}. "
#input_text = f"Generate a concise and descriptive label that captures the essence of these terms: {str}. "
input_text = f"Make a sentence with these words: {', '.join(terms)}. "

# Tokenize the input text
inputs = tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True)

# Generate the label
outputs = model.generate(inputs, max_length=50, min_length=5, length_penalty=2.0, num_beams=4, early_stopping=True)
label = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the generated label
print(label)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


using these words: neural networks, deep learning, AI, supervised learning, machine learning., machine learning. Make a sentence with these words: neural networks, deep learning, AI, supervised learning, machine learning.


In [54]:
from transformers import pipeline

# Load the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# List of terms
terms = ["machine learning", "neural networks", "deep learning", "AI", "supervised learning"]

# Create the input text from the list of terms
input_text = f"{', '.join(terms)}"

# Candidate labels to choose from
candidate_labels = ["Artificial Intelligence", "Computer Science", "Data Science", "Technology", "Education", "Science"]

# Perform zero-shot classification
result = classifier(input_text, candidate_labels)

# Print the label with the highest score
label = result['labels'][0]
print(label)


Artificial Intelligence


In [93]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the GPT-2 model and tokenizer
model_name = "gpt2"  # You can use 'gpt2-medium' or 'gpt2-large' for larger models
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# List of terms
terms = ["machine learning", "neural networks", "deep learning", "AI", "supervised learning"]

# Create a prompt that includes the terms
joined = ', '.join(f'"{w}"' for w in terms)
#prompt = f"Make one sentence that includes all of the following terms: {joined}."
#prompt = f"Summarize the following terms: {joined}."
#prompt = f"Here is a list of related terms: {joined}. Provide a concise label that describes these terms."
# prompt = f"{joined} and"

# Tokenize the prompt
inputs = tokenizer.encode(prompt, return_tensors='pt', max_length=512, truncation=True)

# Generate text
outputs = model.generate(inputs, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)

# Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract the generated sentence after the prompt
generated_sentence = generated_text[len(prompt):].strip()

# Print the generated sentence
print(generated_sentence)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"machine-learning".

The following is a list of the most popular and popular machine learning frameworks.
.NET Core
 (http://www.netcore.org/)
:
,
-
 and
(http: //www-software.com/ )



Machine Learning
"Machine learning is the process of learning to learn new things. It is


In [80]:
prompt

'Make one sentence that includes all of the following terms: "machine learning", "neural networks", "deep learning", "AI", "supervised learning".'