# Quantum Tech Topic Modelling

In [12]:
from torch import cuda

# Run on the currently selected CUDA device when a GPU is visible, otherwise on the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

print(device)
# Install command kept for reference:
# conda install pytorch==1.13.0 torchvision==0.14.0 torchaudio==0.13.0 pytorch-cuda=11.7 -c pytorch -c nvidia

cuda:0


In [None]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score
import re

### Simple data pre-processing

In [None]:
import pandas as pd
import re

# Load the publication table and derive the text used for topic modelling.
df = pd.read_csv('QuanTech_R/R file/quant_pub.csv')
# Category key = first three characters of qc_category
df['qc_category1'] = df['qc_category'].str[:3]
# Prefer the abstract; fall back to the title when the abstract is missing
df['desc'] = df['abstract'].fillna(df['itemtitle'])
# Rows with neither abstract nor title have no text to model (and would break
# the string clean-up below because NaN is a float), so drop them up front.
df = df.dropna(subset=['desc']).copy()

# Pre-processing: strip line breaks and paragraph tags ...
cleaned = df['desc'].astype(str)
for token in ('\r', '\n', '<p>', '</p>'):
    cleaned = cleaned.str.replace(token, '', regex=False)
# ... and cut everything from a "(C) 20xx" copyright notice onwards
df['cleaned_docs'] = cleaned.str.replace(r'\(C\) 20.*', '', regex=True)

df = df[df['cleaned_docs'] != 'Editorial Board']
df = df.drop_duplicates()
print(df.shape)

# Keep only the columns used downstream; narrowing the columns can expose new
# duplicates, hence the second drop_duplicates().
df = df[['pubid', 'pubyear', 'cleaned_docs', 'qc_category1']]
df = df.drop_duplicates()
df.head()

In [None]:
# Row count for each qc_category1 value
df['qc_category1'].value_counts()

### BERTopic

In [11]:
# Fit one BERTopic model per qc_category1 value and write, for each category:
#   - freq_<category>_hur.csv            topic-level summary
#   - <category>_sumtable_hur.csv        per-document topic assignment
#   - <category>bertopic_model_hur       the saved model
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
from umap import UMAP

# The embedding model does not depend on the category, so load it once
# instead of once per loop iteration.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Hyper-parameters shared by every category
n_gram_ranges = (1, 1)
nr_topics_options = 20

# A NaN category can neither be matched with == nor concatenated into a file
# name, so it is skipped.
qc_type = df.qc_category1.dropna().unique()

for category in qc_type:
    temp_df = df.loc[df.qc_category1 == category]
    docs = temp_df['cleaned_docs'].to_list()

    # Fresh (unfitted) components for every category
    vectorizer_model = CountVectorizer(stop_words="english")
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

    # Set diversity of topics (0: no diversity, 1: max diversity)
    # representation_model = MaximalMarginalRelevance(diversity=0.5)

    # NOTE(review): `umap_model` was referenced but never defined in this
    # notebook (NameError on a fresh kernel). These are BERTopic's default UMAP
    # settings plus a fixed seed for reproducibility - confirm they match the
    # configuration originally used.
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                      metric='cosine', random_state=42)

    # Pre-calculate embeddings
    embeddings = embedding_model.encode(docs, show_progress_bar=True)

    # 0.1% of the corpus, but never below 2: smaller values are not a valid
    # cluster size for categories with fewer than 2000 documents.
    min_topic_sizes = max(2, int(len(docs) * 0.001))

    topic_model = BERTopic(
        n_gram_range=n_gram_ranges,
        min_topic_size=min_topic_sizes,
        nr_topics=nr_topics_options,
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        # representation_model=representation_model,
        calculate_probabilities=True,
        ctfidf_model=ctfidf_model,
        umap_model=umap_model,
    )

    topics, probs = topic_model.fit_transform(docs, embeddings)

    freq = topic_model.get_topic_info()
    freq.to_csv('QuanTech_Python/freq_' + category + '_hur.csv', index=False)

    # The original list was missing a comma ('Name''Representation'), so these
    # columns were never actually excluded.
    doc_info = topic_model.get_document_info(docs).drop(
        columns=['Name', 'Representation', 'Representative_Docs'],
        errors='ignore',
    )

    # get_document_info returns one row per document in the order of `docs`,
    # so attach pubid by position. Merging on the document text duplicated
    # rows whenever two publications shared identical text.
    topic_model_sum = pd.concat(
        [temp_df[['pubid', 'cleaned_docs']].reset_index(drop=True),
         doc_info.reset_index(drop=True)],
        axis=1,
    )

    topic_model_sum.to_csv("QuanTech_Python/" + category + "_sumtable_hur.csv")
    topic_model.save("QuanTech_Python/" + category + "bertopic_model_hur")


Batches: 100%|██████████| 692/692 [00:15<00:00, 45.03it/s]
Batches: 100%|██████████| 1897/1897 [00:43<00:00, 43.75it/s] 
Batches: 100%|██████████| 115/115 [00:02<00:00, 40.74it/s]
Batches: 100%|██████████| 2901/2901 [01:07<00:00, 42.91it/s] 


### BERTopic Label Creation

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM

# Single source of truth for the checkpoint location (previously repeated in
# both from_pretrained calls).
# Alternative checkpoint: meta-llama/Meta-Llama-3-70B-Instruct
LLAMA_MODEL_PATH = "D:/LLM/meta-llama/Meta-Llama-3-8B-Instruct"

# `use_auth_token=True` is deprecated and is not needed to read a local
# directory, so it is omitted.
tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_PATH)
# Load the weights in half precision
model = LlamaForCausalLM.from_pretrained(LLAMA_MODEL_PATH,
    torch_dtype=torch.float16)

# model.config.pad_token_id = tokenizer.eos_token_id

# Move the model to the device selected in the first cell
model.to(device)

In [None]:

import glob

### Get list of files (skip outputs of a previous labelling run)
files = glob.glob('QuanTech_Python/freq_*_hur.csv')
files = [file for file in files if '_gentopic' not in file]

# The instruction appended to every topic representation is the same for all
# files, so build it once.
prompt = """.\n List of words above are the topic modelling result from quantum technology publications. Generate a label that summarizes them. """

### Iterative statements for generating "GenTopic"
for file_name in files:
    print(file_name)

    # Text between "freq_" and the ".csv" extension, reused in the output name
    period_name = re.search(r'freq_(.*)\.csv$', file_name).group(1)

    ### Data load & preparation
    dat = pd.read_csv(file_name)

    dat = dat[dat.Topic != -1] # needed if want to remove outlier
    dat = dat.reset_index()

    # Collect the labels in a list and assign the column once. The previous
    # chained assignment (dat['GenTopic'][i] = ...) triggers
    # SettingWithCopyWarning and does not write anything under pandas
    # Copy-on-Write.
    gen_topics = []
    for representation in dat['Representation']:
        inputs = tokenizer(representation + prompt, return_tensors='pt').to(device)
        generate_ids = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=150,  # total length: prompt tokens + generated tokens
            pad_token_id=tokenizer.eos_token_id,
        )
        decoded = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

        # The decoded text echoes the input; keep only the generated part
        gen_topics.append(decoded.replace(representation, "").replace(prompt, ""))

    dat['GenTopic'] = gen_topics

    dat.to_csv("QuanTech_Python/freq_"+period_name+"_gentopic_llama3_hur.csv")