In [5]:
from transformers import pipeline, AutoTokenizer
import pandas as pd

# Read the titles/abstracts table from CSV and preview the first rows
DATA_PATH = "titles_abstracts_v2.csv"
df = pd.read_csv(DATA_PATH)
df.head(5)

Unnamed: 0.1,Unnamed: 0,id,prs,abstract,tr_abs,abs_lang,title,tr_ti,ti_lang
0,1,iaa38a151fd,,Summary of thesis\r\n\t\t\t\t\tThis thesis exa...,1,fr,Communication and violence in Chad: the case o...,1,fr
1,2,if017a65e82,,This book includes three studies of E. Douwes ...,1,nl,The polygamous muse of E. Douwes Dekker,1,nl
2,3,ief93ffde58,,In this thesis different aspects of functional...,0,en,Liquid crystalline hydrogen-bonded rosettes,0,en
3,4,iad8b57cf2b,,Alexander Jackob proposes a new way of looking...,1,nl,Theater and visual experience in the eyes of t...,1,de
4,5,i417f59b9fe,,The German hand-reader Julius Spier played an...,0,en,The psychochirologist Julius Spier and handwri...,1,nl


In [None]:
# Classify every title/abstract pair in `df` with the OpenAlex topic model and
# write the top-10 topics (id + score) per record to CSV.
model_name = "OpenAlex/bert-base-multilingual-cased-finetuned-openalex-topic-classification-title-abstract"
MAX_TOKENS = 512         # tokenizer max_length; this count includes the [CLS]/[SEP] special tokens
TOPIC_ID_OFFSET = 10000  # added to the model's numeric topic id to change the ID format

# initialise the tokeniser and the classifier (the pipeline reuses the same tokeniser)
tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = pipeline(model=model_name, tokenizer=tokenizer, top_k=10)


def build_input_text(title, abstract):
    """Format one record as the text prompt fed to the classifier.

    Parameters
    ----------
    title, abstract : str or missing value (NaN/None)
        Raw cell values from the input frame.

    Returns
    -------
    str or None
        "<TITLE> {title}\n<ABSTRACT> {abstract}" when both are present.
        Missing values are never rendered as the literal string "nan":
        title only -> "<TITLE> {title}", abstract only -> "<TITLE> NONE\n<ABSTRACT> {abstract}".
        NOTE(review): these fallback formats are what the model card is believed to
        recommend - confirm against the current card.
        None when neither field has any text (nothing to classify).
    """
    title_text = "" if pd.isna(title) else str(title)
    abstract_text = "" if pd.isna(abstract) else str(abstract)
    has_title = bool(title_text.strip())
    has_abstract = bool(abstract_text.strip())

    if not has_title and not has_abstract:
        return None
    if not has_abstract:
        return f"<TITLE> {title_text}"
    if not has_title:
        return f"<TITLE> NONE\n<ABSTRACT> {abstract_text}"
    return f"<TITLE> {title_text}\n<ABSTRACT> {abstract_text}"


data = []
failed_ids = []  # ids whose classification raised, so dropped rows are traceable

for record_id, title, abstract in zip(df['id'], df['title'], df['abstract']):
    try:
        input_text = build_input_text(title, abstract)
        row_dict = {'id': record_id}

        if input_text is not None:
            # Let the pipeline truncate while tokenising. Decoding a truncated
            # encoding back to a string and re-tokenising it (the previous
            # approach) injects literal "[CLS]"/"[SEP]" into the text, does not
            # reproduce the original text exactly, and does not guarantee the
            # re-tokenised length stays within the limit.
            results = classifier(input_text, truncation=True, max_length=MAX_TOKENS)

            # For a single string the pipeline may wrap the top-k list in an
            # outer list; accept both shapes.
            top_topics = results[0] if results and isinstance(results[0], list) else results

            # modify format: labels start with "<numeric id>:"; keep the offset id and score
            for rank, result in enumerate(top_topics, start=1):
                topic_id = int(result['label'].split(':')[0])
                row_dict[f'topic{rank}_id'] = TOPIC_ID_OFFSET + topic_id
                row_dict[f'topic{rank}_score'] = result['score']

        # Records with no text at all are kept as an id-only row (topic columns
        # empty) so the output stays aligned with the input.
        data.append(row_dict)

    except Exception as e:
        # Best effort: report and continue, but remember which ids were dropped.
        failed_ids.append(record_id)
        print(f"Error processing id {record_id}: {e}")

new_df = pd.DataFrame(data)
new_df.to_csv("classified_topics_v3.csv", index=False)

if failed_ids:
    print(f"{len(failed_ids)} of {len(df)} records failed and are missing from the output")

# preview the classification output (not the input frame)
new_df.head()

# Inputs longer than MAX_TOKENS are truncated by the tokeniser inside the pipeline.
# Note from the original run: texts over the limit were usually only about 514 tokens,
# so little data is lost to truncation.

In [9]:
# Row count of the classification output; should be 16304 like original df
new_df.shape[0]

16304