In [60]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, LlamaForCausalLM
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split


import pandas as pd
import ast
from bs4 import BeautifulSoup
import unicodedata

In [72]:
# Load the staging set and keep only the PMID, abstract text and topic columns,
# then clean the two columns that are used downstream:
#   * 'EDAM Topics' cells are parsed from their string form into Python
#     literals with ast.literal_eval (presumably lists of topic names -- the
#     later ', '.join(...) over this column relies on that).
#   * 'Abstract' cells are parsed as HTML and reduced to their text content,
#     with whitespace stripped by get_text(strip=True).
dataset = (
    pd.read_csv('datasets/staging_test_set.csv')[['PMID', 'Abstract', 'EDAM Topics']]
    .assign(**{
        'EDAM Topics': lambda frame: frame['EDAM Topics'].map(ast.literal_eval),
        'Abstract': lambda frame: frame['Abstract'].map(
            lambda markup: BeautifulSoup(markup, 'html.parser').get_text(strip=True)
        ),
    })
)

In [73]:
# Take the 'Abstract' column of the first 25 rows of the raw model outputs and
# put every abstract into Unicode NFKD form.
test_data = pd.read_csv('outputs/raw_model_outputs.csv').iloc[:25][['Abstract']]
test_data['Abstract'] = test_data['Abstract'].map(
    lambda text: unicodedata.normalize("NFKD", text)
)

In [74]:
# Show the staging rows whose abstract also appears in test_data.
# test_data['Abstract'] holds NFKD-normalized text, so the dataset abstracts are
# compared in NFKD form as well; comparing raw text against normalized text
# would miss any abstract containing characters that NFKD rewrites (e.g.
# non-breaking spaces, ligatures, precomposed accents). The displayed rows keep
# their original, un-normalized text.
dataset[
    dataset['Abstract']
    .map(lambda text: unicodedata.normalize("NFKD", text))
    .isin(test_data['Abstract'])
]

Unnamed: 0,PMID,Abstract,EDAM Topics
615,26989192,Genetic variations affecting neural tube closu...,"[Zoology, Biology, Developmental biology, Drug..."
672,26750514,"Chocolate Pots hot springs (CP) is a unique, c...","[Sequence analysis, Drug metabolism, Microbiol..."


In [75]:
# Drop every staging row whose abstract is part of test_data so that held-out
# abstracts cannot end up in the fine-tuning sample.
# test_data['Abstract'] holds NFKD-normalized text, so the membership test is
# done on an NFKD-normalized view of the dataset abstracts; comparing raw text
# against normalized text would silently keep any overlapping abstract that
# contains characters NFKD rewrites. Only the comparison key is normalized --
# the 'Abstract' column itself is left exactly as it was.
dataset = dataset[
    ~dataset['Abstract']
    .map(lambda text: unicodedata.normalize("NFKD", text))
    .isin(test_data['Abstract'])
]

In [77]:
# Draw 1000 rows from the filtered dataset; the fixed random_state makes the
# draw repeatable for the same input frame.
training_set = dataset.sample(
    n=1000,
    random_state=42,
)

In [97]:
## Prepare dataset for finetuning

# Read the prompt template text in full.
with open('templates/prompt_template.txt', 'r') as template_handle:
    template = template_handle.read()

# Read the EDAM topics file line by line, stripping surrounding whitespace
# (including the trailing newline) from every line.
with open('EDAM/edam_topics.txt', 'r') as topics_handle:
    full_edam_topics = [raw_line.strip() for raw_line in topics_handle]

# Add EDAM topics to prompt template: the "<topics>" placeholder is replaced
# by all topics joined with newlines.
formatted_topics = "\n".join(full_edam_topics)
template = template.replace("<topics>", formatted_topics)

In [98]:
# Build one fine-tuning string per sampled row:
#   "Human: " + <filled-in prompt> + "Bot: " + <comma-separated topics>
# The prompt is the template with '<abstract>' replaced by the row's abstract
# and '<num_terms>' replaced by the number of topics recorded for that row.
text_data = [
    "Human: "
    + template.replace('<abstract>', abstract_text).replace('<num_terms>', str(len(topic_list)))
    + "Bot: "
    + ', '.join(topic_list)
    for abstract_text, topic_list in zip(training_set['Abstract'], training_set['EDAM Topics'])
]


In [100]:
# Attach the generated strings as the 'text' column (one entry per row, in row order).
training_set = training_set.assign(text=text_data)

In [102]:
# Write the fine-tuning table to disk without the DataFrame index column.
training_set.to_csv(
    'datasets/llm-finetune-data-1000.csv',
    index=False,
)