In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

import json

import pandas as pd

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

In [7]:
# Load the staging test set; later cells draw a random row from it and read
# that row's 'Abstract' column.
# NOTE(review): the sample rows displayed further down include an 'Unnamed: 0'
# column, which suggests the CSV was written with its index — confirm whether
# index_col=0 is wanted here.
dataset = pd.read_csv('datasets/staging_test_set.csv')

In [8]:
# Read the raw prompt template text. Its "<topics>" and "<abstract>"
# placeholders are filled in by later cells.
with open('templates/prompt_template.txt', 'r') as template_file:
    template = template_file.read()

In [9]:
# Read the EDAM topic file line by line, trimming surrounding whitespace
# (including the trailing newline) from each entry.
with open('EDAM/edam_topics.txt', 'r') as edam_file:
    full_edam_topics = [topic.strip() for topic in edam_file]

In [10]:
# Add EDAM topics to prompt template
# The topics are joined one per line and substituted for the literal
# "<topics>" placeholder; `template` is rebound to the filled-in text.

formatted_topics = "\n".join(full_edam_topics)
template = template.replace("<topics>", formatted_topics)


## Testing Meditron 7b 

Note: Loading Meditron 7B locally requires a lot of RAM.

In [11]:
# Parse the local config.json; the Hugging Face token is read from it in the
# next cell.
with open("config.json", "r") as config_file:
    config = json.load(config_file)

In [12]:
# Hugging Face access token, used later to build the Authorization header for
# the Inference API requests. Keep it out of displayed cell output.
hf_token = config['api_keys']['huggingface']

In [None]:
# Hub download variant (kept for reference, needs the token):
# tokenizer = AutoTokenizer.from_pretrained("epfl-llm/meditron-7b", token=hf_token)

# Load the tokenizer from a local directory instead of the hub.
# Presumably this directory was produced by the commented-out
# tokenizer.save_pretrained(...) cell further down — confirm.
tokenizer = AutoTokenizer.from_pretrained("./meditron-7b-tokenizer")

In [None]:
# Hub download variants (kept for reference).
# NOTE(review): the tokenizer line names meditron-70b while the model line
# names meditron-7b — confirm which was intended.
# tokenizer = AutoTokenizer.from_pretrained("epfl-llm/meditron-70b", token=hf_token)
# model = AutoModelForCausalLM.from_pretrained("epfl-llm/meditron-7b", token=hf_token)

# Pick Apple's MPS backend when available, otherwise fall back to CPU.
# NOTE(review): `device` is not referenced by any other cell visible in this
# notebook (neither the model nor the inputs are moved to it) — confirm
# whether a .to(device) was intended.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the causal LM weights from a local directory, then display the module.
model = AutoModelForCausalLM.from_pretrained("./meditron-7b-model")
model

In [None]:
# Rebind `model` to the result of its to_bettertransformer() conversion.
# NOTE(review): this presumably requires the optional `optimum` package and
# may be deprecated in newer transformers releases — confirm.
model = model.to_bettertransformer()

In [None]:
# model.save_pretrained("meditron-7b-model", from_pt=True)
# tokenizer.save_pretrained("meditron-7b-tokenizer", from_pt=True)

In [13]:
# Draw a single row at random to use as the test abstract. No random_state is
# passed, so a different row may be selected on every run.
random_sample = dataset.sample(n=1)
random_sample

Unnamed: 0,PMID,Description,Abstract,MeSH Terms,Filtered MeSH Terms,EDAM Topics
70,19043404,The human distal gut harbours a vast ensemble ...,The human distal gut harbours a vast ensemble ...,"['Adult', 'Africa/ethnology', 'Biodiversity', ...","['Adult', 'Africa', 'Biodiversity', 'Dizygotic...","['Biodiversity', 'Genotype and phenotype', 'Hu..."


In [14]:
# Build the final prompt by substituting the sampled row's abstract text for
# the "<abstract>" placeholder; `template` itself is left unchanged.
prompt = template.replace('<abstract>', random_sample['Abstract'].values[0])
prompt


'An abstract associated with a scientific dataset is quoted below:\n\n"The human distal gut harbours a vast ensemble of microbes (the microbiota) that provide important metabolic capabilities, including the ability to extract energy from otherwise indigestible dietary polysaccharides. Studies of a few unrelated, healthy adults have revealed substantial diversity in their gut communities, as measured by sequencing 16S rRNA genes, yet how this diversity relates to function and to the rest of the genes in the collective genomes of the microbiota (the gut microbiome) remains obscure. Studies of lean and obese mice suggest that the gut microbiota affects energy balance by influencing the efficiency of calorie harvest from the diet, and how this harvested energy is used and stored. Here we characterize the faecal microbial communities of adult female monozygotic and dizygotic twin pairs concordant for leanness or obesity, and their mothers, to address how host genotype, environmental exposur

In [None]:
# Tokenize the filled-in prompt (abstract already substituted), not the raw
# template, which still contains the literal "<abstract>" placeholder.
model_inputs = tokenizer(prompt, padding=True, return_tensors="pt")

In [None]:
# Inspect the encoding returned by the tokenizer call.
model_inputs

In [None]:
# Inference-only forward pass: disable autograd so no computation graph is
# kept for a backward pass that never happens (saves memory on a 7B model).
with torch.no_grad():
    outputs = model(**model_inputs)

In [None]:
# Arg-max over the last axis of the logits. For a causal LM this is presumably
# the highest-scoring token id at each position rather than a "label class" —
# the variable name looks like a holdover from a classification example.
predicted_label_classes = outputs.logits.argmax(-1)
predicted_label_classes

In [None]:
# decode() expects a sequence of token ids, not the model output object, so
# decode the arg-max ids of the first (only) sequence in the batch.
# NOTE(review): this yields the per-position highest-scoring token, not a
# generated continuation; model.generate(...) is presumably what is wanted for
# actual text generation — confirm.
tokenizer.decode(outputs.logits.argmax(-1)[0])

## Inference API

Using the Hugging Face Inference API.

In [135]:
import requests

In [149]:
# Switch to the alternate template file for the Inference API experiments and
# fill its "<topics>" placeholder with the same EDAM topic list. This rebinds
# `template`, replacing the one used in the local-model section.
with open('templates/prompt_template copy.txt', 'r') as template_file:
    template = template_file.read()

formatted_topics = "\n".join(full_edam_topics)
template = template.replace("<topics>", formatted_topics)


In [150]:
# Draw a fresh random row for the Inference API test. No random_state is
# passed, so the selected row may differ between runs.
random_sample = dataset.sample(n=1)
random_sample

Unnamed: 0,PMID,Description,Abstract,MeSH Terms,Filtered MeSH Terms,EDAM Topics
2478,28115466,Expression analysis of wt and maternal Trim28 ...,Global DNA demethylation is a hallmark of embr...,"['Animals', 'Cells, Cultured', 'Cellular Repro...","['Animals', 'Cells', 'Cellular Reprogramming',...","['Zoology', 'Developmental biology', 'Gene exp..."


In [151]:
# Fill the remaining placeholders: "<abstract>" with the sampled row's
# abstract text and "<num_terms>" with the string '5'.
prompt = template.replace('<abstract>', random_sample['Abstract'].values[0])
prompt = prompt.replace('<num_terms>', '5')
prompt


'An abstract associated with a scientific dataset is quoted below:\n\n"Global DNA demethylation is a hallmark of embryonic epigenetic reprogramming. However, embryos engage noncanonical DNA methylation maintenance mechanisms to ensure inheritance of exceptional epigenetic germline features to the soma. Besides the paradigmatic genomic imprints, these exceptions remain ill-defined, and the mechanisms ensuring demethylation resistance in the light of global reprogramming remain poorly understood. Here we show that the Y-linked gene Rbmy1a1 is highly methylated in mature sperm and resists DNA demethylation post-fertilization. Aberrant hypomethylation of the Rbmy1a1 promoter results in its ectopic activation, causing male-specific peri-implantation lethality. Rbmy1a1 is a novel target of the TRIM28 complex, which is required to protect its repressive epigenetic state during embryonic epigenetic reprogramming."\n\nA subset of the following list of EDAM topics are assigned to the abstract as

In [152]:
# Authorization header shared by every Inference API request.
headers = {"Authorization": f"Bearer {hf_token}"}

def query(api_url, payload, timeout=120):
    """POST ``payload`` as JSON to ``api_url`` and return the decoded JSON reply.

    Args:
        api_url: Full URL of the inference endpoint.
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the kernel indefinitely.

    Returns:
        The response body parsed from JSON. The HTTP status is not checked,
        so an error body returned by the server is passed through as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    return response.json()

In [153]:
# Candidate model ids for the Inference API; only the one embedded in API_URL
# below is actually queried.
# deepset/roberta-base-squad2

# Falconsai/medical_summarization

# HuggingFaceH4/zephyr-7b-beta
# mistralai/Mistral-7B-v0.1

API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1"

In [154]:
# Send the filled-in prompt to the selected model and show the raw reply.
data = query(API_URL, {"inputs": prompt})
data

[{'generated_text': 'An abstract associated with a scientific dataset is quoted below:\n\n"Global DNA demethylation is a hallmark of embryonic epigenetic reprogramming. However, embryos engage noncanonical DNA methylation maintenance mechanisms to ensure inheritance of exceptional epigenetic germline features to the soma. Besides the paradigmatic genomic imprints, these exceptions remain ill-defined, and the mechanisms ensuring demethylation resistance in the light of global reprogramming remain poorly understood. Here we show that the Y-linked gene Rbmy1a1 is highly methylated in mature sperm and resists DNA demethylation post-fertilization. Aberrant hypomethylation of the Rbmy1a1 promoter results in its ectopic activation, causing male-specific peri-implantation lethality. Rbmy1a1 is a novel target of the TRIM28 complex, which is required to protect its repressive epigenetic state during embryonic epigenetic reprogramming."\n\nA subset of the following list of EDAM topics are assigne

In [155]:
# print() renders the newlines that the repr above shows as "\n", making the
# generated text readable.
print(data[0]['generated_text'])

An abstract associated with a scientific dataset is quoted below:

"Global DNA demethylation is a hallmark of embryonic epigenetic reprogramming. However, embryos engage noncanonical DNA methylation maintenance mechanisms to ensure inheritance of exceptional epigenetic germline features to the soma. Besides the paradigmatic genomic imprints, these exceptions remain ill-defined, and the mechanisms ensuring demethylation resistance in the light of global reprogramming remain poorly understood. Here we show that the Y-linked gene Rbmy1a1 is highly methylated in mature sperm and resists DNA demethylation post-fertilization. Aberrant hypomethylation of the Rbmy1a1 promoter results in its ectopic activation, causing male-specific peri-implantation lethality. Rbmy1a1 is a novel target of the TRIM28 complex, which is required to protect its repressive epigenetic state during embryonic epigenetic reprogramming."

A subset of the following list of EDAM topics are assigned to the abstract as a co