In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os

# if torch.cuda.is_available():
#     torch.set_default_device("cuda")
# else:
#     torch.set_default_device("cpu")
model_type = 'gemma2b' # orca13b
model_id = "google/gemma-2b-it"
model = AutoModelForCausalLM.from_pretrained(f"nlp/model/{model_type}", device_map="cuda:1", torch_dtype=torch.bfloat16)
    
os.environ['HF_TOKEN'] = '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'
os.environ['HF_HOME'] = '/data_vault/hexai/huggingface/hub/'

tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'], cache_dir=os.environ['HF_HOME'])

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


In [2]:
import pandas as pd
data = "/data_vault/hexai/Biolaysum/biolaysumm2024_data/eLife_train.jsonl"
elife_train = pd.read_json(path_or_buf=data, lines=True)

In [3]:
from transformers import pipeline
orca_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    max_length = 13000,
    trust_remote_code=True,
    device_map="auto",
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

In [4]:
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
llm = HuggingFacePipeline(pipeline = orca_pipeline, model_kwargs = {'temperature':0.75})

In [5]:
map_prompt_template = """
                      Write a concise lay term summary of this chunk of text from a medical article.
                      {text}
                      """

map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])

system_prompt =  f"Summarize in lay terms. For example\n\nArticle:{elife_train.loc[2529].article}\nLay Summary:{elife_train.loc[2529].lay_summary}" 
                     
    
combine_temp = """ 
                      Write a concise lay term summary of the following text delimited by triple backquotes.
                      ```{text}```

                      """



combine_prompt = PromptTemplate(
    template=system_prompt + combine_temp, input_variables=["text"]
)

In [6]:
from langchain.document_loaders import JSONLoader

def metadata_func(record: dict, metadata: dict) -> dict:

    metadata["lay_summary"] = record.get("lay_summary")

    return metadata


loader = JSONLoader(
    file_path="/data_vault/hexai/Biolaysum/biolaysumm2024_data/eLife_train.jsonl",
    jq_schema='.',
    content_key="article",
    metadata_func=metadata_func,
    json_lines=True
)

data = loader.load()

In [37]:
from langchain.chains import LLMChain

map_template = """Write a summary of the content below. Summarize 1.) Key ideas an 2.) Key findings using lay terms:

{content}

Summary:
"""
map_prompt = PromptTemplate.from_template(map_prompt_template)
map_chain = LLMChain(prompt=map_prompt, llm=llm)

In [54]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

reduce_template = """The following is set of summaries of a medical article:

{doc_summaries}

Summarize the above summaries. 
Summary:"""


reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)
stuff_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries")

In [67]:
from langchain.chains import ReduceDocumentsChain

reduce_chain = ReduceDocumentsChain(
    combine_documents_chain=stuff_chain,
    token_max=8000
)

In [68]:
from langchain.chains import MapReduceDocumentsChain

map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=map_chain,
    document_variable_name="text",
    reduce_documents_chain=reduce_chain
)

In [74]:
x = data[3830]

In [75]:
from langchain.text_splitter import TokenTextSplitter

splitter = TokenTextSplitter(chunk_size=500)
split_docs = splitter.split_documents([x])

In [76]:
type(x)

langchain_core.documents.base.Document

In [77]:
for doc in split_docs:
    print(len(doc.page_content.split(" ")))

429
415
171


In [78]:
summary = map_reduce_chain.run(split_docs)
print(summary)



The following is set of summaries of a medical article:


                      Write a concise lay term summary of this chunk of text from a medical article.
                      Interoception , the sensitivity to visceral sensations , plays an important role in homeostasis and guiding motivated behaviour . It is also considered to be fundamental to self-awareness . Despite its importance , the developmental origins of interoceptive sensitivity remain unexplored . We here provide the first evidence for implicit , flexible interoceptive sensitivity in 5 month old infants using a novel behavioural measure , coupled with an established cortical index of interoceptive processing . These findings have important implications for the understanding of the early developmental stages of self-awareness , self-regulation and socio-emotional abilities . 
 Forty-one healthy , full-term infants were tested in total , at 5 months of age ( 19 males , mean age = 5 . 10 months , SD = 0 . 29 ) . The exp