In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
from accelerate import PartialState


# if torch.cuda.is_available():
#     torch.set_default_device("cuda")
# else:
#     torch.set_default_device("cpu")

os.environ['HF_TOKEN'] = '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'
os.environ['HF_HOME'] = '/data_vault/hexai/huggingface/hub/'

model_type = 'gemma-2b-it' # orca13b
model_id = "google/gemma-2b-it"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_quant_type="nf8",
    bnb_8bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'], cache_dir=os.environ['HF_HOME'], use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto", token=os.environ['HF_TOKEN'], cache_dir=os.environ['HF_HOME']
)

# model = AutoModelForCausalLM.from_pretrained(f"nlp/model/{model_type}", device_map="cuda:1", torch_dtype=torch.float16)
# tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'], cache_dir=os.environ['HF_HOME'])

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
import pandas as pd
data = "/data_vault/hexai/Biolaysum/biolaysumm2024_data/eLife_val.jsonl"
elife_train = pd.read_json(path_or_buf=data, lines=True)

In [3]:
from transformers import pipeline
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    batch_size=4,
    max_new_tokens=1000,
    temperature = 0.3,
    do_sample=True,
)
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [4]:
import tiktoken
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [5]:
from langchain.chains import MapReduceDocumentsChain, LLMChain, ReduceDocumentsChain, StuffDocumentsChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate

In [6]:
from langchain.document_loaders import JSONLoader

def metadata_func(record: dict, metadata: dict) -> dict:

    metadata["lay_summary"] = record.get("lay_summary")

    return metadata




In [7]:
def load_json():
    # Load the pdf file
    loader = JSONLoader(
        file_path="/data_vault/hexai/Biolaysum/biolaysumm2024_data/eLife_val.jsonl",
        jq_schema='.',
        content_key="article",
        metadata_func=metadata_func,
        json_lines=True
    )

    documents = loader.load()

    token_count = num_tokens_from_string(str(documents), "cl100k_base")
    print(f'JSON Token Count: {token_count}')
    return documents, token_count


In [8]:
docs, counts = load_json()

JSON Token Count: 3475695


In [9]:
19585/512

38.251953125

In [10]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=100)

In [11]:
splits = text_splitter.create_documents([docs[40].page_content])

In [12]:
len(splits)

13

In [13]:
len(splits[0].page_content)

1131

In [14]:
from transformers import T5Tokenizer, T5EncoderModel
import torch


extractor = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="feature-extraction"
)

In [15]:
document_embeddings = []

In [21]:
import numpy as np
from tqdm import tqdm
from datetime import datetime
print("Starting summarization:",datetime.now()) 

for doc in tqdm(splits):
    embedd = extractor(doc.page_content)
    mean_embedd = np.array(embedd).mean(axis=1).squeeze(axis=0)
    document_embeddings.append(mean_embedd)

Starting summarization: 2024-04-21 21:40:05.660516


 77%|███████████████████████████████████████████████████████████████████▋                    | 10/13 [03:22<00:57, 19.33s/it]--- Logging error ---
Traceback (most recent call last):
  File "/home/ngl18/anaconda3/envs/lora/lib/python3.12/logging/__init__.py", line 1160, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "/home/ngl18/anaconda3/envs/lora/lib/python3.12/logging/__init__.py", line 999, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "/home/ngl18/anaconda3/envs/lora/lib/python3.12/logging/__init__.py", line 703, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "/home/ngl18/anaconda3/envs/lora/lib/python3.12/logging/__init__.py", line 392, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
TypeError: not all arguments converted during string formatting
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _r

In [22]:
nump_embedd = np.array(document_embeddings)

In [23]:
nump_embedd.shape

(13, 256000)

In [24]:
# Assuming 'embeddings' is a list or array of 1536-dimensional embeddings

# Choose the number of clusters, this can be adjusted based on the book's content.
# I played around and found ~10 was the best.
# Usually if you have 10 passages from a book you can tell what it's about
num_clusters = 10

from sklearn.cluster import KMeans
# Perform K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(nump_embedd)


In [25]:
# Find the closest embeddings to the centroids

# Create an empty list that will hold your closest points
closest_indices = []

# Loop through the number of clusters you have
for i in range(num_clusters):
    
    # Get the list of distances from that particular cluster center
    distances = np.linalg.norm(nump_embedd - kmeans.cluster_centers_[i], axis=1)
    
    # Find the list position of the closest one (using argmin to find the smallest distance)
    closest_index = np.argmin(distances)
    
    # Append that position to your closest indices list
    closest_indices.append(closest_index)

In [26]:
selected_indices = sorted(closest_indices)
selected_indices

[1, 2, 3, 4, 6, 8, 9, 10, 11, 12]

In [27]:
map_prompt  = """Given the text enclosed in triple backticks (```) provide a condensed summary in layterms:

```{text}```

SUMMARY:
"""

In [28]:
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])


In [29]:
from langchain.chains import load_summarize_chain
map_chain = load_summarize_chain(llm=llm,
                                 chain_type="stuff",
                                 prompt=map_prompt_template)

In [30]:
selected_docs = [splits[doc] for doc in selected_indices]


In [31]:
def post_process_summaries(summary):
    return summary.split("SUMMARY:\n")[-1]

In [32]:
# Make an empty list to hold your summaries
summary_list = []

# Loop through a range of the lenght of your selected docs
for i, doc in enumerate(selected_docs):
    
    # Go get a summary of the chunk
    chunk_summary = map_chain.run([doc])
    
    # Append that summary to your list
    summary_list.append(chunk_summary)
    
    print (f"Summary #{i} (chunk #{selected_indices[i]}) - Preview: {post_process_summaries(chunk_summary)[:250]}\n")

  warn_deprecated(


Summary #0 (chunk #1) - Preview: The monkey with the long-standing primary visual cortex lesion had residual visual function, which was determined to be mediated by the LGN, pulvinar, SC, and extrastriate visual cortical networks. The results suggest that the structural and function

Summary #1 (chunk #2) - Preview: The LGN is often reduced in size, due to retrograde degeneration ( Miki et al . , 2005; Bridge et al . , 2011 ) when humans with lesions of V1 have hemianopia. A similar result has also been found in the adult marmoset ( Atapour et al . , 2017 ) . Wh

Summary #2 (chunk #3) - Preview: The cortical changes in monkey S were investigated by acquiring post mortem T2-weighted images. Measurement of cortical thickness in V1 in both monkeys indicated that monkey S has substantially thinner cortex around the lesion. By contrast, extrastri

Summary #3 (chunk #4) - Preview: The timeseries analysis of the BOLD signal in the LGN and the pulvinar of monkey S revealed that there was no sp

In [33]:
from langchain.schema import Document

summaries = "\n".join([post_process_summaries(summ) for summ in summary_list])

# Convert it back to a document
summaries = Document(page_content=summaries)

print (f"Your total summary has {llm.get_num_tokens(summaries.page_content)} tokens")

Token indices sequence length is longer than the specified maximum sequence length for this model (1070 > 1024). Running this sequence through the model will result in indexing errors


Your total summary has 1070 tokens


In [34]:
summaries

Document(page_content='The monkey with the long-standing primary visual cortex lesion had residual visual function, which was determined to be mediated by the LGN, pulvinar, SC, and extrastriate visual cortical networks. The results suggest that the structural and functional networks underlying residual visual function are largely intact in this monkey.\nThe LGN is often reduced in size, due to retrograde degeneration ( Miki et al . , 2005; Bridge et al . , 2011 ) when humans with lesions of V1 have hemianopia. A similar result has also been found in the adult marmoset ( Atapour et al . , 2017 ) . When we investigated the structure of the LGN in humans with lesions of V1, we found that the LGN was completely absent in one case, and that the LGN was significantly reduced in size in another case. These results suggest that the LGN is a critical structure for visual processing in humans with lesions of V1.\nThe cortical changes in monkey S were investigated by acquiring post mortem T2-wei

In [35]:
combine_prompt = """
Given the following text write a condensed and precise paragraph summarizing the relevant points. Remove irrelevant information. 

```{text}```

SUMMARY:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [36]:
llm2 = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [37]:
reduce_chain = load_summarize_chain(llm=llm2,
                             chain_type="stuff",
                             prompt=combine_prompt_template)

In [38]:
output = reduce_chain.run([summaries])

In [39]:
output = post_process_summaries(output)

In [40]:
print(output)

The monkey with the long-standing primary visual cortex lesion has residual visual function, which is mediated by the LGN, pulvinar, SC, and extrastriate visual cortical networks. The results suggest that the structural and functional networks underlying residual visual function are largely intact in this monkey.


In [41]:
print("Ending summarization:" datetime.now()) 

SyntaxError: invalid syntax. Perhaps you forgot a comma? (589089834.py, line 1)