In [1]:
import torch, textwrap
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

  from .autonotebook import tqdm as notebook_tqdm


# Load model

In [2]:
MODEL_ID = "johnsnowlabs/JSL-MedLlama-3-8B-v2.0"
DEVICE   = "cuda" if torch.cuda.is_available() else "CPU"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    return_tensors="pt",
    padding = True,
    truncation = True)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, load_in_8bit = True, device_map = "auto")
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

model = torch.compile(model)
model.eval()

2025-07-12 17:27:03.333745: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-12 17:27:03.365643: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-12 17:27:03.365670: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-12 17:27:03.366698: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-12 17:27:03.372421: I tensorflow/core/platform/cpu_feature_guar

OptimizedModule(
  (_orig_mod): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 4096)
      (layers): ModuleList(
        (0-31): 32 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
            (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
            (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
            (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
            (up_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
            (down_proj): Linear8bitLt(in_features=14336, out_features=4096, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
          (post_attention_layern

# Some medical text

In [3]:
medical_text = """
Non-steroidal anti-inflammatory drugs are not only potent analgesics and antipyretics but also nephrotoxins, and may cause 
electrolyte disarray. In addition to the commonly expected effects, including hyperkalemia, hyponatremia, acute renal injury, 
renal cortical necrosis, and volume retention, glomerular disease with or without nephrotic syndrome or nephritis can occur as 
well including after years of seemingly safe administration. Minimal change disease, secondary membranous glomerulonephritis, 
and acute interstitial nephritis are all reported glomerular lesions seen with non-steroidal anti-inflammatory use. We report a 
patient who used non-steroidal anti-inflammatory drugs for years without diabetes, chronic kidney disease, or proteinuria; he 
then developed severe nephrotic range proteinuria with 7 g of daily urinary protein excretion. Renal biopsy showed minimal 
change nephropathy, a likely secondary membranous glomerulonephritis, and acute interstitial nephritis present simultaneously
in one biopsy. 

"""

In [None]:
gen_cfg = GenerationConfig(
    max_new_tokens = 64,
    temperature = 0.1,
    top_p = 0.9,
    repetition_penalty = 1.1,
    do_sample = True,
    no_repeat_ngram_size = 6,
)
    
def summarize(medical_text):
    messages = [
        {
            "role": "user",
            "content": textwrap.dedent(f"""
                Below is an abstract from a medical paper.
    
                ```text
                {medical_text.strip()}
                ```
    
                **Task:** Produce a 20-word summary **and end with a full stop (.) when you are done.**
                Use clear, professional medical language.
                Don't include a greeting or introduction.
            """),
        }
    ]
    
    if tokenizer.chat_template is None:
        tokenizer.chat_template = textwrap.dedent("""
        <|im_start|>system
        You are a concise, professional medical writing assistant. <|im_end|>
        {% for m in messages %}
        <|im_start|>{{ m['role'] }}
        {{ m['content'] }}<|im_end|>
        {% endfor %}
        {% if add_generation_prompt %}<|im_start|>assistant
        {% endif %}
        """).strip()

    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True,
        tokenize = True
    )
    
    batch = tokenizer.pad(
        [{"input_ids": encoded}],
        return_tensors = "pt",
        padding = True).to(model.device)
        
    with torch.no_grad():
        generated = model.generate(
            input_ids = batch['input_ids'], 
            generation_config = gen_cfg, 
            return_dict_in_generate = False, 
            max_new_tokens = 128,
            attention_mask  = batch["attention_mask"])
    
    summary = tokenizer.decode(generated[0], skip_special_tokens = True)
    response_text = textwrap.fill(summary, 90).split('im_start|>assistant')[-1]
    response_text = response_text.replace('<|im_end|>', '').replace("\n", " ").strip()
    return response_text

# Inference

In [5]:
summarize(medical_text)

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`generation_config` default values have been modified to match model-specific defaults: {'use_cache': False, 'bos_token_id': 128000, 'eos_token_id': 128001}. If this is not desired, please set these values explicitly.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'Non-steroidal anti-inflammatory drug use can lead to various glomerular diseases, including minimal change nephropathy, membranous glomerulo-nephritis, and acute interstitital nephritis, which may result in nephrotic syndrome..'

# Persistence

In [6]:
import pandas as pd
import csv
df = pd.read_csv('./mtsamples.csv')
df['transcription'] = df.transcription.astype(str)
df['description'] = df.description.astype(str)

In [7]:
df['med-llama-summary'] = df.transcription.apply(summarize)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [None]:
df.to_csv('mtsamples_with_llama.csv', index = False, quoting = csv.QUOTE_NONNUMERIC)