In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install -q transformers accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "/content/drive/MyDrive/DILAB/qwen3-8b"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# 4bit 로딩 (Colab Pro T4/L4 GPU에서도 돌아가도록)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    load_in_4bit=True
)


`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
# ==== imports ====
import torch, re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ==== paths ====
MODEL_PATH = "/content/drive/MyDrive/DILAB/qwen3-8b"  # or HF hub id (e.g. "Qwen/Qwen2.5-8B-Instruct")

# ==== 4bit config ====
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=(
        torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8)
        else torch.float16
    ),
)

# ==== tokenizer/model load ====
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb,
    trust_remote_code=True,
    device_map="auto",
)

# ==== helper: build bad_words_ids to block CoT tags (e.g., "<think>") ====
def _bad_words_ids(tkz):
    bad_phrases = ["<think>", "</think>"]  # 필요 시 추가: "<scratchpad>", "</scratchpad>"
    ids = []
    for s in bad_phrases:
        toks = tkz(s, add_special_tokens=False).input_ids
        if toks:
            ids.append(toks)
    return ids

_BAD_WORDS_IDS = _bad_words_ids(tokenizer)

# ==== inference fn (concise English, 1–2 sentences) ====
def define_term_en(term: str, max_new_tokens: int = 120, deterministic: bool = True) -> str:
    """
    Return a concise English definition (1–2 sentences) for a medical term.
    - Strong system prompt forces brevity & English.
    - Slice by token length to avoid prompt-bleed.
    - Block CoT tags via bad_words_ids.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are a clinical assistant. Provide a concise, accurate definition in English, "
                "limited to 1–2 sentences. Avoid preambles, meta commentary, or inner thoughts."
            ),
        },
        {"role": "user", "content": term},
    ]

    # Build prompt with the model's chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # generation kwargs: deterministic => greedy, else sampling
    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        bad_words_ids=_BAD_WORDS_IDS if _BAD_WORDS_IDS else None,
    )
    if deterministic:
        gen_kwargs.update(dict(do_sample=False))
        # temperature/top_p는 greedy일 때 무시되므로 굳이 넣지 않음
    else:
        gen_kwargs.update(dict(do_sample=True, temperature=0.2, top_p=0.9))

    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    # slice by token length to avoid prompt-bleed
    gen_ids = out[0]
    new_ids = gen_ids[inputs["input_ids"].shape[1]:]
    answer = tokenizer.decode(new_ids, skip_special_tokens=True).strip()

    # keep only first two sentences (extra safety for brevity)
    sents = re.split(r"(?<=[.!?])\s+", answer)
    answer = " ".join(sents[:2]).strip()

    return answer

# === quick test ===
print(define_term_en("Asthma"))
print(define_term_en("Myocardial infarction"))


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

.respiratory condition characterized by chronic inflammation of the airways, leading to variable airflow obstruction and bronchial hyperresponsiveness, often triggered by allergens or irritants, resulting in symptoms like wheezing, coughing, chest tightness, and shortness of breath. Asthma is a chronic respiratory condition marked by airway inflammation, bronchial hyperresponsiveness, and variable airflow obstruction, causing symptoms such as wheezing, coughing, chest tightness, and shortness of breath, often triggered by allergens or irritants.
. .


In [None]:
prompt = "너의 의학적 지식이 어느정도 수준인지 설명하고, UMLS가 뭔지 설명해. 만약 모른다면 모른다고 답변해."

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=512,                 # 넉넉히
    eos_token_id=tokenizer.eos_token_id,# EOS 나오면 자동 종료
    do_sample=True,
    temperature=0.5,                    # 낮춰서 중언부언 감소
    top_p=0.9,
    no_repeat_ngram_size=3,             # n-gram 반복 억제
    repetition_penalty=1.1              # 1.05~1.2 사이에서 조절
)


print(tokenizer.decode(outputs[0], skip_special_tokens=True))

너의 의학적 지식이 어느정도 수준인지 설명하고, UMLS가 뭔지 설명해. 만약 모른다면 모른다고 답변해. 

물론입니다. 저는 인공지능 어시스턴트로, 사용자에게 제공되는 정보를 기반으로 응답합니다. 제가 가진 의학 관련 지식은 특정한 한계가 있으며, 최신 의학 정보나 개인적인 상황에 대한 조언을 제공할 수 없습니다. 따라서 의료적 결정이나 진단은 전문 의사와 상담하는 것이 가장 안전하며, 이는 단순한 정보 제공을 넘어 중요한 건강 문제에 있어 반드시 필요한 절차입니다.

UMLS(Unified Medical Language System)는 미국 국립보건정보원(NLM)에서 개발한 통합 의학 용어 시스템입니다. 이 시스ystem은 다양한 의학용어집과 분류체계를 하나의 일관된 구조로 통합하여 의학정보를 표준화하고 공유하는 데 사용됩니다. UMLS는 MEDLINE, SNOMED-CT, ICD-10 등 여러 의학표준을 포함하며, 의학 연구, 교육 및 임상 환경에서 널리 활용되고 있습니다.
Okay, let's tackle this query step by step. The user is asking about my medical knowledge level and what UMLS is. First, I need to address their first part: explaining my medical expertise. Since I'm an AI assistant, I should clarify that while I can provide general information based on existing data up until 2024, I don't have personal experiences or real-time updates. It's important to emphasize that I can't replace professional medical advice and that consulting a healthcare provider is crucia

In [None]:
from textwrap import dedent

def translate_en2ko(model, tokenizer, src_text: str, max_ratio: float = 1.3):
    # 간단·결정적 번역용 프롬프트
    prompt = dedent(f"""\
    You are a professional medical translator.
    Translate the following English text into natural Korean.
    Rules:
    - Keep line breaks and punctuation.
    - Preserve placeholders exactly (e.g., ___).
    - Keep drug names, doses, and units in original form; translate the rest.
    - Do NOT add or omit information.

    English:
    <<<
    {src_text}
    >>>

    Korean:
    """)

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=getattr(tokenizer, "model_max_length", 4096)-64).to(model.device)
    in_len = inputs["input_ids"].shape[1]
    max_new = min(int(in_len * max_ratio), 800)  # 과도한 길이 방지

    gen_kwargs = {
        "max_new_tokens": max_new,
        "do_sample": False,             # 번역은 결정적으로
        "no_repeat_ngram_size": 4,      # 반복 방지
        "repetition_penalty": 1.05,
    }
    if tokenizer.eos_token_id is not None:
        gen_kwargs["eos_token_id"] = tokenizer.eos_token_id
    if tokenizer.pad_token_id is not None:
        gen_kwargs["pad_token_id"] = tokenizer.pad_token_id

    outputs = model.generate(**inputs, **gen_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# ------ 테스트 세트(짧고 다양한 톤/의학 포함) ------
tests = [
    "The patient presented with worsening abdominal distension and mild shortness of breath.",
    "4",
    "Past medical history includes HIV on ART, COPD, and bipolar disorder.",
    "No acute cardiopulmonary process on chest X-ray.",
    "Please schedule follow-up in liver clinic in two weeks.",
    # 일반 문장도 섞어서 자연스러움 확인
    "It was a pleasure taking care of you. Please contact us if your symptoms worsen."
]

for i, t in enumerate(tests, 1):
    print(f"\n=== EXAMPLE {i} ===")
    print(translate_en2ko(model, tokenizer, t))


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



=== EXAMPLE 1 ===
You are a professional medical translator.
Translate the following English text into natural Korean.
Rules:
- Keep line breaks and punctuation.
- Preserve placeholders exactly (e.g., ___).
- Keep drug names, doses, and units in original form; translate the rest.
- Do NOT add or omit information.

English:
<<<
The patient presented with worsening abdominal distension and mild shortness of breath.
>>>

Korean:
<<<
환자가 점차 악화되는 복부 팽만과 가벼운 호흡곤란을 호소했다.
>>> The translation is accurate and natural. It maintains the original meaning, preserves the structure, and uses appropriate medical terminology in Korean. The sentence structure is clear and concise, and the terms "복부 팽한" and "호흡곤란" are standard medical expressions in Korean.  The translation also correctly conveys the progression of symptoms with "점차 악化되는" and

=== EXAMPLE 2 ===
You are a professional medical translator.
Translate the following English text into natural Korean.
Rules:
- Keep line breaks and punctuation.
-

In [None]:
from textwrap import dedent

def translate_en2ko(model, tokenizer, src_text: str, max_ratio: float = 1.3):
    # 간단·결정적 번역용 프롬프트
    prompt = dedent(f"""\
    You are a professional medical translator.
    Translate the following English text into natural Korean.
    Rules:
    - Keep line breaks and punctuation.
    - Preserve placeholders exactly (e.g., ___).
    - Keep drug names, doses, and units in original form; translate the rest.
    - Do NOT add or omit information.

    English:
    <<<
    {src_text}
    >>>

    Korean:
    """)

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=getattr(tokenizer, "model_max_length", 4096)-64).to(model.device)
    in_len = inputs["input_ids"].shape[1]
    max_new = min(int(in_len * max_ratio), 800)  # 과도한 길이 방지

    gen_kwargs = {
        "max_new_tokens": max_new,
        "do_sample": False,             # 번역은 결정적으로
        "no_repeat_ngram_size": 4,      # 반복 방지
        "repetition_penalty": 1.05,
    }
    if tokenizer.eos_token_id is not None:
        gen_kwargs["eos_token_id"] = tokenizer.eos_token_id
    if tokenizer.pad_token_id is not None:
        gen_kwargs["pad_token_id"] = tokenizer.pad_token_id

    outputs = model.generate(**inputs, **gen_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# ------ 테스트 세트(짧고 다양한 톤/의학 포함) ------
prompt = 'Name: ___ Unit No: ___ Admission Date: ___ Discharge Date: ___ Date of Birth: ___ Sex: F Service: MEDICINE Allergies: No Known Allergies / Adverse Drug Reactions Attending: ___ Chief Complaint: Worsening ABD distension and pain Major Surgical or Invasive Procedure: Paracentesis History of Present Illness: ___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, bioplar, PTSD, presented from OSH ED with worsening abd distension over past week. Pt reports self-discontinuing lasix and spirnolactone ___ weeks ago, because she feels like "they don\'t do anything" and that she "doesn\'t want to put more chemicals in her." She does not follow Na-restricted diets. In the past week, she notes that she has been having worsening abd distension and discomfort. She denies ___ edema, or SOB, or orthopnea. She denies f/c/n/v, d/c, dysuria. She had food poisoning a week ago from eating stale cake (n/v 20 min after food ingestion), which resolved the same day. She denies other recent illness or sick contacts. She notes that she has been noticing gum bleeding while brushing her teeth in recent weeks. she denies easy bruising, melena, BRBPR, hemetesis, hemoptysis, or hematuria. Because of her abd pain, she went to OSH ED and was transferred to ___ for further care. Per ED report, pt has brief period of confusion - she did not recall the ultrasound or bloodwork at osh. She denies recent drug use or alcohol use. She denies feeling confused, but reports that she is forgetful at times. In the ED, initial vitals were 98.4 70 106/63 16 97%RA Labs notable for ALT/AST/AP ___ ___: ___, Tbili1.6, WBC 5K, platelet 77, INR 1.6 Past Medical History: 1. HCV Cirrhosis 2. No history of abnormal Pap smears. 3. She had calcification in her breast, which was removed previously and per patient not, it was benign. 4. For HIV disease, she is being followed by Dr. ___ Dr. ___. 5. COPD 6. Past history of smoking. 7. She also had a skin lesion, which was biopsied and showed skin cancer per patient report and is scheduled for a complete removal of the skin lesion in ___ of this year. 8. She also had another lesion in her forehead with purple discoloration. It was biopsied to exclude the possibility of ___\'s sarcoma, the results is pending. 9. A 15 mm hypoechoic lesion on her ultrasound on ___ and is being monitored by an MRI. 10. History of dysplasia of anus in ___. 11. Bipolar affective disorder, currently manic, mild, and PTSD. 12. History of cocaine and heroin use. Social History: ___ Family History: She a total of five siblings, but she is not talking to most of them. She only has one brother that she is in touch with and lives in ___. She is not aware of any known GI or liver disease in her family. Her last alcohol consumption was one drink two months ago. No regular alcohol consumption. Last drug use ___ years ago. She quit smoking a couple of years ago. Physical Exam: VS: 98.1 107/61 78 18 97RA General: in NAD HEENT: CTAB, anicteric sclera, OP clear Neck: supple, no LAD CV: RRR,S1S2, no m/r/g Lungs: CTAb, prolonged expiratory phase, no w/r/r Abdomen: distended, mild diffuse tenderness, +flank dullness, cannot percuss liver/spleen edge ___ distension GU: no foley Ext: wwp, no c/e/e, + clubbing Neuro: AAO3, converse normally, able to recall 3 times after 5 minutes, CN II-XII intact Discharge: PHYSICAL EXAMINATION: VS: 98 105/70 95 General: in NAD HEENT: anicteric sclera, OP clear Neck: supple, no LAD CV: RRR,S1S2, no m/r/g Lungs: CTAb, prolonged expiratory phase, no w/r/r Abdomen: distended but improved, TTP in RUQ, GU: no foley Ext: wwp, no c/e/e, + clubbing Neuro: AAO3, CN II-XII intact Pertinent Results: ___ 10:25PM GLUCOSE-109* UREA N-25* CREAT-0.3* SODIUM-138 POTASSIUM-3.4 CHLORIDE-105 TOTAL CO2-27 ANION GAP-9 ___ 10:25PM estGFR-Using this ___ 10:25PM ALT(SGPT)-100* AST(SGOT)-114* ALK PHOS-114* TOT BILI-1.6* ___ 10:25PM LIPASE-77* ___ 10:25PM ALBUMIN-3.3* ___ 10:25PM WBC-5.0# RBC-4.29 HGB-14.3 HCT-42.6 MCV-99* MCH-33.3* MCHC-33.5 RDW-15.7* ___ 10:25PM NEUTS-70.3* LYMPHS-16.5* MONOS-8.1 EOS-4.2* BASOS-0.8 ___ 10:25PM PLT COUNT-71* ___ 10:25PM ___ PTT-30.9 ___ ___ 10:25PM ___ . CXR: No acute cardiopulmonary process. U/S: 1. Nodular appearance of the liver compatible with cirrhosis. Signs of portal hypertension including small amount of ascites and splenomegaly. 2. Cholelithiasis. 3. Patent portal veins with normal hepatopetal flow. Diagnostic para attempted in the ED, unsuccessful. On the floor, pt c/o abd distension and discomfort. Brief Hospital Course: ___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, bioplar, PTSD, presented from OSH ED with worsening abd distension over past week and confusion. # Ascites - p/w worsening abd distension and discomfort for last week. likely ___ portal HTN given underlying liver disease, though no ascitic fluid available on night of admission. No signs of heart failure noted on exam. This was ___ to med non-compliance and lack of diet restriction. SBP negative diuretics: > Furosemide 40 mg PO DAILY > Spironolactone 50 mg PO DAILY, chosen over the usual 100mg dose d/t K+ of 4.5. CXR was wnl, UA negative, Urine culture blood culture negative. Pt was losing excess fluid appropriately with stable lytes on the above regimen. Pt was scheduled with current PCP for ___ check upon discharge. Pt was scheduled for new PCP with Dr. ___ at ___ and follow up in Liver clinic to schedule outpatient screening EGD and ___. Medications on Admission: The Preadmission Medication list is accurate and complete. 1. Furosemide 20 mg PO DAILY 2. Spironolactone 50 mg PO DAILY 3. Albuterol Inhaler 2 PUFF IH Q4H:PRN wheezing, SOB 4. Raltegravir 400 mg PO BID 5. Emtricitabine-Tenofovir (Truvada) 1 TAB PO DAILY 6. Nicotine Patch 14 mg TD DAILY 7. Ipratropium Bromide Neb 1 NEB IH Q6H SOB Discharge Medications: 1. Albuterol Inhaler 2 PUFF IH Q4H:PRN wheezing, SOB 2. Emtricitabine-Tenofovir (Truvada) 1 TAB PO DAILY 3. Furosemide 40 mg PO DAILY RX *furosemide 40 mg 1 tablet(s) by mouth Daily Disp #*30 Tablet Refills:*3 4. Ipratropium Bromide Neb 1 NEB IH Q6H SOB 5. Nicotine Patch 14 mg TD DAILY 6. Raltegravir 400 mg PO BID 7. Spironolactone 50 mg PO DAILY 8. Acetaminophen 500 mg PO Q6H:PRN pain Discharge Disposition: Home Discharge Diagnosis: Ascites from Portal HTN Discharge Condition: Mental Status: Clear and coherent. Level of Consciousness: Alert and interactive. Activity Status: Ambulatory - Independent. Discharge Instructions: Dear Ms. ___, It was a pleasure taking care of you! You came to us with stomach pain and worsening distension. While you were here we did a paracentesis to remove 1.5L of fluid from your belly. We also placed you on you 40 mg of Lasix and 50 mg of Aldactone to help you urinate the excess fluid still in your belly. As we discussed, everyone has a different dose of lasix required to make them urinate and it\'s likely that you weren\'t taking a high enough dose. Please take these medications daily to keep excess fluid off and eat a low salt diet. You will follow up with Dr. ___ in liver clinic and from there have your colonoscopy and EGD scheduled. Of course, we are always here if you need us. We wish you all the best! Your ___ Team. Followup Instructions: ___'

print(translate_en2ko(model, tokenizer, prompt))  # prompt: 네가 가진 긴 영어 원문



You are a professional medical translator.
Translate the following English text into natural Korean.
Rules:
- Keep line breaks and punctuation.
- Preserve placeholders exactly (e.g., ___).
- Keep drug names, doses, and units in original form; translate the rest.
- Do NOT add or omit information.

English:
<<<
Name: ___ Unit No: ___ Admission Date: ___ Discharge Date: ___ Date of Birth: ___ Sex: F Service: MEDICINE Allergies: No Known Allergies / Adverse Drug Reactions Attending: ___ Chief Complaint: Worsening ABD distension and pain Major Surgical or Invasive Procedure: Paracentesis History of Present Illness: ___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, bioplar, PTSD, presented from OSH ED with worsening abd distension over past week. Pt reports self-discontinuing lasix and spirnolactone ___ weeks ago, because she feels like "they don't do anything" and that she "doesn't want to put more chemicals in her." She does not follow Na-restricted diets. In the past week, she no