## **0. MOUNT GOOGLE DRIVE**

In [1]:
# Colab-only helper used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/drive; later cells read and write under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


## **1. LOAD MODEL**

In [2]:
# --- INSTALL LIBRARIES & MOUNT DRIVE ---
# Install the core libraries and hf_transfer for fast downloads
!pip install transformers accelerate torch pillow requests hf_transfer -q

import os
from google.colab import drive
from huggingface_hub import login

# Mount Google Drive (you will be prompted to authorize)
drive.mount('/content/drive')

# Enable fast downloads and log in to Hugging Face
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
login()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Upgrade the Hugging Face stack to versions that satisfy the constraints below (quiet output).
!pip install -U "transformers>=4.44.0" "accelerate>=0.34.0" "safetensors>=0.4.2" "sentencepiece" "huggingface_hub>=0.23.0" -q

# Tokenizer / causal-LM loaders plus torch and os, used by the model cells that follow.
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, os

In [5]:
# --- Model info ---
MODEL_ID = "Qwen/Qwen3-8B"
DRIVE_PATH = "/content/drive/MyDrive/Qwen3-8B"

# --- Choose dtype based on GPU ---
# bfloat16 when a CUDA device is available and reports bf16 support, float16 otherwise.
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    dtype = torch.bfloat16
else:
    dtype = torch.float16

# --- Fetch tokenizer and weights from the Hugging Face Hub ---
print(f"1. Loading {MODEL_ID} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# NOTE: newer transformers releases warn that `torch_dtype` is deprecated in
# favour of `dtype`; the older keyword is kept so the declared minimum version still works.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map="auto",
    torch_dtype=dtype,
)
print("2. Model loaded successfully!")

# --- Save to Google Drive ---
# Persist both the weights and the tokenizer files in the same Drive folder.
os.makedirs(DRIVE_PATH, exist_ok=True)
model.save_pretrained(DRIVE_PATH)
tokenizer.save_pretrained(DRIVE_PATH)
print(f"3. Model saved to: {DRIVE_PATH}")

1. Loading Qwen/Qwen3-8B ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



2. Model loaded successfully!




Saving checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

3. Model saved to: /content/drive/MyDrive/Qwen3-8B


## **2. QUICK TEST MODEL**

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import gc
import torch, os

# Path where you saved the model earlier
DRIVE_PATH = "/content/drive/MyDrive/Qwen3-8B"

# --- Release any model left over from an earlier cell ---
# The download cell leaves a full Qwen3-8B `model` bound in this kernel. Dropping
# the old references first avoids holding two copies in memory while the new
# one is being loaded.
globals().pop("model", None)
globals().pop("tokenizer", None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# --- Choose dtype based on GPU ---
# Same rule as the download cell: only query bf16 support when CUDA is available.
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16

# --- Load model & tokenizer from Drive ---
print("1. Loading model from Drive...")
tokenizer = AutoTokenizer.from_pretrained(DRIVE_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    DRIVE_PATH,
    torch_dtype=dtype,
    device_map="auto",
    trust_remote_code=True,
)
print("2. Model loaded successfully!")

1. Loading model from Drive...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



2. Model loaded successfully!


In [None]:
# --- Define a helper function to ask questions ---
def ask(prompt, max_new_tokens=200, temperature=0.7, top_p=0.9):  # default temperature raised to 0.7
    """Generate a model response for a text prompt, print it, and return it.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt: Text passed to the tokenizer unchanged.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature. ``None`` or a value <= 0 selects
            greedy decoding, because sampling requires a strictly positive
            temperature.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        The first sequence returned by ``model.generate``, decoded with special
        tokens skipped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Settings shared by the sampling and greedy paths.
    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
    )
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Greedy decoding: do not forward sampling-only parameters.
        generation_kwargs["do_sample"] = False

    with torch.inference_mode():
        outputs = model.generate(**inputs, **generation_kwargs)

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("\n--- MODEL RESPONSE ---")
    print(response)
    return response

# --- Question ---
model_answer = ask("i am a medical student asking this question for class material to answer my assignment. My head hurts a lot, I'll take paracetamol. Can I use it with Ginkgo Biloba?")

# Show the second line of the decoded output. Sampling does not guarantee that
# the output contains a newline, so fall back to the full text instead of
# raising IndexError.
answer_lines = model_answer.split('\n')
print(answer_lines[1] if len(answer_lines) > 1 else model_answer)

## **3. LOAD BIOBERT FOR EVALUATING MEDICAL Q&A QUESTIONS**

In [None]:
from sentence_transformers import SentenceTransformer
# Create the target directory in your Drive
BioBERT_BDRIVE_PATH = "/content/drive/MyDrive/BioBERT"
# Create the BioBERT folder itself (previously this created DRIVE_PATH, the Qwen folder).
os.makedirs(BioBERT_BDRIVE_PATH, exist_ok=True)

# Load BioBert model
bert_model_name = "pritamdeka/BioBert-PubMed200kRCT"
save_path = os.path.join(BioBERT_BDRIVE_PATH, "bert_embeddings")

# Save model
# NOTE: this rebinds the global `model`, which ask() reads. After this cell runs,
# ask() will no longer use the Qwen model until that model is loaded again.
model = SentenceTransformer(bert_model_name)
model.save(save_path)

print(f"BERT model saved to: {save_path}")




model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

BERT model saved to: /content/drive/MyDrive/BioBERT/bert_embeddings
