In [1]:
import os, re, glob
import torch, numpy as np, pandas as pd, sklearn
import transformers, datasets, evaluate, spacy, sacrebleu
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import faiss
from rouge_score import rouge_scorer
import xml.etree.ElementTree as ET
from urllib.parse import urlparse

# Environment smoke test for the project: verifies that the core libraries
# import, the models load on the selected device, and the retrieval/metric
# tooling runs. Each check is best-effort (a failure is reported, not raised)
# so that a single broken component does not hide the status of the others.

# Names of the checks that raised; the final banner is derived from this list
# so it can no longer claim readiness after a failure.
failed_checks = []

# Must be set before the first tokenizer call: huggingface/tokenizers otherwise
# emits a "process just got forked" warning and disables parallelism on its own.
# setdefault() keeps any value the user already exported.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

print("--- System and Library Status ---")
print(f"Python OK: pandas {pd.__version__}, numpy {np.__version__}, sklearn {sklearn.__version__}")

# 1. PyTorch & GPU (M4 Mac) Check
print(f"Torch Version: {torch.__version__}")
mps_available = torch.backends.mps.is_available()
print(f"MPS (M4 GPU) available: {mps_available}")
# Define device once so every model below is exercised on the same hardware.
DEVICE = torch.device("mps" if mps_available else "cpu")
print(f"Using Device: {DEVICE}")

# 2. Base Model and Tokenizer Check (Abstractive QA Task)
print("\n--- Task 1/3: Generation (T5) Check ---")
try:
    tok = T5TokenizerFast.from_pretrained("t5-small")
    mdl = T5ForConditionalGeneration.from_pretrained("t5-small").to(DEVICE)  # Move model to GPU/MPS

    # Only the execution flow is tested here. The prompt carries no T5 task
    # prefix, so the generated text is not a meaningful answer (the base
    # checkpoint has been observed to emit a truncated German translation).
    test_question = "What is the primary treatment for glaucoma?"
    x = tok(test_question, return_tensors="pt", truncation=True).input_ids.to(DEVICE)
    y = mdl.generate(x, max_new_tokens=10)
    print("T5 Model Load & Generation OK ->", tok.decode(y[0], skip_special_tokens=True))
except Exception as e:
    failed_checks.append("T5 generation")
    print(f"T5 Model Test FAILED: {e}")

# 3. Embedding Model Check (Paraphrase/Dense RAG)
print("\n--- Task 2: Paraphrase/Embedding Check ---")
try:
    st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=str(DEVICE))
    sample_texts = ["leukemia symptoms", "blood cancer indicators"]
    # Encoding should successfully run on the determined DEVICE
    vecs = st_model.encode(sample_texts, convert_to_tensor=True)
    print(f"SBERT Model Load & Encode OK: Generated {len(vecs)} vectors on {vecs.device}")
except Exception as e:
    failed_checks.append("SBERT embedding")
    print(f"SBERT Model Test FAILED: {e}")

# 4. Retrieval Indexing Check (RAG Task)
print("\n--- Task 3: Retrieval Index Check ---")
try:
    # A. BM25 (Lexical) Check
    test_corpus = [["adult", "acute", "leukemia"], ["pediatric", "flu", "treatment"], ["heart", "disease", "diagnosis"]]
    bm = BM25Okapi(test_corpus)
    scores = bm.get_scores(["leukemia", "acute"])
    # One score per corpus document is the minimum evidence that scoring ran.
    if len(scores) != len(test_corpus):
        raise RuntimeError(f"BM25 returned {len(scores)} scores for {len(test_corpus)} documents")
    print("BM25 Load & Scoring OK.")

    # B. FAISS (Dense) Check
    # Use embedding dimension (384 for MiniLM-L6-v2) for index creation
    D_size = 384
    index = faiss.IndexFlatIP(D_size)
    # Synthetic random vectors are enough to exercise add(); FAISS expects float32.
    synthetic_vecs = np.random.randn(10, D_size).astype('float32')
    index.add(synthetic_vecs)
    if index.ntotal != len(synthetic_vecs):
        raise RuntimeError(f"FAISS indexed {index.ntotal} of {len(synthetic_vecs)} vectors")
    print(f"FAISS Load & Indexing OK: {index.ntotal} vectors indexed.")
except Exception as e:
    failed_checks.append("retrieval indexing")
    print(f"Retrieval Indexing FAILED: {e}")

# 5. Metric Check (Evaluation)
print("\n--- Evaluation Metrics Check ---")
try:
    _ = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    _ = evaluate.load("bertscore")
    _ = sacrebleu.corpus_bleu(["test"], [["test"]])  # Simple sacreBLEU call
    print("All Evaluation Metrics (ROUGE, BERTScore, sacreBLEU) Load OK.")
except Exception as e:
    failed_checks.append("evaluation metrics")
    print(f"Metric Loading FAILED: {e}")

# Report readiness only when every check above actually passed.
if failed_checks:
    print(f"\nPROJECT SETUP INCOMPLETE ❌ - failed checks: {', '.join(failed_checks)}")
else:
    print("\nALL PROJECT COMPONENTS READY ✅")

  from .autonotebook import tqdm as notebook_tqdm


--- System and Library Status ---
Python OK: pandas 2.3.3, numpy 2.2.6, sklearn 1.7.2
Torch Version: 2.8.0
MPS (M4 GPU) available: True
Using Device: mps

--- Task 1/3: Generation (T5) Check ---
T5 Model Load & Generation OK -> Welches ist der primäre Behandlung für Gla

--- Task 2: Paraphrase/Embedding Check ---


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


SBERT Model Load & Encode OK: Generated 2 vectors on mps:0

--- Task 3: Retrieval Index Check ---
BM25 Load & Scoring OK.
FAISS Load & Indexing OK: 10 vectors indexed.

--- Evaluation Metrics Check ---
All Evaluation Metrics (ROUGE, BERTScore, sacreBLEU) Load OK.

ALL PROJECT COMPONENTS READY ✅
