In [1]:
import torch
from transformers import AutoModel

MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"

# Check available devices
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

try:
    model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float32, device_map=device)
    print("Model successfully loaded on", device)
except Exception as e:
    print("Error loading model:", e)



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/abirsleem/Documents/Abir/NewJOB/MediCureOn/AzureRepos/AI_ML/venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/abirsleem/Documents/Abir/NewJOB/MediCureOn/AzureRepos/AI_ML/venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/abirsleem/Documents/Abir/NewJOB/Med

Using device: cpu
Model successfully loaded on cpu


In [2]:
import torch

print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")


Torch version: 2.2.2
CUDA available: False
Device count: 0
GPU Name: No GPU found


In [3]:
import torch
print("MPS available:", torch.backends.mps.is_available())


MPS available: True


In [67]:
import torch
import time
from transformers import BertTokenizer, BertForQuestionAnswering

# Define model and tokenizer
MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

# Sample question and context
question = "What is deep learning?"
context = "Deep learning is a subset of machine learning that uses neural networks with multiple layers."

# Tokenize input (Converts text into token IDs (numbers).return_tensors="pt" → Ensures PyTorch format)
inputs = tokenizer(question, context, return_tensors="pt")

def test_inference(device_name):
    """Runs inference on the given device and records time."""
    device = torch.device(device_name)
    model.to(device)
    inputs_on_device = {key: val.to(device) for key, val in inputs.items()}

    # Run inference and measure time
    start_time = time.time()
    # torch.no_grad() → Disables gradient calculation (faster inference)
    with torch.no_grad():
        outputs = model(**inputs_on_device)
    end_time = time.time()

    elapsed_time = end_time - start_time
    print(f"Inference Time on {device_name.upper()}: {elapsed_time:.4f} seconds")

# Run on CPU
test_inference("cpu")

# Run on MPS (if available)
if torch.backends.mps.is_available():
    test_inference("mps")
else:
    print("MPS is not available on this system.")

Inference Time on CPU: 3.8397 seconds
Inference Time on MPS: 0.5779 seconds


In [1]:
import torch
import time
from transformers import pipeline
import transformers
transformers.utils.logging.set_verbosity_error()  # Suppresses unnecessary warnings

# Load QA pipeline
qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")

# Define question and context
question = "What is deep learning?"
context = "Deep learning is a subset of machine learning that uses neural networks with multiple layers."

# Function to run inference and measure time
def test_pipeline(device):
    start_time = time.time()
    answer = qa_pipeline(question=question, context=context, device=0 if device == "mps" else -1)
    end_time = time.time()
    
    print(f"\nDevice: {device.upper()}")
    print(f"Inference Time: {end_time - start_time:.4f} seconds")
    print(f"Model's Answer: {answer['answer']}")

# Run on MPS if available
if torch.backends.mps.is_available():
    test_pipeline("mps")
else:
    print("\nMPS is not available on this system.")

# Run on CPU
test_pipeline("cpu")


  from .autonotebook import tqdm as notebook_tqdm



Device: MPS
Inference Time: 2.2608 seconds
Model's Answer: a subset of machine learning

Device: CPU
Inference Time: 0.3958 seconds
Model's Answer: a subset of machine learning


In [2]:
import torch
import time
from transformers import pipeline
import transformers
transformers.utils.logging.set_verbosity_error()  # Suppresses unnecessary warnings

# Load QA pipeline
qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")

# Define question and context
question = "What are the potential applications of quantum computing, and how does quantum entanglement contribute to these applications?"
context = "Quantum computing is an emerging field that leverages the principles of quantum mechanics to process information in fundamentally new ways. In classical computing, data is processed in binary units called bits, which can be either 0 or 1. However, quantum computing introduces quantum bits, or qubits, which can exist in multiple states simultaneously due to a phenomenon known as superposition. This allows quantum computers to perform complex calculations at much higher speeds than classical computers in certain applications. Quantum entanglement, another key quantum phenomenon, enables qubits to be correlated with each other, even when separated by vast distances. This opens up possibilities for new technologies, such as quantum cryptography and quantum teleportation. Despite its potential, quantum computing is still in the experimental stage, and practical, large-scale quantum computers have yet to be realized. Researchers are exploring various quantum algorithms, such as Shor’s algorithm for factoring large numbers and Grover’s algorithm for searching unsorted databases, which promise to revolutionize fields like cryptography, optimization, and machine learning."

# Function to run inference and measure time
def test_pipeline(device):
    start_time = time.time()
    answer = qa_pipeline(question=question, context=context, device=0 if device == "mps" else -1)
    end_time = time.time()
    
    print(f"\nDevice: {device.upper()}")
    print(f"Inference Time: {end_time - start_time:.4f} seconds")
    print(f"Model's Answer: {answer['answer']}")

# Run on MPS if available
if torch.backends.mps.is_available():
    test_pipeline("mps")
else:
    print("\nMPS is not available on this system.")

# Run on CPU
test_pipeline("cpu")



Device: MPS
Inference Time: 4.3032 seconds
Model's Answer: quantum cryptography and quantum teleportation

Device: CPU
Inference Time: 4.1950 seconds
Model's Answer: quantum cryptography and quantum teleportation


In [3]:
import torch
import time
from transformers import pipeline
import transformers
transformers.utils.logging.set_verbosity_error()  # Suppresses unnecessary warnings

# Load QA pipeline
qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")

# Define question and context
question = "What are the key differences between using a CPU and a GPU for training a deep learning model?"
context = "In machine learning (ML), the choice of hardware can significantly impact model training and inference times. CPUs (Central Processing Units) are general-purpose processors that handle a wide range of tasks. They are good at handling sequential tasks but struggle with parallel tasks, which makes them less suitable for computationally intensive ML tasks like deep learning. On the other hand, GPUs (Graphics Processing Units) are designed to handle many tasks simultaneously, making them ideal for parallel processing. This is particularly useful in machine learning, where large amounts of data need to be processed in parallel. GPUs are widely used in training deep neural networks and other ML models because they can perform multiple calculations at once, reducing the time required for training. However, they are more expensive than CPUs and might not be necessary for smaller datasets or simpler models."

# Function to run inference and measure time
def test_pipeline(device):
    start_time = time.time()
    answer = qa_pipeline(question=question, context=context, device=0 if device == "mps" else -1)
    end_time = time.time()
    
    print(f"\nDevice: {device.upper()}")
    print(f"Inference Time: {end_time - start_time:.4f} seconds")
    print(f"Model's Answer: {answer['answer']}")

# Run on MPS if available
if torch.backends.mps.is_available():
    test_pipeline("mps")
else:
    print("\nMPS is not available on this system.")

# Run on CPU
test_pipeline("cpu")



Device: MPS
Inference Time: 5.0591 seconds
Model's Answer: they are more expensive than CPUs

Device: CPU
Inference Time: 5.1917 seconds
Model's Answer: they are more expensive than CPUs


In [71]:
import torch
import time
from transformers import pipeline
import transformers
transformers.utils.logging.set_verbosity_error()  # Suppresses unnecessary warnings

# Load QA pipeline
qa_pipeline = pipeline("question-answering", model="t5-large")

# Define question and context
question = "What are the key differences between using a CPU and a GPU for training a deep learning mode?"
context = "In machine learning (ML), the choice of hardware can significantly impact model training and inference times. CPUs (Central Processing Units) are general-purpose processors that handle a wide range of tasks. They are good at handling sequential tasks but struggle with parallel tasks, which makes them less suitable for computationally intensive ML tasks like deep learning. On the other hand, GPUs (Graphics Processing Units) are designed to handle many tasks simultaneously, making them ideal for parallel processing. This is particularly useful in machine learning, where large amounts of data need to be processed in parallel. GPUs are widely used in training deep neural networks and other ML models because they can perform multiple calculations at once, reducing the time required for training. However, they are more expensive than CPUs and might not be necessary for smaller datasets or simpler models."

# Function to run inference and measure time
def test_pipeline(device):
    start_time = time.time()
    answer = qa_pipeline(question=question, context=context, device=0 if device == "mps" else -1)
    end_time = time.time()
    
    print(f"\nDevice: {device.upper()}")
    print(f"Inference Time: {end_time - start_time:.4f} seconds")
    print(f"Model's Answer: {answer['answer']}")

# Run on MPS if available
if torch.backends.mps.is_available():
    test_pipeline("mps")
else:
    print("\nMPS is not available on this system.")

# Run on CPU
test_pipeline("cpu")



Device: MPS
Inference Time: 3.3236 seconds
Model's Answer: Processing Units) are general-purpose processors that handle a

Device: CPU
Inference Time: 3.0532 seconds
Model's Answer: Processing Units) are general-purpose processors that handle a


In [72]:
import torch
import time
from transformers import pipeline
import transformers
transformers.utils.logging.set_verbosity_error()  # Suppresses unnecessary warnings

# Load QA pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

# Define question and context
question = "What are the key differences between using a CPU and a GPU for training a deep learning model?"
context = "In machine learning (ML), the choice of hardware can significantly impact model training and inference times. CPUs (Central Processing Units) are general-purpose processors that handle a wide range of tasks. They are good at handling sequential tasks but struggle with parallel tasks, which makes them less suitable for computationally intensive ML tasks like deep learning. On the other hand, GPUs (Graphics Processing Units) are designed to handle many tasks simultaneously, making them ideal for parallel processing. This is particularly useful in machine learning, where large amounts of data need to be processed in parallel. GPUs are widely used in training deep neural networks and other ML models because they can perform multiple calculations at once, reducing the time required for training. However, they are more expensive than CPUs and might not be necessary for smaller datasets or simpler models."

# Function to run inference and measure time
def test_pipeline(device):
    start_time = time.time()
    answer = qa_pipeline(question=question, context=context, device=0 if device == "mps" else -1)
    end_time = time.time()
    
    print(f"\nDevice: {device.upper()}")
    print(f"Inference Time: {end_time - start_time:.4f} seconds")
    print(f"Model's Answer: {answer['answer']}")

# Run on MPS if available
if torch.backends.mps.is_available():
    test_pipeline("mps")
else:
    print("\nMPS is not available on this system.")

# Run on CPU
test_pipeline("cpu")



Device: MPS
Inference Time: 0.8949 seconds
Model's Answer: they are more expensive

Device: CPU
Inference Time: 0.8218 seconds
Model's Answer: they are more expensive


In [73]:
import torch
import time
from transformers import pipeline
import transformers
transformers.utils.logging.set_verbosity_error()  # Suppresses unnecessary warnings

# Load QA pipeline
qa_pipeline = pipeline("question-answering", model="albert-base-v2")

# Define question and context
question = "What are the key differences between using a CPU and a GPU for training a deep learning mode?"
context = "In machine learning (ML), the choice of hardware can significantly impact model training and inference times. CPUs (Central Processing Units) are general-purpose processors that handle a wide range of tasks. They are good at handling sequential tasks but struggle with parallel tasks, which makes them less suitable for computationally intensive ML tasks like deep learning. On the other hand, GPUs (Graphics Processing Units) are designed to handle many tasks simultaneously, making them ideal for parallel processing. This is particularly useful in machine learning, where large amounts of data need to be processed in parallel. GPUs are widely used in training deep neural networks and other ML models because they can perform multiple calculations at once, reducing the time required for training. However, they are more expensive than CPUs and might not be necessary for smaller datasets or simpler models."

# Function to run inference and measure time
def test_pipeline(device):
    start_time = time.time()
    answer = qa_pipeline(question=question, context=context, device=0 if device == "mps" else -1)
    end_time = time.time()
    
    print(f"\nDevice: {device.upper()}")
    print(f"Inference Time: {end_time - start_time:.4f} seconds")
    print(f"Model's Answer: {answer['answer']}")

# Run on MPS if available
if torch.backends.mps.is_available():
    test_pipeline("mps")
else:
    print("\nMPS is not available on this system.")

# Run on CPU
test_pipeline("cpu")



Device: MPS
Inference Time: 1.6644 seconds
Model's Answer: general-purpose processors that handle a wide range of tasks. They are

Device: CPU
Inference Time: 1.6033 seconds
Model's Answer: general-purpose processors that handle a wide range of tasks. They are


In [74]:
import torch
import time
from transformers import pipeline, AutoTokenizer
import transformers

transformers.utils.logging.set_verbosity_error()  # Suppresses unnecessary warnings

# Load QA pipeline and tokenizer
model_name = "deepset/roberta-large-squad2"
qa_pipeline = pipeline("question-answering", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define question and context
question = "What are the key differences between using a CPU and a GPU for training a deep learning model?"
context = """In machine learning (ML), the choice of hardware can significantly impact model training and inference times. 
CPUs (Central Processing Units) are general-purpose processors that handle a wide range of tasks. They are good at handling sequential 
tasks but struggle with parallel tasks, which makes them less suitable for computationally intensive ML tasks like deep learning. 
On the other hand, GPUs (Graphics Processing Units) are designed to handle many tasks simultaneously, making them ideal for parallel 
processing. This is particularly useful in machine learning, where large amounts of data need to be processed in parallel. GPUs are 
widely used in training deep neural networks and other ML models because they can perform multiple calculations at once, reducing 
the time required for training. However, they are more expensive than CPUs and might not be necessary for smaller datasets or 
simpler models."""

# Token count
context_tokens = tokenizer.encode(context, add_special_tokens=False)
question_tokens = tokenizer.encode(question, add_special_tokens=False)
combined_tokens = tokenizer.encode(question, context, truncation=True, max_length=tokenizer.model_max_length)

total_context_tokens = len(context_tokens)
total_question_tokens = len(question_tokens)
total_combined_tokens = len(combined_tokens)
max_token_limit = tokenizer.model_max_length

# Print token statistics
print(f"Total Tokens in Context: {total_context_tokens}")
print(f"Total Tokens in Question: {total_question_tokens}")
print(f"Total Tokens (Question + Context Combined): {total_combined_tokens}")
print(f"Max Token Limit of {model_name}: {max_token_limit}")

# Function to run inference and measure time
def test_pipeline(device):
    start_time = time.time()
    answer = qa_pipeline(question=question, context=context, truncation=True)
    end_time = time.time()
    
    print(f"\nDevice: {device.upper()}")
    print(f"Inference Time: {end_time - start_time:.4f} seconds")
    print(f"Model's Answer: {answer['answer']}")

# Run on MPS if available
if torch.backends.mps.is_available():
    test_pipeline("mps")
else:
    print("\nMPS is not available on this system.")

# Run on CPU
test_pipeline("cpu")


Total Tokens in Context: 179
Total Tokens in Question: 19
Total Tokens (Question + Context Combined): 202
Max Token Limit of deepset/roberta-large-squad2: 512

Device: MPS
Inference Time: 3.1781 seconds
Model's Answer: they are more expensive than CPUs

Device: CPU
Inference Time: 2.9168 seconds
Model's Answer: they are more expensive than CPUs


In [78]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load LLaMA 3.2 model
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # You can change to a smaller version if needed

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Define prompt
prompt = "Explain the key differences between using a CPU and a GPU for deep learning."

# Function to test inference time
def run_inference(device):
    # Load model on specified device
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16 if device == "mps" else torch.float32)
    model.to(device)

    # Create pipeline
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "mps" else -1)

    # Measure inference time
    start_time = time.time()
    response = generator(prompt, max_length=100, do_sample=True)
    end_time = time.time()

    # Print results
    print(f"\nDevice: {device.upper()}")
    print(f"Inference Time: {end_time - start_time:.4f} seconds")
    print(f"Model Response: {response[0]['generated_text']}")


# Run on CPU
run_inference("cpu")



Device: CPU
Inference Time: 5.1848 seconds
Model Response: Explain the key differences between using a CPU and a GPU for deep learning. What is the maximum number of parameters that can be processed efficiently using a GPU compared to a CPU?


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

MODEL_NAME = "microsoft/Phi-3-mini-128k-instruct"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32, device_map="cpu")  # Force CPU

# Initialize pipeline
llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Define prompt
prompt = "What are the advantages of using a GPU over a CPU in deep learning?"

# Run inference
output = llm_pipeline(prompt, max_length=100)
print(output[0]['generated_text'])


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:32<00:00, 16.34s/it]
Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


KeyboardInterrupt: 

In [None]:
# Run it on Azure notebook
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model Name
model_name = "microsoft/Phi-3-mini-128k-instruct"

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sample prompt
prompt = "Explain the importance of machine learning in healthcare."

# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt")

# Function to measure inference time
def measure_inference_time(model, inputs, device):
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move input tensors to device
    
    # Warm-up run (helps get accurate timing by avoiding initialization overhead)
    model.generate(**inputs, max_new_tokens=50)

    torch.cuda.synchronize() if device == "cuda" else None  # Ensure previous ops are done

    # Measure inference time
    start_time = time.time()
    output = model.generate(**inputs, max_new_tokens=50)
    torch.cuda.synchronize() if device == "cuda" else None  # Ensure completion

    end_time = time.time()
    inference_time = end_time - start_time

    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return inference_time, output_text

### **1️⃣ Run on CPU**
print("Loading model on CPU...")
model_cpu = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)
cpu_time, cpu_output = measure_inference_time(model_cpu, inputs, "cpu")
print(f"\nCPU Inference Time: {cpu_time:.2f} seconds")
print(f"CPU Output: {cpu_output}")

### **2️⃣ Run on GPU (FP16 for memory efficiency)**
if torch.cuda.is_available():
    print("\nLoading model on GPU...")
    model_gpu = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
    gpu_time, gpu_output = measure_inference_time(model_gpu, inputs, "cuda")

    print(f"\nGPU Inference Time: {gpu_time:.2f} seconds")
    print(f"GPU Output: {gpu_output}")

    # Speed improvement
    speedup = cpu_time / gpu_time
    print(f"\n🚀 Speedup (GPU vs CPU): {speedup:.2f}x faster!")

else:
    print("\n❌ No GPU detected! Skipping GPU test.")


In [None]:
# Run it on Azure notebook
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load LLaMA model
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" 

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Define prompt
prompt = "Explain the key differences between using a CPU and a GPU for deep learning."

# Function to test inference time
def run_inference(device):
    # Load model on specified device
    dtype = torch.float16 if device == "cuda" else torch.float32
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=dtype)
    model.to(device)

    # Create pipeline
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)

    # Measure inference time
    start_time = time.time()
    response = generator(prompt, max_length=100, do_sample=True)
    end_time = time.time()

    # Print results
    print(f"\nDevice: {device.upper()}")
    print(f"Inference Time: {end_time - start_time:.4f} seconds")
    print(f"Model Response: {response[0]['generated_text']}")

# Check available devices and run inference
if torch.cuda.is_available():
    print("\nRunning on GPU (CUDA)...")
    run_inference("cuda")
else:
    print("\nGPU is not available. Running on CPU...")

print("\nRunning on CPU...")
run_inference("cpu")

