In [None]:
# This seems to work pretty well!

import torch
from transformers import GPTNeoForCausalLM, AutoTokenizer

def ask_question_gptneo(question, context):
    """Answer ``question`` from ``context`` with GPT-Neo 2.7B.

    The model and tokenizer are loaded with ``from_pretrained`` on every call,
    prompted with a "Context / Question / Answer:" template, and sampled for at
    most 10 new tokens.

    Args:
        question: Question text inserted into the prompt.
        context: Supporting text inserted into the prompt.

    Returns:
        The generated continuation with leading whitespace removed, cut off
        just before its first "." (the whole continuation if it has no ".").
    """
    # Load pre-trained model and tokenizer
    model_name = "EleutherAI/gpt-neo-2.7B"
    model = GPTNeoForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Combine context and question into a single prompt
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate the answer
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=10,  # Limit the number of new tokens generated
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=inputs.attention_mask,
        temperature=0.3,  # Lower temperature for more deterministic output
        top_k=5,  # Limit the number of possible next tokens
        do_sample=True  # Enable sampling
    )

    # The returned sequence starts with the prompt tokens. Decode only what
    # comes after them instead of searching the full text for "Answer:", which
    # would pick up the wrong span whenever the context or question itself
    # contains "Answer:".
    prompt_length = inputs.input_ids.shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Keep only the first sentence of the completion
    return completion.strip().split('.')[0]

# Example usage: the context states the answer outright, so a correct reply
# only has to restate it.
context = "Andrew's favorite color is violet."
question = "What is Andrew's favorite color?"
answer = ask_question_gptneo(question, context)
# Sampling is enabled inside the function, so the text can vary between runs.
# The recorded run printed "Andrew's favorite color is violet" (no trailing
# period, because only the text before the first "." is returned).
print(answer)

Andrew's favorite color is violet


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import contextlib
# import bitsandbytes as bnb

# Define a null context manager
@contextlib.contextmanager
def null_context():
    """No-op context manager: enters and exits without doing anything.

    Used as the stand-in for ``with`` blocks when no real context (such as
    autocast) should be active. Binds ``None`` to the ``as`` target.
    """
    # Hand control straight to the body of the ``with`` statement.
    yield None

def ask_question(model_name, question, context, use_gpu_if_available=True, mixed_precision=False, load_in_4bit=False):
    """Answer ``question`` from ``context`` with a causal language model.

    The model and tokenizer named by ``model_name`` are loaded with
    ``from_pretrained`` on every call, prompted with a
    "Context / Question / Answer:" template, and sampled for at most 10 new
    tokens.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        question: Question text inserted into the prompt.
        context: Supporting text inserted into the prompt.
        use_gpu_if_available: Move an unquantized model to CUDA when
            ``torch.cuda.is_available()``; otherwise run it on the CPU.
            Not applied to 4-bit loads, which stay where the loader put them.
        mixed_precision: Run generation under ``torch.autocast("cuda")`` when
            the model is on a CUDA device; has no effect on the CPU.
        load_in_4bit: Load the model with a bitsandbytes 4-bit quantization
            config (float16 compute dtype).

    Returns:
        The generated continuation with leading whitespace removed, cut off
        just before its first "." (the whole continuation if it has no ".").
    """
    quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) if load_in_4bit else None

    # Load pre-trained model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quant_config)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if quant_config is None:
        # Move model to GPU if available
        device = torch.device("cuda" if use_gpu_if_available and torch.cuda.is_available() else "cpu")
        model.to(device)
    else:
        # A bitsandbytes-quantized model is placed on its device by
        # from_pretrained, and some transformers/bitsandbytes versions raise
        # on .to(); leave it alone and send the inputs to wherever it lives.
        device = model.device

    # Combine context and question into a single prompt
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Autocast only makes sense on CUDA; otherwise use a do-nothing context
    autocast_ctx = torch.autocast("cuda") if mixed_precision and device.type == "cuda" else contextlib.nullcontext()

    # Generate the answer (under mixed precision when requested)
    with autocast_ctx:
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=10,  # Limit the number of new tokens generated
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            attention_mask=inputs.attention_mask,
            temperature=0.3,  # Lower temperature for more deterministic output
            top_k=5,  # Limit the number of possible next tokens
            do_sample=True  # Enable sampling
        )

    # The returned sequence starts with the prompt tokens. Decode only what
    # comes after them instead of searching the full text for "Answer:", which
    # would pick up the wrong span whenever the context or question itself
    # contains "Answer:".
    prompt_length = inputs.input_ids.shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Keep only the first sentence of the completion
    return completion.strip().split('.')[0]

# Example usage: pick one model by leaving exactly one model_name line
# uncommented. The trailing notes record how each model behaved on the
# author's machine.
# model_name = "EleutherAI/gpt-neo-2.7B"  # best GPT-related model for my laptop
# model_name = "EleutherAI/gpt-j-6B"  # won't work with any options even memory mapping (removed from code above)
model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'  # works, best deepseek model I can get working
# model_name = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'  # kernel crash
# model_name = 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B'  # kernel crash
use_gpu_if_available = True  # fall back to CPU when CUDA is not available
mixed_precision = False  # True runs generation under torch.autocast on CUDA
load_in_4bit = False  # a bit faster for gpt-neo but sometimes gives wrong answers
context = "Andrew's favorite color is violet."
question = "What is Andrew's favorite color?"
# Harder variant: the answer is not stated in the context.
# context = "Andrew's favorite color is violet. Laura\'s favorite color is very different from Andrew\'s."
# question = "What is Laura's favorite color?"
answer = ask_question(model_name, question, context, use_gpu_if_available, mixed_precision, load_in_4bit)
# Sampling is enabled inside the function, so the text can vary between runs.
# The recorded run printed "Andrew's favorite color is violet" (no trailing
# period, because only the text before the first "." is returned).
print(answer)

2025-03-06 04:37:28.697809: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-06 04:37:28.930844: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741253849.025175   64017 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741253849.051405   64017 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-06 04:37:29.276478: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Andrew's favorite color is violet
