# VeraGPT

In [None]:
# --- Set your GitHub repo URL ---
# REPO_URL is the remote that gets cloned; REPO_DIR is the local checkout path.
REPO_URL = "https://github.com/ankush357159/fusion-gpt.git"
REPO_DIR = "/content/fusion-gpt"

# Clone (or re-clone) the repo
import os

# Ensure we are in a stable directory before attempting to remove and clone
# (other cells `%cd` into the checkout, which is about to be deleted).
%cd /content

# Remove any existing checkout first so the clone always lands in an empty path.
# IPython expands "$REPO_DIR" / "$REPO_URL" from the Python variables above.
if os.path.isdir(REPO_DIR):
    !rm -rf "$REPO_DIR"
!git clone "$REPO_URL" "$REPO_DIR"

In [None]:
# Install veraGPT dependencies
# Move into the veraGPT folder so the relative requirements.txt path resolves.
%cd /content/fusion-gpt/veraGPT
# -q asks pip for quieter output.
!pip -q install -r requirements.txt

In [None]:
# (Optional) If your model is gated/private, set your HF token
import os

# Paste a token between the quotes, or leave blank for public models.
# Do not share or commit the notebook with a real token in this cell.
hf_token = ""

# Export only a non-empty token: writing "" would clobber a token that is
# already configured in the environment.
if hf_token.strip():
    # HUGGINGFACE_HUB_TOKEN is the name this notebook has always exported;
    # HF_TOKEN is the variable huggingface_hub itself reads.
    for _var in ("HUGGINGFACE_HUB_TOKEN", "HF_TOKEN"):
        os.environ[_var] = hf_token.strip()

In [None]:
# Run a single prompt (non-interactive)
# Launches src/main.py as a separate Python process with one --prompt argument.
# NOTE(review): presumably the model is loaded afresh on every run of this cell - confirm in src/main.py.
%cd /content/fusion-gpt/veraGPT
!python src/main.py --prompt "Write a short welcome message for veraGPT."


## OPTION 2: Persistent Model Server (RECOMMENDED for Colab)

**Load the model once, then ask multiple questions without reloading it.**  
Subsequent prompts are **10-100x faster** because the model stays in memory.

In [None]:
# Load the model ONCE (takes ~60s on T4 GPU)
%cd /content/fusion-gpt/veraGPT
import sys
sys.path.insert(0, '/content/fusion-gpt/veraGPT/src')

from server import ModelServer

# Initialize and load model
server = ModelServer()
server.load()  # This takes time - but only run once!

print("\nâœ… Model loaded! Now you can ask questions quickly.")

In [None]:
# Ask a question (FAST - no model reloading!)
# Uses the `server` object created in the model-loading cell.
question = "Please explain Newton's second law of motion"
response = server.ask(question, show_timing=True)
print(response)

In [None]:
# Ask another question (still FAST!)
# The already-loaded `server` is reused, so only generation time is spent here.
question = "What is quantum entanglement?"
response = server.ask(question, show_timing=True)
print(response)

## Notes
- For quantized loading, add `--quant 4` or `--quant 8` to the `python src/main.py` command (CUDA only).
- To load a LoRA adapter, add `--lora-path /path/to/adapter` to the same command.
- Interactive mode is not ideal in Colab; prefer the single-prompt cell or the persistent model server above.