# VeraGPT

In [None]:
# --- Set your GitHub repo URL ---
REPO_URL = "https://github.com/ankush357159/fusion-gpt.git"
REPO_DIR = "/content/fusion-gpt"

# Clone (or re-clone) the repo
import os

# Ensure we are in a stable directory before attempting to remove and clone
%cd /content

if os.path.isdir(REPO_DIR):
    !rm -rf "$REPO_DIR"
!git clone "$REPO_URL" "$REPO_DIR"

In [None]:
# Install veraGPT dependencies
%cd /content/fusion-gpt/veraGPT
!pip -q install -r requirements.txt

In [None]:
# (Optional) If your model is gated/private, set your HF token
import os

# Paste the token between the quotes, or leave blank for public models.
# NOTE(review): avoid saving/sharing the notebook with a real token pasted here.
hf_token = ""

# Export only a token that was actually provided: writing "" into the
# environment would overwrite a token that is already configured there.
if hf_token.strip():
    # Variable name this notebook has always set; kept for anything that reads it
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token.strip()
    # Variable name documented by huggingface_hub for authentication
    os.environ["HF_TOKEN"] = hf_token.strip()

### Step 1: Choose Your Model Preset

Pick a model based on your hardware and speed/quality needs:

In [None]:
# Auto-detect hardware and show available presets
%cd /content/fusion-gpt/veraGPT
import sys
sys.path.insert(0, '/content/fusion-gpt/veraGPT/src')

import torch
from model_presets import PRESETS, recommend_preset

# Detect hardware
has_gpu = torch.cuda.is_available()
vram_gb = 16 if has_gpu else 0  # T4 has 16GB

print("HARDWARE DETECTED")
print("="*60)
if has_gpu:
    print("GPU: T4 GPU available (16 GB VRAM)")
    print("You can use any model below!")
else:
    print("CPU Only: No GPU detected")
    print("Only 'tiny' preset recommended (others will fail with OOM)")
    print("\nTo enable GPU: Runtime → Change runtime type → T4 GPU")

# Show available presets
print("\nAVAILABLE MODEL PRESETS")
print("="*60)
for key, preset in PRESETS.items():
    icon = "YES" if (has_gpu or key == "tiny") else "NO"
    print(f"\n{icon} '{key}': {preset.name}")
    print(f"   Model: {preset.model_id}")
    print(f"   Quality: {preset.quality} | GPU Speed: {preset.speed_gpu}")
    if key == "tiny":
        print(f"Works on: CPU + GPU (fastest option)")
    else:
        print(f"Requires: T4 GPU (min {preset.min_vram_gb}GB VRAM)")

# Auto-recommend
recommended = recommend_preset(has_gpu, vram_gb)
print("\n" + "="*60)
print(f"AUTO-RECOMMENDED: '{recommended}' ({PRESETS[recommended].name})")
print("="*60)

# Set default (user can override below)
SELECTED_PRESET = recommended
print(f"\nCurrent selection: '{SELECTED_PRESET}'")
print("To change: Edit SELECTED_PRESET in the cell below")

In [None]:
# CHANGE THIS to override auto-selection
# Options: 'tiny', 'phi2', 'phi3', 'mistral', 'llama13'

SELECTED_PRESET = "tiny"  # ← Change this!

import torch
from model_presets import PRESETS

# Validate: stop here on an unknown preset so that "Run all" does not continue
# into model loading with a value the presets table does not contain
if SELECTED_PRESET not in PRESETS:
    raise ValueError(
        f"Invalid preset: '{SELECTED_PRESET}'. "
        f"Valid options: {list(PRESETS.keys())}"
    )

preset = PRESETS[SELECTED_PRESET]
print(f"Selected: '{SELECTED_PRESET}' ({preset.name})")
print(f"   Model: {preset.model_id}")
print(f"   Quality: {preset.quality} | Speed: {preset.speed_gpu}")

# Hardware check: warn (but do not block) when a non-'tiny' preset is chosen
# without a CUDA device
if not torch.cuda.is_available() and SELECTED_PRESET != "tiny":
    print(f"\nWARNING: '{SELECTED_PRESET}' requires GPU but CPU detected!")
    print("This will likely fail with OOM or be extremely slow.")
    print("Recommended: SELECTED_PRESET = 'tiny'")

In [None]:
### Step 2: Load Model with Selected Preset

# Loading takes 30-60 seconds. The model is loaded once and kept in memory, so later responses are fast.

In [None]:
# Build the model server for the selected preset and load the model
import torch

from config import Config
from device_manager import DeviceManager
from model_loader import ModelLoader
from inference_engine import InferenceEngine
from server import ModelServer

print(f"Loading model with preset: '{SELECTED_PRESET}'...")

# Quantization (4-bit) is requested only when a CUDA device is available;
# on CPU it is turned off
enable_quant = torch.cuda.is_available()
config = Config.from_preset(SELECTED_PRESET, enable_quantization=enable_quant)

print(f"Model ID: {config.model.model_name_or_path}")
quant_status = "Enabled (4-bit)" if config.quantization.enabled else "Disabled"
print(f"Quantization: {quant_status}")

# Create the server, then load the model a single time
server = ModelServer(config)
print("\nLoading model (this may take 30-60 seconds)...")
server.load()
print("Model loaded and ready!\n")

### Step 3: Chat with the Model

Try asking questions! The model is already loaded, so each response will be fast.

In [None]:
# First query against the already-loaded model
prompt = "Explain what is Python programming language in 2 sentences."
print(f"Question: {prompt}", end="\n\n")

# show_timing=True is passed through to server.ask
response = server.ask(prompt, show_timing=True)

print()
print(f"Answer: {response}")

In [None]:
# Follow-up question (FAST - the model is not reloaded)
question = "Please explain Newton's second law of motion"
response = server.ask(question, show_timing=True)
print(response)

In [None]:
# One more question against the same loaded model (still FAST!)
question = "What is quantum entanglement?"
response = server.ask(question, show_timing=True)
print(response)

### Optional: Switch to a Different Model

Want to try another model? Change the preset and reload.

In [None]:
# Change preset and reload
import gc

import torch

SELECTED_PRESET = "phi2"  # Try: 'phi2', 'phi3', 'mistral', 'llama13'

# Release the previously loaded model before loading the next one. Rebinding
# `server` alone may leave the old weights allocated if reference cycles keep
# them alive, so drop the reference, collect explicitly, and hand cached GPU
# memory back to the driver.
server = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Reload with new preset
print(f"Switching to {SELECTED_PRESET}...")
config = Config.from_preset(SELECTED_PRESET, enable_quantization=torch.cuda.is_available())
server = ModelServer(config)
server.load()
print("Model switched!")

### Troubleshooting Guide

### Issue: Process Killed / OOM (Out of Memory)

**Cause**: Model requires more RAM than available

**Solution Options**:

**Option 1: Switch to Smaller Model (Recommended)**

```python
# In the cell under "Step 1" that sets SELECTED_PRESET, change to:
SELECTED_PRESET = "tiny"  # TinyLlama works on CPU with only 2-3GB RAM
```

**Option 2: Enable GPU**

```
1. Runtime → Change runtime type → T4 GPU
2. Re-run all cells
3. `tiny`, `phi2`, `phi3`, and `mistral` will work on a T4 GPU (`llama13` needs more memory than a T4 provides)
```

### Model Requirements:

| Preset    | Model           | RAM Needed | CPU Support | GPU Speed   |
| --------- | --------------- | ---------- | ----------- | ----------- |
| `tiny`    | TinyLlama 1.1B  | 2-3 GB     | Yes         | (15 tok/s)  |
| `phi2`    | Microsoft Phi-2 | 6-8 GB     | Tight       | (12 tok/s)  |
| `phi3`    | Microsoft Phi-3 | 8-10 GB    | No          | (10 tok/s)  |
| `mistral` | Mistral 7B      | 14-18 GB   | No          | (3 tok/s)   |
| `llama13` | Llama-2 13B     | 26-32 GB   | No          | (1-2 tok/s) |

### Hardware Available:

- **Colab CPU**: 12 GB RAM → Use `tiny` only
- **Colab T4 GPU**: 16 GB VRAM → Use `tiny`, `phi2`, `phi3`, `mistral`
- **Colab A100 GPU**: 40 GB VRAM → Any preset works

### Speed vs Quality Trade-off:

- **Fastest**: `tiny` (TinyLlama) - Good for simple tasks, casual chat
- **Balanced**: `phi2`, `phi3` - Better reasoning, still fast
- **Best Quality**: `mistral`, `llama13` - Complex tasks, slower

### How to Check Current Hardware:

```python
import torch
print(f"GPU: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
```
