# HHPF Setup Verification

This notebook verifies that your environment is properly configured.

## 1. Import Core Dependencies

In [1]:
import sys
import numpy as np
import pandas as pd
import torch
import transformers
import xgboost
import spacy
import matplotlib.pyplot as plt
import seaborn as sns

print("✅ All core dependencies imported successfully")

✅ All core dependencies imported successfully


## 2. Check Python Version

In [2]:
print(f"Python version: {sys.version}")
assert sys.version_info >= (3, 9), "Python 3.9+ required"
print("✅ Python version is compatible")

Python version: 3.13.1 (v3.13.1:06714517797, Dec  3 2024, 14:00:22) [Clang 15.0.0 (clang-1500.3.9.4)]
✅ Python version is compatible


## 3. Check PyTorch and MPS

In [3]:
print(f"PyTorch version: {torch.__version__}")
print(f"MPS available: {torch.backends.mps.is_available()}")

if torch.backends.mps.is_available():
    print("✅ GPU acceleration available (MPS)")
    device = torch.device("mps")
else:
    print("⚠️ Using CPU - this is fine for most operations")
    device = torch.device("cpu")

print(f"Device: {device}")

PyTorch version: 2.10.0
MPS available: True
✅ GPU acceleration available (MPS)
Device: mps


## 4. Load spaCy Model

In [4]:
try:
    nlp = spacy.load("en_core_web_sm")
    print("✅ spaCy model loaded successfully")
    
    # Test it
    doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print(f"Test entities extracted: {entities}")
except:
    print("❌ spaCy model not found. Run: python -m spacy download en_core_web_sm")

✅ spaCy model loaded successfully
Test entities extracted: [('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]


## 5. Check API Configuration

In [5]:
import os
from dotenv import load_dotenv

load_dotenv()

together_key = os.getenv("TOGETHER_API_KEY")
groq_key = os.getenv("GROQ_API_KEY")

if together_key and together_key != "your_together_api_key_here":
    print("✅ Together AI API key configured")
else:
    print("⚠️ Together AI API key not configured")

if groq_key and groq_key != "your_groq_api_key_here":
    print("✅ Groq API key configured")
else:
    print("⚠️ Groq API key not configured")

if not ((together_key and together_key != "your_together_api_key_here") or 
        (groq_key and groq_key != "your_groq_api_key_here")):
    print("\n❌ No API keys configured!")
    print("Edit .env file and add your API key")

✅ Together AI API key configured
✅ Groq API key configured


## 6. Load HHPF Configuration

In [6]:
import sys
sys.path.append('..')

from src.utils import load_config

# Load configurations
datasets_config = load_config('datasets')
model_config = load_config('model')
features_config = load_config('features')

print("✅ All configuration files loaded")
print(f"\nDatasets configured: {list(datasets_config['datasets'].keys())}")
print(f"Default Llama model: {model_config['llama']['default_model']}")

✅ All configuration files loaded

Datasets configured: ['medicine', 'math', 'finance', 'is_agents', 'psychology']
Default Llama model: meta-llama/Llama-3-8b-chat-hf


## 7. Test Transformers Model Loading

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

print("Loading DeBERTa-v3-large (this may take a minute on first run)...")

try:
    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
    # Note: We're not loading the full model here to save memory
    print("✅ DeBERTa tokenizer loaded successfully")
    print("Note: Full model will be loaded when needed for semantic entropy")
except Exception as e:
    print(f"⚠️ Could not load DeBERTa: {e}")
    print("This is needed for semantic entropy. Check your internet connection.")

Loading DeBERTa-v3-large (this may take a minute on first run)...


config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Could not extract SentencePiece model from /Users/aviv.gross/.cache/huggingface/hub/models--microsoft--deberta-v3-large/snapshots/64a8c8eab3e352a784c658aef62be1662607476f/spm.model using sentencepiece library due to 
SentencePieceExtractor requires the SentencePiece library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
. Falling back to TikToken extractor.


⚠️ Could not load DeBERTa: `tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.
This is needed for semantic entropy. Check your internet connection.


## 8. Summary

In [8]:
print("="*60)
print("  Setup Verification Complete")
print("="*60)
print("\n✅ Your environment is ready for HHPF development!")
print("\nNext steps:")
print("1. Place your datasets in data/raw/")
print("2. Open 01_data_exploration.ipynb")
print("3. Start with a single domain (Math recommended)")

  Setup Verification Complete

✅ Your environment is ready for HHPF development!

Next steps:
1. Place your datasets in data/raw/
2. Open 01_data_exploration.ipynb
3. Start with a single domain (Math recommended)
