In [3]:
# Authenticate with the Hugging Face Hub; the banner printed by the main cell says the Llama3 model requires this.
# NOTE(review): the saved output of this cell is only ^C, so the login appears to have been interrupted — confirm it completes before running the model cell.
!huggingface-cli login

^C


In [1]:
import requests
from bs4 import BeautifulSoup
import re
import guidance
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
# Choose the compute device once: "cuda" when torch reports a usable GPU, "cpu" otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

def setup_model():
    """Load the Llama 3 8B Instruct tokenizer and causal-LM weights.

    Reads the module-level ``device`` string to pick precision and placement:
    float16 with ``device_map="auto"`` when it is ``"cuda"``, float32 with no
    device map otherwise.

    Returns:
        tuple: ``(model, tokenizer)`` as produced by the two
        ``from_pretrained`` calls.
    """
    checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"

    print("Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    on_gpu = device == "cuda"
    load_options = {
        "torch_dtype": torch.float16 if on_gpu else torch.float32,
        "device_map": "auto" if on_gpu else None,
        "low_cpu_mem_usage": True,
    }
    model = AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)

    # Wrap the loaded model for guidance.
    # NOTE(review): this wrapper is neither returned nor stored anywhere, so
    # callers only ever receive the raw model and tokenizer — confirm whether
    # it was meant to be handed to the NER template.
    _guidance_llm = guidance.models.Transformers(model, tokenizer=tokenizer)
    return model, tokenizer

def fetch_wikipedia_content(url, max_paragraphs=10, timeout=30):
    """Download a Wikipedia article and return cleaned text from its leading paragraphs.

    Args:
        url: Address of the article page.
        max_paragraphs: How many leading ``<p>`` elements of the content
            container to consider. Defaults to 10, the previously hard-coded
            limit.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: The kept paragraphs joined by single spaces. Bracketed spans such
        as ``[12]`` are removed, runs of whitespace are collapsed, and
        paragraphs of 20 characters or fewer after cleaning are dropped.

    Raises:
        requests.exceptions.RequestException: On connection problems, a
            timeout, or a 4xx/5xx HTTP status.
        ValueError: If the page has no ``<div id="mw-content-text">`` element.
    """
    print("Fetching Wikipedia content...")
    # Without a timeout a stalled connection would block the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of scraping the error page as if
    # it were the article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract main content paragraphs
    content_div = soup.find('div', {'id': 'mw-content-text'})
    if content_div is None:
        # find() returns None when nothing matches; report that clearly rather
        # than letting the next attribute access fail on None.
        raise ValueError(f"No 'mw-content-text' container found at {url}")

    # Clean and collect paragraphs
    kept_paragraphs = []
    for p in content_div.find_all('p')[:max_paragraphs]:
        clean_text = re.sub(r'\[.*?\]', '', p.get_text())  # Remove citations
        clean_text = re.sub(r'\s+', ' ', clean_text).strip()  # Clean whitespace
        if len(clean_text) > 20:  # Only include substantial paragraphs
            kept_paragraphs.append(clean_text)

    return " ".join(kept_paragraphs)

def create_ner_template():
    """Build the guidance prompt program used for NER tagging.

    The prompt has a system block describing the tag set (PER, ORG, LOC, NUM,
    DATE, EVENT) and the ``*word: TAG*`` output format, a user block that
    interpolates the ``text`` variable, and an assistant block that generates
    into ``tagged_output`` (max_tokens=500, temperature=0.1).

    Returns:
        Whatever ``guidance(...)`` returns for this prompt source.
    """
    # NOTE(review): no model object is passed in here, while setup_model()
    # builds one via guidance.models — confirm the installed guidance version
    # accepts this handlebars-style template call and knows which model to use.
    prompt_source = '''
{{#system~}}
You are an expert at Named Entity Recognition (NER). Your task is to identify and tag named entities in text.

Tag the following entity types:
- PER: Person names
- ORG: Organizations, companies, institutions
- LOC: Locations, places, countries, cities
- NUM: Numbers (including years, quantities)
- DATE: Dates and time expressions
- EVENT: Events, competitions, games

Format your output exactly like this example:
Input: "Padma sold 23 umbrellas in Guatemala"
Output: *Padma: PER* *sold:* *23: NUM* *umbrellas:* *in:* *Guatemala: LOC*

Rules:
1. Put entity tags after colons (word: TAG)
2. Surround tagged words with asterisks (*word: TAG*)
3. Leave untagged words as *word:* (with colon but no tag)
4. Tag ALL words in the sentence
{{~/system}}

{{#user~}}
Please perform Named Entity Recognition on this text:
"{{text}}"

Provide the tagged output in the specified format:
{{~/user}}

{{#assistant~}}
{{gen 'tagged_output' max_tokens=500 temperature=0.1}}
{{~/assistant}}
'''
    return guidance(prompt_source)

def process_text_chunks(text, chunk_size=200):
    """Group the sentences of ``text`` into chunks of roughly ``chunk_size`` characters.

    Sentences are split only where terminal punctuation (``.``, ``!`` or ``?``)
    is followed by whitespace, and the punctuation stays attached to its
    sentence. This keeps ``?``/``!`` intact and avoids cutting decimals such
    as ``3.5`` in half, which matters because the chunks are fed to NER
    tagging afterwards.

    Args:
        text: The text to split. Empty or ``None`` yields an empty list.
        chunk_size: Maximum length of a chunk in characters. A single sentence
            longer than this is kept whole as its own chunk.

    Returns:
        list[str]: Chunks in original order, sentences joined by one space.
    """
    if not text:
        return []

    # The lookbehind keeps the terminator on the sentence; requiring trailing
    # whitespace means "3.5" or "example.com" is not treated as a boundary.
    pieces = re.split(r'(?<=[.!?])\s+', text)
    sentences = [piece.strip() for piece in pieces if piece.strip()]

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) <= chunk_size:
            current_chunk = candidate
        else:
            # Flush what has been gathered so far; an oversized first sentence
            # leaves nothing to flush and simply becomes the current chunk.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def run_ner_analysis(text_chunks, ner_template, max_chunks=5):
    """Tag the leading chunks with the NER template, one call per chunk.

    Args:
        text_chunks: Sequence of text chunks; only the first ``max_chunks``
            are processed.
        ner_template: Callable invoked as ``ner_template(text=chunk)``; its
            result is indexed with ``'tagged_output'``.
        max_chunks: Upper bound on the number of chunks to process.

    Returns:
        list[dict]: One ``{'original', 'tagged'}`` entry per processed chunk.
        When a chunk fails, ``'tagged'`` holds ``"Error: <message>"`` and
        processing moves on to the next chunk.
    """
    total = min(len(text_chunks), max_chunks)
    print(f"Processing {total} text chunks...")

    results = []
    for position, chunk in enumerate(text_chunks[:max_chunks], start=1):
        print(f"Processing chunk {position}/{total}")
        print(f"Chunk: {chunk[:100]}...")

        try:
            # Run the guidance template and pull out the generated tagging
            tagged_output = ner_template(text=chunk)['tagged_output'].strip()
            results.append({'original': chunk, 'tagged': tagged_output})

            print(f"Tagged: {tagged_output[:100]}...")
            print("-" * 80)
        except Exception as e:
            # Best effort: record the failure for this chunk and keep going.
            print(f"Error processing chunk {position}: {e}")
            results.append({'original': chunk, 'tagged': f"Error: {str(e)}"})

    return results

def main():
    """Run the whole pipeline: load the model, scrape, chunk, tag, print and save.

    The tagged chunks are printed and also written to ``ner_results.txt`` in
    the current working directory. Any exception raised along the way is
    caught, reported together with its traceback, and not re-raised.
    """
    import traceback

    try:
        # Setup
        print("Setting up Llama3 model...")
        _model, _tokenizer = setup_model()

        # Fetch Wikipedia content
        wiki_url = "https://en.wikipedia.org/wiki/Summer_Olympic_Games"
        text = fetch_wikipedia_content(wiki_url)
        print(f"Fetched {len(text)} characters from Wikipedia")

        # Create NER template
        ner_template = create_ner_template()

        # Process text
        text_chunks = process_text_chunks(text)
        print(f"Split text into {len(text_chunks)} chunks")

        # Run NER analysis
        results = run_ner_analysis(text_chunks, ner_template)

        # Display results
        wide_rule = "=" * 80
        print("\n" + wide_rule)
        print("NAMED ENTITY RECOGNITION RESULTS")
        print(wide_rule)

        for number, entry in enumerate(results, start=1):
            print(f"\nCHUNK {number}:")
            print(f"Original: {entry['original']}")
            print(f"Tagged:   {entry['tagged']}")
            print("-" * 80)

        # Save results to file
        with open('ner_results.txt', 'w', encoding='utf-8') as report:
            report.write("Named Entity Recognition Results\n")
            report.write("=" * 50 + "\n\n")
            for number, entry in enumerate(results, start=1):
                report.write(f"CHUNK {number}:\n")
                report.write(f"Original: {entry['original']}\n")
                report.write(f"Tagged: {entry['tagged']}\n")
                report.write("-" * 50 + "\n")

        print("\nResults saved to 'ner_results.txt'")

    except Exception as e:
        print(f"Error in main execution: {e}")
        traceback.print_exc()

if __name__ == "__main__":
    # Remind the user of the prerequisites (packages and Hugging Face login)
    # before starting the pipeline.
    banner_lines = (
        "Make sure you have installed the required packages:",
        "pip install guidance transformers torch beautifulsoup4 requests accelerate",
        "\nNote: You'll need access to Llama3 model (requires Hugging Face authentication)",
        "Run: huggingface-cli login",
        "\n" + "="*80,
    )
    for banner_line in banner_lines:
        print(banner_line)

    main()

Using device: cpu
Make sure you have installed the required packages:
pip install guidance transformers torch beautifulsoup4 requests accelerate

Note: You'll need access to Llama3 model (requires Hugging Face authentication)
Run: huggingface-cli login

Setting up Llama3 model...
Loading tokenizer and model...
Error in main execution: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct.
401 Client Error. (Request ID: Root=1-6844ae4c-3c884eaa71e906ff5edfe732;3a847a96-d05b-4220-9058-b7c6284cefe6)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in.


Traceback (most recent call last):
  File "C:\Users\zyad3\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\huggingface_hub\utils\_http.py", line 409, in hf_raise_for_status
    response.raise_for_status()
  File "C:\Users\zyad3\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\requests\models.py", line 1024, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/resolve/main/config.json

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\zyad3\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\utils\hub.py", line 470, in cached_files
    hf_h