# Scraper

In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_website(url, max_chars=10000):
    """
    Fetch a web page and extract its title, meta description, and a text snippet.

    Parameters:
        url (str): The URL of the website to scrape.
        max_chars (int or None): Maximum number of characters of page text to
            keep in 'snippet'. Defaults to 10,000. Pass None to keep all of
            the extracted text. Expected to be non-negative.

    Returns:
        dict: On success, a dictionary with:
              - 'title': The <title> text, or "No title found".
              - 'description': The content of the <meta name='description'>
                tag, or "No description found".
              - 'snippet': The first `max_chars` characters of the text
                returned by BeautifulSoup's get_text().
              None if any error occurs; the reason is printed.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raises an HTTPError if the status is 4xx/5xx

        # Parse HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Get <title>, falling back to a placeholder when the page has none.
        title_tag = soup.find('title')
        if title_tag:
            title = title_tag.get_text().strip()
        else:
            title = "No title found"

        # Get <meta name="description" ... >; the tag may exist without a
        # 'content' attribute, so both conditions are checked.
        description_tag = soup.find('meta', attrs={'name': 'description'})
        if description_tag and 'content' in description_tag.attrs:
            description = description_tag['content'].strip()
        else:
            description = "No description found"

        # Collapse the page text into one space-separated string.
        full_text = soup.get_text(separator=' ', strip=True)

        # Keep only the leading part; slicing with None keeps everything.
        snippet = full_text[:max_chars]

        return {
            "title": title,
            "description": description,
            "snippet": snippet
        }

    except Exception as e:
        # Best-effort boundary: report the failure and signal it with None
        # instead of propagating, so interactive callers can keep going.
        print(f"Failed to scrape {url}. Reason: {e}")
        return None

if __name__ == "__main__":
    # Interactive demo: prompt for a URL, scrape it, and show what was found.
    url = input("Enter URL:")
    result = scrape_website(url)
    if not result:
        # scrape_website() returns None on any failure.
        print("Scraping failed.")
    else:
        print("Title:", result["title"])
        print("Description:", result["description"])
        print("Snippet:", result["snippet"])
        


# Model

In [None]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer


In [13]:

def scrape_website(url, max_chars=10000):
    """
    Scrapes the given URL and returns a dictionary with:
      - 'title': The <title> text (or "No title found" if there is none).
      - 'description': The content of the <meta name='description'> tag,
        or "No description found" if it is unavailable.
      - 'snippet': The first `max_chars` characters of the page text
        returned by BeautifulSoup's get_text().

    Parameters:
        url (str): The URL of the website to scrape.
        max_chars (int or None): Maximum snippet length in characters.
            Defaults to 10,000. Pass None to keep all of the extracted text.
            Expected to be non-negative.

    Returns:
        dict: The dictionary described above on success;
              or None if an error occurs (the reason is printed).
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an HTTPError for bad responses

        soup = BeautifulSoup(response.text, "html.parser")

        # Get <title>
        title_tag = soup.find('title')
        title = title_tag.get_text().strip() if title_tag else "No title found"

        # Get <meta name="description">; the tag may lack a 'content' attribute.
        description_tag = soup.find('meta', attrs={'name': 'description'})
        if description_tag and 'content' in description_tag.attrs:
            description = description_tag['content'].strip()
        else:
            description = "No description found"

        # Collapse the page text into one space-separated string, then keep
        # the leading part (slicing with None keeps everything).
        full_text = soup.get_text(separator=' ', strip=True)
        snippet = full_text[:max_chars]

        return {
            "title": title,
            "description": description,
            "snippet": snippet
        }
    except Exception as e:
        # Best-effort boundary: report the failure and signal it with None.
        print(f"Failed to scrape {url}. Reason: {e}")
        return None

# Load the sequence-classification model and its tokenizer from a local directory.
# NOTE: model_path is a user-specific absolute path; replace it with your own
# model directory before running. These statements run at module level, so the
# model is loaded as soon as this cell/module is executed.
model_path = "/Users/bryce/Desktop/INLS697/INLS697_proj/DeBERTa/final_model_checkpoint"
# Model and tokenizer are both loaded from the same directory.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Create a text-classification pipeline; classify_scraped_text() calls it via `nlp`.
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

def classify_scraped_text(url):
    """
    Scrapes the URL, combines title, description, and snippet into one text,
    then runs the text-classification pipeline and prints each prediction.

    Parameters:
        url (str): The URL of the page to scrape and classify.

    Returns:
        list or None: The pipeline output (each result dict contains the keys
        "label" and "score"), or None when scraping failed.
    """
    data = scrape_website(url)
    print(data)  # Debug aid: shows the raw scrape result (None on failure).
    if data is None:
        print("Scraping failed for URL:", url)
        return None

    # Combine parts of the scraped text. Adjust which parts to include as needed.
    text_to_classify = f"{data['title']}\n{data['description']}\n{data['snippet']}"

    # The snippet alone can be thousands of characters, so ask the tokenizer
    # to truncate the input to the model's maximum length rather than relying
    # on the pipeline to cope with over-long sequences.
    results = nlp(text_to_classify, truncation=True)

    # Print the results. Each result dict contains keys "label" and "score".
    for res in results:
        print("Label:", res["label"])
        print("Confidence:", res["score"])

    return results

if __name__ == "__main__":
    # Interactive entry point: prompt for a URL, then scrape and classify it.
    # classify_scraped_text() prints its own results, so the return value is unused.
    url = input("Enter URL to classify: ")
    classify_scraped_text(url)


Device set to use cpu


Failed to scrape https://www.reuters.com/world/middle-east/israeli-military-conducts-strikes-hamas-targets-gaza-army-says-2025-03-18/. Reason: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/middle-east/israeli-military-conducts-strikes-hamas-targets-gaza-army-says-2025-03-18/
None
Scraping failed for URL: https://www.reuters.com/world/middle-east/israeli-military-conducts-strikes-hamas-targets-gaza-army-says-2025-03-18/
