<a href="https://colab.research.google.com/github/arijguest/ML-tagging/blob/main/ML_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**ML-Tagging**: A Python project that automatically tags text content scraped from one or more sitemaps using OpenAI's GPT-4o-mini model. Users should edit the prompt sent to the LLM so that it best reflects their own tagging needs.

In [None]:

# Step 1: Setup and Dependencies
# !pip install requests beautifulsoup4 pandas nltk openai==0.28 tqdm aiohttp

In [None]:
# Import prerequisites
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import openai
import asyncio
import aiohttp
from tqdm.asyncio import tqdm_asyncio

In [None]:
# Fetch the NLTK stop-word corpus
nltk.download('stopwords')
# Hold the English stop words in a set for fast membership tests
stop_words = {*stopwords.words('english')}

# Helper function to remove stop words
def remove_stopwords(text):
    """Return ``text`` with every word found in ``stop_words`` dropped.

    The text is split on whitespace, each word is lower-cased for the
    comparison against the module-level ``stop_words`` set, and the
    surviving words (in their original casing) are re-joined with single
    spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
# Step 2: Define User Inputs

# Set your OpenAI API key (replace the placeholder; avoid committing a real key)
openai.api_key = 'USER_API_KEY'

# Prompt the user for the sitemap URL(s) and the output directory.
# The sitemap answer is split on commas; surrounding whitespace is trimmed and
# blank entries (from a trailing comma or an empty answer) are dropped, so
# "a.xml, b.xml," yields exactly two clean URLs.
sitemaps = [
    url.strip()
    for url in input('Enter the sitemap URLs, comma-separated: ').split(',')
    if url.strip()
]
output_directory = input('Enter the output directory: ').strip()

In [None]:
# Step 3: Crawl Sitemaps
def crawl_sitemap(sitemap_url, timeout=30):
    """Fetch a sitemap and return the URLs listed in its ``<loc>`` elements.

    Args:
        sitemap_url: URL of the sitemap XML document. Surrounding whitespace
            is ignored; a blank value is skipped without a network request.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the crawl indefinitely.

    Returns:
        A list of URL strings with image links (.jpg, .jpeg, .png) removed.
        An empty list is returned when the URL is blank or the request fails.
    """
    sitemap_url = (sitemap_url or '').strip()
    if not sitemap_url:
        return []

    # Only the network call can raise RequestException, so keep the try narrow.
    try:
        response = requests.get(sitemap_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching sitemap {sitemap_url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'xml')
    image_suffixes = ('.jpg', '.jpeg', '.png')
    links = []
    for loc in soup.find_all('loc'):
        # <loc> text may carry whitespace/newlines from pretty-printed XML.
        link = loc.text.strip()
        if link and not link.lower().endswith(image_suffixes):
            links.append(link)
    return links

# Gather the page URLs from every sitemap into one flat list
all_links = []
for sitemap_url in tqdm(sitemaps, desc='Crawling sitemaps'):
    all_links += crawl_sitemap(sitemap_url)

In [None]:
# Step 4: Scrape and Preprocess Webpage Content
def scrape_body_text(url, timeout=30):
    """Download a page and return its main text with stop words removed.

    The main content is looked up in priority order: a ``<main>`` element,
    an element with ``id="main-content"``, an element with class ``content``,
    and finally ``<body>``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The filtered text, ``"No main content found."`` when none of the
        candidate elements exist, or ``"Error fetching content."`` when the
        request fails.
    """
    # Only the network call can raise RequestException, so keep the try narrow.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return "Error fetching content."

    soup = BeautifulSoup(response.content, 'html.parser')

    # Each candidate needs its own find() call: a list passed as the tag-name
    # argument is matched against tag names only, so attribute dicts placed in
    # that list never select elements by id or class.
    main_content = (
        soup.find('main')
        or soup.find(id='main-content')
        or soup.find(class_='content')
        or soup.find('body')
    )
    if not main_content:
        return "No main content found."

    text = main_content.get_text(separator=' ', strip=True)
    return remove_stopwords(text)

In [None]:
async def analyze_content(session, url):
    """Fetch a page and ask OpenAI's GPT-4o-mini model for tags describing it.

    Args:
        session: An ``aiohttp.ClientSession`` used to download the page.
        url: Address of the page to analyze.

    Returns:
        The model's reply (requested in ``#tag-name`` format) with surrounding
        whitespace stripped, or ``"Error analyzing content."`` when the
        download, the parsing or the API call raises.
    """
    try:
        async with session.get(url) as response:
            # Treat 4xx/5xx replies as failures instead of tagging an error page.
            response.raise_for_status()
            body_text = await response.text()

        soup = BeautifulSoup(body_text, 'html.parser')

        # Each candidate needs its own find() call: a list passed as the
        # tag-name argument is matched against tag names only, so attribute
        # dicts placed in that list never select elements by id or class.
        main_content = (
            soup.find('main')
            or soup.find(id='main-content')
            or soup.find(class_='content')
            or soup.find('body')
        )
        if main_content:
            text = main_content.get_text(separator=' ', strip=True)
            filtered_text = remove_stopwords(text)
        else:
            filtered_text = "No main content found."

        # The scraped text is deliberately not inspected for an "Error" prefix:
        # genuine page text can begin with that word, and failures in this
        # function surface as exceptions handled below, not as sentinel text.

        # <title> may be missing, empty, or hold nested markup (.string is None).
        title = soup.title.string if soup.title and soup.title.string else 'No Title'
        completion = await openai.ChatCompletion.acreate(  # Use acreate for async
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": f"Generate ONLY hyper-specific tags for this webpage, focusing only on "
                           f"material/waste types and processing methods and specific site content. Output ONLY in #tag-name "
                           f"format (e.g., #anaerobic-digestion). Prioritize unique keywords relevant specifically "
                           f"to the title: \"{title}\" and content: \"{filtered_text}\". For example, tags like "
                           f"#WEE-waste should not show up in the Liquid Waste page tags. Avoid general tags like "
                           f"#sustainability, and beware each page will contain mention of all waste types - do not "
                           f"include these unless specific to that page’s title. Do not provide any additional "
                           f"commentary or explanations, just output the hyper-relevant tags to that page."
            }],
            max_tokens=5000
        )
        tags = completion['choices'][0]['message']['content'].strip()
        return tags
    except Exception as e:
        # Per-URL boundary: report the failure and let the caller carry on.
        print(f"Error analyzing content from {url}: {e}")
        return "Error analyzing content."

In [None]:
# Step 6: Process and Store Data
async def process_and_store_data(all_links, output_directory):
    """Tag every URL in turn, then hand the collected rows to ``store_data``.

    Args:
        all_links: Iterable of page URLs to process.
        output_directory: Passed through unchanged to ``store_data``.

    Each row is a dict with the keys ``'URL'``, ``'Title'`` and ``'Tags'``.
    A page whose title cannot be fetched is recorded with the title
    ``'No Title'`` instead of aborting the run and discarding the rows
    gathered so far.
    """
    results = []
    async with aiohttp.ClientSession() as session:
        async for link in tqdm_asyncio(all_links, desc='Processing links'):
            tags = await analyze_content(session, link)  # Call analyze_content here

            # analyze_content returns only the tags, so the page is fetched
            # again (on the same session) to read its <title>.
            title = 'No Title'
            try:
                async with session.get(link) as response:
                    html_content = await response.text()
                soup = BeautifulSoup(html_content, 'html.parser')
                # .string is None for an empty <title> or one with nested markup.
                if soup.title and soup.title.string:
                    title = soup.title.string
            except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError) as e:
                print(f"Error fetching title for {link}: {e}")
            results.append({'URL': link, 'Title': title, 'Tags': tags})

    # NOTE(review): store_data is not defined in the visible part of this
    # notebook — confirm it is defined in another cell before running.
    store_data(results, output_directory)  # Call store_data with the final results

# Main Logic
async def main():
    """Run the tagging pipeline on the module-level ``all_links`` and ``output_directory``."""
    pipeline = process_and_store_data(all_links, output_directory)
    await pipeline

# Execute the main function directly.
# Top-level ``await`` is accepted inside IPython/Jupyter cells; in a plain
# Python script it is a syntax error, and asyncio.run(main()) would be used
# instead.
await main()  # Start the asynchronous processing