In [1]:
import os
import json
import logging
import math
import string
import nltk
import tensorflow_hub as hub
import numpy as np
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

# Constants
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
DATA_DIR = "data"
TITLE_BOOST = 10
VARIANT_BOOST = 5
BM25_WEIGHT = 0.3
SIMILARITY_WEIGHT = 0.7

# Initialize logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Download necessary NLTK resources
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Load Stopwords and Lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Load Universal Sentence Encoder (USE)
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Utility Functions
def extract_product_id(url):
    """Extracts the product ID from a URL."""
    match = re.search(r"/product/(\d+)", url)
    return match.group(1) if match else None

def extract_variant_from_url(url):
    """Extracts the product variant from the URL."""
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
    return query_params.get("variant", [""])[0]  # Get first variant if exists

def get_product_details(url, title_cache={}):
    """Fetches the product title via scraping and extracts variant from URL."""
    product_id = extract_product_id(url)
    
    if product_id in title_cache:
        return title_cache[product_id], extract_variant_from_url(url)

    try:
        logging.info(f"🔍 Scraping: {url}")
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract title correctly from <h3 class="product-title">
        title_element = soup.select_one("h3.product-title")  # Correct selector
        if not title_element:
            title_element = soup.find("meta", {"property": "og:title"})  # Fallback
        if not title_element:
            title_element = soup.find("title")  # Another fallback

        # Clean title if extracted from <meta> or <title>
        title = title_element.text.strip() if title_element else "Unknown Title"

        # Cache title to avoid duplicate scrapes
        title_cache[product_id] = title  

        return title, extract_variant_from_url(url)
    
    except requests.RequestException as e:
        logging.error(f"❌ Error fetching {url}: {e}")
    
    return "Unknown Title", extract_variant_from_url(url)




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\youce\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\youce\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\youce\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2025-02-08 22:48:14,943 - INFO - Using C:\Users\youce\AppData\Local\Temp\tfhub_modules to cache modules.












2025-02-08 22:48:23,797 - INFO - Fingerprint not found. Saved model loading will continue.
2025-02-08 22:48:23,797 - INFO - path_and_singleprint metric could not be logged. Saved model loading will continue.


In [2]:
def tokenize(title, variant, description, title_boost=TITLE_BOOST, variant_boost=VARIANT_BOOST):
    """Tokenizes text, giving extra weight to title and variant."""
    text = f"{(title + ' ') * title_boost} {(variant + ' ') * variant_boost} {description}"
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text) if token not in stop_words]

def get_sentence_embedding(title, variant, description):
    """Returns a sentence embedding, emphasizing title and variant."""
    boosted_text = (title + " ") * TITLE_BOOST + (variant + " ") * VARIANT_BOOST + description
    embedding = embed([boosted_text]).numpy()[0]
    return embedding / np.linalg.norm(embedding)  # Normalize

In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_product_urls(base_url):
    product_urls = []
    page_number = 1

    while True:
        url = f"{base_url}/products?page={page_number}"
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to retrieve page {page_number}. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        product_links = soup.select('a[href^="/product/"]')

        if not product_links:
            print(f"No product links found on page {page_number}.")
            break

        for link in product_links:
            full_url = base_url + link['href']
            if full_url not in product_urls:
                product_urls.append(full_url)

        print(f"Page {page_number} processed. Found {len(product_links)} product links.")
        page_number += 1

    return product_urls

if __name__ == "__main__":
    base_url = "https://web-scraping.dev/products"
    product_urls = scrape_product_urls(base_url)
    if product_urls:
        print("\nScraped Product URLs:")
        for url in product_urls:
            print(url)
    else:
        print("No product URLs were found.")


No product links found on page 1.
No product URLs were found.


In [5]:
import json
import requests
from collections import Counter
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import logging

def load_json(filename):
    with open(filename, "r", encoding="utf-8") as file:
        return json.load(file)

def scrape_product_data(url):
    """Scrapes title, description, and variants from the product page."""
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        logging.warning(f"Failed to fetch {url}, status code: {response.status_code}")
        return None
    
    soup = BeautifulSoup(response.text, "html.parser")
    
    title = soup.find("h3", class_="card-title product-title")
    description = soup.find("p", class_="product-description")
    
    # Extract all available variants
    variants = [variant.text.strip() for variant in soup.select(".variants a")]
    
    return {
        "title": title.text.strip() if title else "",
        "description": description.text.strip() if description else "",
        "variants": variants
    }

def tokenize(text):
    """Splits text into lowercase tokens and counts occurrences."""
    words = text.lower().split()
    return Counter(words)

def process_data(json_file):
    raw_data = load_json(json_file)
    structured_data = []
    df_counter = Counter()
    
    for keyword, url_dict in raw_data.items():
        for url, metadata in url_dict.items():
            scraped_data = scrape_product_data(url)
            if not scraped_data:
                continue
            
            # Tokenize title, description, and variants
            title_tokens = tokenize(scraped_data["title"])
            description_tokens = tokenize(scraped_data["description"])
            variant_tokens = Counter()
            for variant in scraped_data["variants"]:
                variant_tokens.update(tokenize(variant))
            
            # Merge all token counts
            total_tokens = title_tokens + description_tokens + variant_tokens
            df_counter.update(total_tokens)
            
            structured_data.append({
                "id": hash(url),
                "title": scraped_data["title"],
                "variants": scraped_data["variants"],
                "description": scraped_data["description"],
                "url": url,
                "tokens": dict(total_tokens),
            })
    
    logging.info(f"Processed {len(structured_data)} products.")
    return structured_data, df_counter

if __name__ == "__main__":
    json_file = "description_index.json"
    structured_data, df_counter = process_data(json_file)
    
    # Save structured data
    with open("structured_data.json", "w", encoding="utf-8") as f:
        json.dump(structured_data, f, indent=4, ensure_ascii=False)
    
    # Save token duplication factors
    with open("token_frequencies.json", "w", encoding="utf-8") as f:
        json.dump(df_counter, f, indent=4, ensure_ascii=False)
    
    print("Scraping and processing completed.")
    
    # Display output for review
    print("\nSample Structured Data:")
    print(json.dumps(structured_data[:3], indent=4, ensure_ascii=False))  # Show first 3 products
    
    print("\nTop 10 Frequent Tokens:")
    for token, count in df_counter.most_common(10):
        print(f"{token}: {count}")


FileNotFoundError: [Errno 2] No such file or directory: 'description_index.json'

In [4]:
# DataLoader Class
class DataLoader:
    """Loads and processes data from JSON index files."""

    def __init__(self, data_dir=DATA_DIR):
        self.data_dir = data_dir
        self.df_counter = Counter()
        self.title_index = self.load_title_index()
        self.documents = self.load_description_index()
        self.avgdl = self.calculate_avgdl()
        self._precompute_embeddings()  # Precompute embeddings

    def _load_json(self, filename):
        """Loads a JSON file from the data directory."""
        filepath = os.path.join(self.data_dir, filename)
        if not os.path.exists(filepath):
            logging.warning(f"Warning: {filename} not found, returning empty dictionary.")
            return {}
        with open(filepath, "r", encoding="utf-8") as file:
            return json.load(file)

    def load_title_index(self):
        """Retrieves product titles from URLs or scrapes them if missing."""
        raw_data = self._load_json("description_index.json")
        title_mapping = {}

        for keyword, url_dict in raw_data.items():
            for url in url_dict.keys():
                product_id = extract_product_id(url)
                if product_id and product_id not in title_mapping:
                    title_mapping[product_id] = get_product_details(url)

        logging.info(f"Extracted {len(title_mapping)} product titles from URLs.")
        return title_mapping

    def load_description_index(self):
        """Loads descriptions and assigns real titles using the title index."""
        raw_data = self._load_json("description_index.json")
        structured_data = []
        title_cache = {}

        for keyword, url_dict in raw_data.items():
            for url, metadata in url_dict.items():
                product_id = extract_product_id(url)
                product_title, variant = get_product_details(url, title_cache)

                tokens = tokenize(product_title, variant, f"Contains keyword '{keyword}' (score: {metadata[0]}) {url}")

                self.df_counter.update(set(tokens))

                structured_data.append({
                    "id": hash(url),
                    "title": product_title,
                    "variant": variant,
                    "description": f"Contains keyword '{keyword}' (score: {metadata[0]})",
                    "url": url,
                    "tokens": tokens,
                })

        logging.info(f"Loaded {len(structured_data)} descriptions.")
        return structured_data

    def calculate_avgdl(self):
        """Computes the average document length."""
        total_length = sum(len(doc["tokens"]) for doc in self.documents)
        avgdl = total_length / len(self.documents) if self.documents else 0
        logging.info(f"Calculated average document length: {avgdl}")
        return avgdl

    def _precompute_embeddings(self):
        """Precompute embeddings for all documents in parallel."""
        with ThreadPoolExecutor() as executor:
            futures = {executor.submit(get_sentence_embedding, doc["title"], doc["variant"], doc["description"]): doc for doc in self.documents}
            
            for future in as_completed(futures):
                doc = futures[future]
                try:
                    doc["embedding"] = future.result()
                except Exception as e:
                    logging.error(f"Error computing embedding for {doc['url']}: {e}")
                    
DL=DataLoader()
documents=DL.load_description_index()
#print(documents)


2025-02-08 22:58:21,189 - INFO - Extracted 28 product titles from URLs.
2025-02-08 22:58:21,208 - INFO - 🔍 Scraping: https://web-scraping.dev/product/1
2025-02-08 22:58:22,105 - INFO - 🔍 Scraping: https://web-scraping.dev/product/11
2025-02-08 22:58:22,921 - INFO - 🔍 Scraping: https://web-scraping.dev/product/13?variant=cherry-large
2025-02-08 22:58:23,724 - INFO - 🔍 Scraping: https://web-scraping.dev/product/23
2025-02-08 22:58:24,557 - INFO - 🔍 Scraping: https://web-scraping.dev/product/25
2025-02-08 22:58:25,369 - INFO - 🔍 Scraping: https://web-scraping.dev/product/12
2025-02-08 22:58:26,204 - INFO - 🔍 Scraping: https://web-scraping.dev/product/20
2025-02-08 22:58:27,001 - INFO - 🔍 Scraping: https://web-scraping.dev/product/24
2025-02-08 22:58:27,820 - INFO - 🔍 Scraping: https://web-scraping.dev/product/8?variant=beige-6
2025-02-08 22:58:28,786 - INFO - 🔍 Scraping: https://web-scraping.dev/product/10
2025-02-08 22:58:29,598 - INFO - 🔍 Scraping: https://web-scraping.dev/product/22
20