In [23]:
import time
import os

import requests # used for non-javascript sites
from selenium import webdriver # used for javascript sites
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import bs4
from collections import Counter 

import json

### Extracting HTML

In [24]:
# Target page to scrape; the HTML fetched from this URL feeds the parsing cells below.
website = "https://example.com"

def get_html(website, timeout=30, js_wait=30):
    """Fetch the HTML of ``website``, rendering it in a browser when needed.

    The page is first downloaded with ``requests``. If the returned markup
    mentions "JavaScript" (treated here as a hint that the page needs
    scripting to show its content), the page is loaded again in headless
    Chrome through Selenium and the rendered page source is returned instead.

    Args:
        website: URL of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout the request could block forever.
        js_wait: Seconds to let the browser run the page's scripts before
            the rendered source is read.

    Returns:
        The page HTML as a string.

    Raises:
        requests.exceptions.RequestException: If the plain HTTP request
            fails or times out.
    """
    html = requests.get(website, timeout=timeout).text

    if html and "JavaScript" in html:  # most frameworks will say "JS is required"
        options = webdriver.ChromeOptions()
        options.add_argument('--headless=new')
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(website)

            time.sleep(js_wait)  # wait for any js elements to load

            html = driver.page_source
        finally:
            # Always shut the browser down, even if loading the page failed,
            # so no orphaned Chrome process is left behind.
            driver.quit()

    return html

# Fetch the page once and print the markup so the result can be inspected.
html = get_html(website)
print(html)

<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domai

### Parsing HTML


In [25]:
soup = bs4.BeautifulSoup(html, 'html.parser')

# Count how many elements carry each CSS class name.
class_counter = Counter(
    class_name
    for element in soup.find_all(class_=True)
    for class_name in element['class']
)

# Among the 20 most frequent classes, tally how often each occurrence
# count shows up.
quantity_counter = Counter(
    occurrences for _, occurrences in class_counter.most_common(20)
)

# Keep every class whose occurrence count equals the most frequent count.
# The list stays empty when the page has no class attributes at all.
data_classes = [
    class_name
    for modal_count, _ in quantity_counter.most_common(1)
    for class_name, occurrences in class_counter.items()
    if occurrences == modal_count
]

print("Data-related classes:", data_classes)

def is_top_level_class(element, class_names):
    """Tell whether ``element`` has no ancestor carrying one of ``class_names``.

    Args:
        element: Object exposing a ``parents`` iterable of its ancestors.
        class_names: Iterable of class names to look for on the ancestors.

    Returns:
        ``False`` as soon as an ancestor's ``class`` value contains any of
        ``class_names``; ``True`` otherwise.
    """
    for ancestor in element.parents:
        # Ancestors without a ``get`` method cannot expose attributes; skip them.
        if not hasattr(ancestor, 'get'):
            continue

        ancestor_classes = ancestor.get('class')
        if not ancestor_classes:
            continue

        if any(name in ancestor_classes for name in class_names):
            return False

    return True

# Keep only the outermost matches: an element is dropped when one of its
# ancestors already carries one of the selected classes.
filtered_elements = [
    el for el in soup.find_all(class_=data_classes)
    if is_top_level_class(el, data_classes)
]

# With no matches, fall back to the page body. Markup without a <body> tag
# leaves soup.body as None, so use the whole document in that case instead of
# passing a None entry downstream.
elements = filtered_elements or [soup.body if soup.body is not None else soup]


Data-related classes: []


### Converting HTML to JSON

In [None]:
import mistral

# Serialise the selected elements back to markup, one element per line.
html_elements = "\n".join(map(str, elements))

# Send the markup to the model and keep the text of the first choice.
chat_response = (
    mistral.chat(html_elements)
    .choices[0]
    .message
    .content
)

def remove_unwanted_text(response):
    """Extract the payload of a Markdown code fence from an LLM reply.

    Most LLMs return JSON wrapped in a fenced code block. Three cases are
    handled, in this order:

    1. A ```json fence anywhere in the reply: the text between that opener
       and the next fence (or the end of the reply) is returned.
    2. A reply that, ignoring surrounding whitespace, starts with a fence that
       is untagged or carries another alphanumeric tag (e.g. ```JSON): the
       tag is dropped and the fenced text is returned.
    3. Anything else is returned unchanged.

    Args:
        response: Raw text of the model reply.

    Returns:
        The fenced payload, not stripped of surrounding whitespace, or
        ``response`` itself when no fence is recognised.
    """
    fence = "```"
    json_opener = fence + "json"

    if json_opener in response:
        return response.split(json_opener, 1)[1].split(fence, 1)[0]

    stripped = response.strip()
    if stripped.startswith(fence):
        payload = stripped[len(fence):]
        # Drop a language tag that shares the line with the opening fence.
        first_line, newline, remainder = payload.partition("\n")
        if newline and first_line.strip().isalnum():
            payload = remainder
        return payload.split(fence, 1)[0]

    return response

In [27]:
# Remove the Markdown code fence (if any) from the model reply and parse the
# remainder; json.loads raises json.JSONDecodeError when it is not valid JSON.
json_data = json.loads(remove_unwanted_text(chat_response))

In [28]:
# Save the parsed data as indented JSON; the relative path resolves against
# the current working directory.
with open("data.json", "w") as f:
    json.dump(json_data, f, indent=2)