In [1]:
from bs4 import BeautifulSoup


def flatten_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    parent_tags = {'div', 'form', 'section', 'article', 'nav'}
    child_tags = {'p', 'a', 'h1', 'h2', 'h3', 'h4', 'h5'}
    redundant_tags = {'meta', 'head', 'footer', 'style', 'script', 'noscript', 'img', 'svg', 'link', 'iframe', 'i'}

    for tag in soup.find_all(redundant_tags):
        tag.decompose()

    for tag in soup.find_all(parent_tags | child_tags):
        while tag.parent.name in parent_tags:
            tag = tag.parent

        if tag.name in parent_tags:
            tag.unwrap()

    return soup.prettify()



In [2]:
import requests
news_html = requests.get("https://nabzebourse.com/").text

In [3]:
with open("news.html", "w") as f:
    f.write(news_html)

In [4]:
flattened = flatten_html(news_html)
with open("flattened.html", "w") as f:
    f.write(flattened)

In [5]:
import os
from dotenv import load_dotenv
load_dotenv()

In [6]:
from openai import OpenAI
client = OpenAI()

system_prompt = """
You are an assistant of a web scrapper. Your job is to extract scrapping instructions from a HTML web page to help the scrapper work faster.
You should attention to the important HTML tags that may include:
- Important texts related to the given keywords (you will be given the keywords).
- Links which are refreneces to important pages that may include important and related content.
- Buttons of pagination.

Your primary job is to find the pattern of important tags and tell the scrapper to what to do with those group of tags.
The scrapper only needs to gather data which is related to the given keywords.

The output should be only a JSON that includes:
- ref: the xpath to select all tags which have a common structure and are in the same context.
- tag_name: the name of the specified tag.
- action: the action that should be done on that group of tags (eg. click, extract, etc).

Return only JSON with no additional text.
------------
Exampe 1:
HTML Source:
<html>
</body>
<div>
<h1>Title</h1>
<p>This is a test.</p>
</div>
</body>
</html>
Instruction JSON:
[
  {
    "ref": "/div/h1[1]",
    "tag_name": "h1",
    "action": "extract"
  },
  {
    "ref": "/div/p[1]",
    "tag_name": "p",
    "action": "extract"
  }
]
------------
Exampe 2:
HTML Source:
<html>
</body>
<div>
<h2>OpenAI annouced its new model.</h2>
</a href="...">See blog post</a>
<h2>Microsoft Copilot released.</h2>
</a href="...">See blog post</a>
</div>
</body>
</html>
Instruction JSON:
[
  {
    "ref": "/div/h2",
    "tag_name": "h2",
    "action": "extract"
  },
  {
    "ref": "/div/a",
    "tag_name": "a",
    "action": "click"
  }
]
"""

user_prompt = """
Primary Website Language: {lang}
Keywords: {keywords}
HTML Source: {html}
"""

def gen_scrapper_instructions(lang, keywords, html):
  chat_completion = client.chat.completions.create(
      messages=[
          {
              "role": "system",
              "content": system_prompt,
          },
          {
              "role": "user",
              "content": user_prompt.format(lang=lang, keywords=keywords, html=html),
          }
      ],
      temperature=0,
      model="gpt-4-turbo",
  )
  return chat_completion.choices[0].message.content


In [8]:
instructions = gen_scrapper_instructions(
    lang="Farsi (Persian)",
    keywords="بورس، بازار، سهام، ارز، اخبار بورس، اخبار بازار",
    html=flattened,
)
print(instructions)

[
  {
    "ref": "//h4/a[contains(@href, '/fa/tags')]",
    "tag_name": "a",
    "action": "click"
  },
  {
    "ref": "//h2/a[contains(@title, 'بورس') or contains(@title, 'سهام') or contains(@title, 'ارز')]",
    "tag_name": "a",
    "action": "click"
  },
  {
    "ref": "//a[contains(@href, '/fa/news') and (contains(@title, 'بورس') or contains(@title, 'سهام') or contains(@title, 'ارز'))]",
    "tag_name": "a",
    "action": "click"
  }
]


In [14]:
def longest_common_prefix(strs):
    if not strs:
        return ""

    strs.sort()

    first_str = strs[0]
    last_str = strs[-1]

    prefix = ""
    for i in range(min(len(first_str), len(last_str))):
        if first_str[i] == last_str[i]:
            prefix += first_str[i]
        else:
            break
    
    return prefix

In [23]:
import json
from lxml import etree

def apply_instrctions(html, instructions):
    data = json.loads(instructions)
    dom = etree.HTML(flattened)
    common_prefixes = set()

    for instruction in data:
        print("ref:", instruction["ref"])
        elements = dom.xpath(instruction["ref"])
        for element in elements:
            print(element.text, element.attrib)
            # if instruction["tag_name"] == "a":
            #     links = [element.attrib["href"] for element in elements]
            #     print(links)
            #     common_prefix = longest_common_prefix(links)
            #     print(common_prefix)
            #     common_prefixes.add(common_prefix)
  
    # print(common_prefixes)

In [22]:
# /fa/tags/4265/1/%D8%B5%D8%AF%D8%A7%DB%8C-%D8%B3%D9%87%D8%A7%D9%85%D8%AF%D8%A7%D8%B1
page2_html = requests.get("https://nabzebourse.com/fa/tags/4265/1/%D8%B5%D8%AF%D8%A7%DB%8C-%D8%B3%D9%87%D8%A7%D9%85%D8%AF%D8%A7%D8%B1").text
page2_flattened = flatten_html(page2_html)
page2_instructions = gen_scrapper_instructions(
    lang="Farsi (Persian)",
    keywords="بورس، بازار، سهام، ارز، اخبار بورس، اخبار بازار",
    html=page2_flattened,
)
print(page2_instructions)

[
  {
    "ref": "//header//h4/a[contains(@href, 'tags')]",
    "tag_name": "a",
    "action": "click"
  },
  {
    "ref": "//header//h2/a[contains(@href, 'news')]",
    "tag_name": "a",
    "action": "click"
  },
  {
    "ref": "//ul[@class='header-main_top-menu_ul']//a[contains(@href, 'fa')]",
    "tag_name": "a",
    "action": "click"
  },
  {
    "ref": "//a[contains(@href, '/fa/tags') and not(contains(@href, 'redirect'))]",
    "tag_name": "a",
    "action": "click"
  },
  {
    "ref": "//a[contains(@href, '/fa/news') and not(contains(@href, 'redirect'))]",
    "tag_name": "a",
    "action": "click"
  },
  {
    "ref": "//a[contains(@href, 'صدای-سهامدار')]",
    "tag_name": "a",
    "action": "click"
  },
  {
    "ref": "//a[contains(@href, '/fa/tags/4265') and not(contains(@href, 'redirect'))]",
    "tag_name": "a",
    "action": "click"
  }
]


In [24]:
apply_instrctions(page2_flattened, page2_instructions)

ref: //header//h4/a[contains(@href, 'tags')]

     بورس آباد
     {'href': 'https://nabzebourse.com/fa/tags/10170/1/%D8%A8%D9%88%D8%B1%D8%B3-%D8%A2%D8%A8%D8%A7%D8%AF', 'target': '_blank'}

     سهام عدالت
     {'href': 'https://nabzebourse.com/fa/tags/37/1/%D8%B3%D9%87%D8%A7%D9%85-%D8%B9%D8%AF%D8%A7%D9%84%D8%AA', 'target': '_blank'}

     عرضه اولیه
     {'href': 'https://nabzebourse.com/fa/tags/131/1/%D8%B9%D8%B1%D8%B6%D9%87-%D8%A7%D9%88%D9%84%DB%8C%D9%87', 'target': '_blank'}
ref: //header//h2/a[contains(@href, 'news')]

      {'href': '/fa/news/26991/currency', 'title': 'قیمت دلار طلا سکه ارز'}
ref: //ul[@class='header-main_top-menu_ul']//a[contains(@href, 'fa')]

     اخبار بورس
     {'class': 'header-main_top-menu_ul_li_a', 'href': '/fa/markets', 'id': '15'}

     اخبار اقتصادی
     {'class': 'header-main_top-menu_ul_li_a', 'href': '/fa/economy', 'id': '16'}

     اخبار ارز و سکه
     {'class': 'header-main_top-menu_ul_li_a', 'href': '/fa/currencies-news', 'id': '17'}

     تحلیل
